{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.528, "eval_steps": 100, "global_step": 66000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.6507714629173279, "epoch": 8e-05, "grad_norm": 4.429927349090576, "learning_rate": 9e-06, "loss": 0.7301, "mean_token_accuracy": 0.7852319478988647, "num_tokens": 162895.0, "step": 10 }, { "entropy": 0.709451100230217, "epoch": 0.00016, "grad_norm": 5.744102954864502, "learning_rate": 1.9e-05, "loss": 0.701, "mean_token_accuracy": 0.8074600040912628, "num_tokens": 242300.0, "step": 20 }, { "entropy": 0.7808367550373078, "epoch": 0.00024, "grad_norm": 2.358705997467041, "learning_rate": 2.9e-05, "loss": 0.7694, "mean_token_accuracy": 0.7858784079551697, "num_tokens": 334985.0, "step": 30 }, { "entropy": 0.6573775619268417, "epoch": 0.00032, "grad_norm": 5.260980129241943, "learning_rate": 3.9000000000000006e-05, "loss": 0.6865, "mean_token_accuracy": 0.7915938436985016, "num_tokens": 470043.0, "step": 40 }, { "entropy": 0.7195190966129303, "epoch": 0.0004, "grad_norm": 8.754876136779785, "learning_rate": 4.9e-05, "loss": 0.7083, "mean_token_accuracy": 0.8109206974506378, "num_tokens": 504615.0, "step": 50 }, { "entropy": 0.6296031296253204, "epoch": 0.00048, "grad_norm": 2.57316517829895, "learning_rate": 4.999639855942377e-05, "loss": 0.6372, "mean_token_accuracy": 0.7964521169662475, "num_tokens": 668455.0, "step": 60 }, { "entropy": 0.6845487058162689, "epoch": 0.00056, "grad_norm": 7.697474002838135, "learning_rate": 4.999239695878352e-05, "loss": 0.6779, "mean_token_accuracy": 0.8015775620937348, "num_tokens": 770138.0, "step": 70 }, { "entropy": 0.7082340717315674, "epoch": 0.00064, "grad_norm": 4.232672214508057, "learning_rate": 4.998839535814326e-05, "loss": 0.7228, "mean_token_accuracy": 0.7945618093013763, "num_tokens": 863984.0, "step": 80 }, { "entropy": 0.676296204328537, "epoch": 0.00072, "grad_norm": 3.891113519668579, "learning_rate": 4.9984393757503004e-05, "loss": 0.6736, "mean_token_accuracy": 0.7924896955490113, "num_tokens": 998353.0, "step": 90 }, { "entropy": 0.7123007118701935, "epoch": 0.0008, "grad_norm": 8.374579429626465, "learning_rate": 4.998039215686275e-05, "loss": 0.7086, "mean_token_accuracy": 0.8116233587265015, "num_tokens": 1032907.0, "step": 100 }, { "entropy": 0.6769712090492248, "epoch": 0.00088, "grad_norm": 3.712862253189087, "learning_rate": 4.997639055622249e-05, "loss": 0.6829, "mean_token_accuracy": 0.785600870847702, "num_tokens": 1196747.0, "step": 110 }, { "entropy": 0.6452380985021591, "epoch": 0.00096, "grad_norm": 4.673105716705322, "learning_rate": 4.9972388955582235e-05, "loss": 0.6495, "mean_token_accuracy": 0.808261227607727, "num_tokens": 1288887.0, "step": 120 }, { "entropy": 0.8110557496547699, "epoch": 0.00104, "grad_norm": 3.2119204998016357, "learning_rate": 4.996838735494198e-05, "loss": 0.828, "mean_token_accuracy": 0.7745292365550995, "num_tokens": 1383146.0, "step": 130 }, { "entropy": 0.7479136109352111, "epoch": 0.00112, "grad_norm": 3.198303699493408, "learning_rate": 4.996438575430172e-05, "loss": 0.7392, "mean_token_accuracy": 0.7805075764656066, "num_tokens": 1516122.0, "step": 140 }, { "entropy": 0.7278152585029602, "epoch": 0.0012, "grad_norm": 7.114267826080322, "learning_rate": 4.9960384153661467e-05, "loss": 0.7423, "mean_token_accuracy": 0.8055591583251953, "num_tokens": 1550668.0, "step": 150 }, { "entropy": 0.7189743340015411, "epoch": 0.00128, "grad_norm": 2.9275388717651367, "learning_rate": 4.995638255302121e-05, "loss": 0.7208, "mean_token_accuracy": 0.7746395587921142, "num_tokens": 1713990.0, "step": 160 }, { "entropy": 0.8009894967079163, "epoch": 0.00136, "grad_norm": 5.635709285736084, "learning_rate": 4.9952380952380954e-05, "loss": 0.8027, "mean_token_accuracy": 0.7793729066848755, "num_tokens": 1785334.0, "step": 170 }, { "entropy": 0.6930669784545899, "epoch": 0.00144, "grad_norm": 2.4073734283447266, "learning_rate": 4.99483793517407e-05, "loss": 0.7081, "mean_token_accuracy": 0.7958597421646119, "num_tokens": 1879299.0, "step": 180 }, { "entropy": 0.7286915481090546, "epoch": 0.00152, "grad_norm": 5.519005298614502, "learning_rate": 4.994437775110044e-05, "loss": 0.7236, "mean_token_accuracy": 0.7794255375862121, "num_tokens": 2026192.0, "step": 190 }, { "entropy": 0.7027929842472076, "epoch": 0.0016, "grad_norm": 6.971383094787598, "learning_rate": 4.994037615046019e-05, "loss": 0.684, "mean_token_accuracy": 0.8116953372955322, "num_tokens": 2069868.0, "step": 200 }, { "entropy": 0.6777538537979126, "epoch": 0.00168, "grad_norm": 2.186314582824707, "learning_rate": 4.993637454981993e-05, "loss": 0.6794, "mean_token_accuracy": 0.789625060558319, "num_tokens": 2233708.0, "step": 210 }, { "entropy": 0.6882954239845276, "epoch": 0.00176, "grad_norm": 4.689647674560547, "learning_rate": 4.993237294917967e-05, "loss": 0.6992, "mean_token_accuracy": 0.795054417848587, "num_tokens": 2332417.0, "step": 220 }, { "entropy": 0.7382330179214478, "epoch": 0.00184, "grad_norm": 3.0396223068237305, "learning_rate": 4.9928371348539416e-05, "loss": 0.744, "mean_token_accuracy": 0.7894644200801849, "num_tokens": 2426374.0, "step": 230 }, { "entropy": 0.7606924951076508, "epoch": 0.00192, "grad_norm": 5.221226692199707, "learning_rate": 4.992436974789917e-05, "loss": 0.7586, "mean_token_accuracy": 0.7757872760295867, "num_tokens": 2561673.0, "step": 240 }, { "entropy": 0.6579767316579819, "epoch": 0.002, "grad_norm": 9.221600532531738, "learning_rate": 4.9920368147258904e-05, "loss": 0.6603, "mean_token_accuracy": 0.8145594000816345, "num_tokens": 2604534.0, "step": 250 }, { "entropy": 0.7054487943649292, "epoch": 0.00208, "grad_norm": 2.6218857765197754, "learning_rate": 4.991636654661865e-05, "loss": 0.7038, "mean_token_accuracy": 0.7799035251140595, "num_tokens": 2768374.0, "step": 260 }, { "entropy": 0.6492344260215759, "epoch": 0.00216, "grad_norm": 4.125511169433594, "learning_rate": 4.99123649459784e-05, "loss": 0.6516, "mean_token_accuracy": 0.8111377716064453, "num_tokens": 2859789.0, "step": 270 }, { "entropy": 0.6704593420028686, "epoch": 0.00224, "grad_norm": 2.189277410507202, "learning_rate": 4.990836334533814e-05, "loss": 0.6734, "mean_token_accuracy": 0.8055030524730682, "num_tokens": 2955007.0, "step": 280 }, { "entropy": 0.7489970594644546, "epoch": 0.00232, "grad_norm": 3.7625415325164795, "learning_rate": 4.990436174469788e-05, "loss": 0.7436, "mean_token_accuracy": 0.781987339258194, "num_tokens": 3080146.0, "step": 290 }, { "entropy": 0.7006563365459442, "epoch": 0.0024, "grad_norm": 7.541351318359375, "learning_rate": 4.990036014405762e-05, "loss": 0.7146, "mean_token_accuracy": 0.8103268384933472, "num_tokens": 3112829.0, "step": 300 }, { "entropy": 0.7489285886287689, "epoch": 0.00248, "grad_norm": 3.5241951942443848, "learning_rate": 4.989635854341737e-05, "loss": 0.7548, "mean_token_accuracy": 0.7670188069343566, "num_tokens": 3276669.0, "step": 310 }, { "entropy": 0.6862078130245208, "epoch": 0.00256, "grad_norm": 4.359572410583496, "learning_rate": 4.9892356942777116e-05, "loss": 0.688, "mean_token_accuracy": 0.7991568982601166, "num_tokens": 3378253.0, "step": 320 }, { "entropy": 0.6919533133506774, "epoch": 0.00264, "grad_norm": 2.252821445465088, "learning_rate": 4.988835534213685e-05, "loss": 0.6947, "mean_token_accuracy": 0.79412682056427, "num_tokens": 3473469.0, "step": 330 }, { "entropy": 0.7097479224205017, "epoch": 0.00272, "grad_norm": 2.8237907886505127, "learning_rate": 4.9884353741496604e-05, "loss": 0.7029, "mean_token_accuracy": 0.7853358447551727, "num_tokens": 3614108.0, "step": 340 }, { "entropy": 0.7572431981563568, "epoch": 0.0028, "grad_norm": 7.263293266296387, "learning_rate": 4.988035214085635e-05, "loss": 0.774, "mean_token_accuracy": 0.7985301375389099, "num_tokens": 3657223.0, "step": 350 }, { "entropy": 0.7113076210021972, "epoch": 0.00288, "grad_norm": 3.0624890327453613, "learning_rate": 4.987635054021609e-05, "loss": 0.7051, "mean_token_accuracy": 0.7803356051445007, "num_tokens": 3818295.0, "step": 360 }, { "entropy": 0.7431245446205139, "epoch": 0.00296, "grad_norm": 4.9513397216796875, "learning_rate": 4.987234893957583e-05, "loss": 0.7436, "mean_token_accuracy": 0.794154816865921, "num_tokens": 3894668.0, "step": 370 }, { "entropy": 0.7534842073917389, "epoch": 0.00304, "grad_norm": 3.101839303970337, "learning_rate": 4.986834733893558e-05, "loss": 0.7614, "mean_token_accuracy": 0.7856575429439545, "num_tokens": 3989980.0, "step": 380 }, { "entropy": 0.6808123409748077, "epoch": 0.00312, "grad_norm": 2.7980830669403076, "learning_rate": 4.986434573829532e-05, "loss": 0.6772, "mean_token_accuracy": 0.7910939931869507, "num_tokens": 4146019.0, "step": 390 }, { "entropy": 0.7043219983577729, "epoch": 0.0032, "grad_norm": 8.843242645263672, "learning_rate": 4.9860344137655066e-05, "loss": 0.7006, "mean_token_accuracy": 0.8033842384815216, "num_tokens": 4195344.0, "step": 400 }, { "entropy": 0.7284346163272858, "epoch": 0.00328, "grad_norm": 2.8774287700653076, "learning_rate": 4.985634253701481e-05, "loss": 0.7319, "mean_token_accuracy": 0.7754885196685791, "num_tokens": 4359184.0, "step": 410 }, { "entropy": 0.7241209030151368, "epoch": 0.00336, "grad_norm": 4.765346527099609, "learning_rate": 4.9852340936374554e-05, "loss": 0.7049, "mean_token_accuracy": 0.7924767255783081, "num_tokens": 4460284.0, "step": 420 }, { "entropy": 0.7351023375988006, "epoch": 0.00344, "grad_norm": 3.492975950241089, "learning_rate": 4.98483393357343e-05, "loss": 0.7395, "mean_token_accuracy": 0.7895567059516907, "num_tokens": 4556359.0, "step": 430 }, { "entropy": 0.6981489658355713, "epoch": 0.00352, "grad_norm": 3.3943898677825928, "learning_rate": 4.984433773509404e-05, "loss": 0.7005, "mean_token_accuracy": 0.7880447506904602, "num_tokens": 4693573.0, "step": 440 }, { "entropy": 0.7418078720569611, "epoch": 0.0036, "grad_norm": 7.602677822113037, "learning_rate": 4.9840336134453785e-05, "loss": 0.7462, "mean_token_accuracy": 0.8036210119724274, "num_tokens": 4733657.0, "step": 450 }, { "entropy": 0.7272563278675079, "epoch": 0.00368, "grad_norm": 2.3992133140563965, "learning_rate": 4.983633453381353e-05, "loss": 0.7312, "mean_token_accuracy": 0.775311428308487, "num_tokens": 4897497.0, "step": 460 }, { "entropy": 0.7265134453773499, "epoch": 0.00376, "grad_norm": 3.52299427986145, "learning_rate": 4.983233293317327e-05, "loss": 0.7318, "mean_token_accuracy": 0.7904669106006622, "num_tokens": 4993077.0, "step": 470 }, { "entropy": 0.7277519166469574, "epoch": 0.00384, "grad_norm": 2.102905035018921, "learning_rate": 4.9828331332533016e-05, "loss": 0.7237, "mean_token_accuracy": 0.7904852211475373, "num_tokens": 5089685.0, "step": 480 }, { "entropy": 0.7658823788166046, "epoch": 0.00392, "grad_norm": 4.337673187255859, "learning_rate": 4.982432973189276e-05, "loss": 0.7739, "mean_token_accuracy": 0.7719995021820069, "num_tokens": 5217277.0, "step": 490 }, { "entropy": 0.7299220204353333, "epoch": 0.004, "grad_norm": 9.58797836303711, "learning_rate": 4.98203281312525e-05, "loss": 0.7156, "mean_token_accuracy": 0.8075382351875305, "num_tokens": 5257408.0, "step": 500 }, { "entropy": 0.6836311757564545, "epoch": 0.00408, "grad_norm": 2.1089365482330322, "learning_rate": 4.981632653061225e-05, "loss": 0.6879, "mean_token_accuracy": 0.7831765949726105, "num_tokens": 5421248.0, "step": 510 }, { "entropy": 0.6548279821872711, "epoch": 0.00416, "grad_norm": 4.15057373046875, "learning_rate": 4.981232492997199e-05, "loss": 0.6498, "mean_token_accuracy": 0.8104029178619385, "num_tokens": 5501294.0, "step": 520 }, { "entropy": 0.7682763993740082, "epoch": 0.00424, "grad_norm": 2.931243658065796, "learning_rate": 4.9808323329331734e-05, "loss": 0.7718, "mean_token_accuracy": 0.7836812853813171, "num_tokens": 5594311.0, "step": 530 }, { "entropy": 0.7438209891319275, "epoch": 0.00432, "grad_norm": 3.787454128265381, "learning_rate": 4.980432172869148e-05, "loss": 0.7408, "mean_token_accuracy": 0.7753860414028168, "num_tokens": 5728805.0, "step": 540 }, { "entropy": 0.75733622610569, "epoch": 0.0044, "grad_norm": 6.792885780334473, "learning_rate": 4.980032012805123e-05, "loss": 0.7741, "mean_token_accuracy": 0.7943602561950683, "num_tokens": 5763388.0, "step": 550 }, { "entropy": 0.7141270041465759, "epoch": 0.00448, "grad_norm": 2.5870120525360107, "learning_rate": 4.9796318527410966e-05, "loss": 0.7156, "mean_token_accuracy": 0.7750549674034118, "num_tokens": 5927228.0, "step": 560 }, { "entropy": 0.7718937516212463, "epoch": 0.00456, "grad_norm": 4.0935187339782715, "learning_rate": 4.979231692677071e-05, "loss": 0.7699, "mean_token_accuracy": 0.7823116540908813, "num_tokens": 6012763.0, "step": 570 }, { "entropy": 0.6922759592533112, "epoch": 0.00464, "grad_norm": 2.0829005241394043, "learning_rate": 4.978831532613045e-05, "loss": 0.7143, "mean_token_accuracy": 0.795557290315628, "num_tokens": 6106242.0, "step": 580 }, { "entropy": 0.7282635390758514, "epoch": 0.00472, "grad_norm": 3.2144384384155273, "learning_rate": 4.9784313725490203e-05, "loss": 0.7199, "mean_token_accuracy": 0.7807377398014068, "num_tokens": 6238343.0, "step": 590 }, { "entropy": 0.7085329681634903, "epoch": 0.0048, "grad_norm": 6.484807014465332, "learning_rate": 4.978031212484994e-05, "loss": 0.7151, "mean_token_accuracy": 0.8032461702823639, "num_tokens": 6274944.0, "step": 600 }, { "entropy": 0.7168422341346741, "epoch": 0.00488, "grad_norm": 3.0567126274108887, "learning_rate": 4.9776310524209684e-05, "loss": 0.7167, "mean_token_accuracy": 0.7764045000076294, "num_tokens": 6438784.0, "step": 610 }, { "entropy": 0.6203178107738495, "epoch": 0.00496, "grad_norm": 4.5592803955078125, "learning_rate": 4.9772308923569435e-05, "loss": 0.618, "mean_token_accuracy": 0.8136127054691314, "num_tokens": 6535850.0, "step": 620 }, { "entropy": 0.7410872578620911, "epoch": 0.00504, "grad_norm": 2.5633814334869385, "learning_rate": 4.976830732292918e-05, "loss": 0.749, "mean_token_accuracy": 0.7821522176265716, "num_tokens": 6630551.0, "step": 630 }, { "entropy": 0.7187877893447876, "epoch": 0.00512, "grad_norm": 3.1734418869018555, "learning_rate": 4.9764305722288915e-05, "loss": 0.7139, "mean_token_accuracy": 0.7821878612041473, "num_tokens": 6766348.0, "step": 640 }, { "entropy": 0.7991439938545227, "epoch": 0.0052, "grad_norm": 7.224186897277832, "learning_rate": 4.976030412164866e-05, "loss": 0.7944, "mean_token_accuracy": 0.7976825058460235, "num_tokens": 6801061.0, "step": 650 }, { "entropy": 0.7079451203346252, "epoch": 0.00528, "grad_norm": 3.2769289016723633, "learning_rate": 4.975630252100841e-05, "loss": 0.7062, "mean_token_accuracy": 0.7783920526504516, "num_tokens": 6964846.0, "step": 660 }, { "entropy": 0.6748256385326385, "epoch": 0.00536, "grad_norm": 4.328115940093994, "learning_rate": 4.975230092036815e-05, "loss": 0.6778, "mean_token_accuracy": 0.809011447429657, "num_tokens": 7036789.0, "step": 670 }, { "entropy": 0.7209555029869079, "epoch": 0.00544, "grad_norm": 3.608036756515503, "learning_rate": 4.974829931972789e-05, "loss": 0.7111, "mean_token_accuracy": 0.7961807489395142, "num_tokens": 7130374.0, "step": 680 }, { "entropy": 0.7430384159088135, "epoch": 0.00552, "grad_norm": 3.259946346282959, "learning_rate": 4.9744297719087634e-05, "loss": 0.7396, "mean_token_accuracy": 0.777300626039505, "num_tokens": 7272084.0, "step": 690 }, { "entropy": 0.6898618221282959, "epoch": 0.0056, "grad_norm": 8.102684020996094, "learning_rate": 4.9740296118447384e-05, "loss": 0.6971, "mean_token_accuracy": 0.8159768581390381, "num_tokens": 7314269.0, "step": 700 }, { "entropy": 0.6940824151039123, "epoch": 0.00568, "grad_norm": 2.368478298187256, "learning_rate": 4.973629451780713e-05, "loss": 0.6926, "mean_token_accuracy": 0.7815034866333008, "num_tokens": 7477942.0, "step": 710 }, { "entropy": 0.7640352070331573, "epoch": 0.00576, "grad_norm": 4.47512149810791, "learning_rate": 4.9732292917166865e-05, "loss": 0.7538, "mean_token_accuracy": 0.7920064210891724, "num_tokens": 7550728.0, "step": 720 }, { "entropy": 0.7260487377643585, "epoch": 0.00584, "grad_norm": 2.0491504669189453, "learning_rate": 4.9728291316526615e-05, "loss": 0.7449, "mean_token_accuracy": 0.7872433841228486, "num_tokens": 7643777.0, "step": 730 }, { "entropy": 0.6967580676078796, "epoch": 0.00592, "grad_norm": 3.6222317218780518, "learning_rate": 4.972428971588636e-05, "loss": 0.6892, "mean_token_accuracy": 0.7906777679920196, "num_tokens": 7778711.0, "step": 740 }, { "entropy": 0.7963792145252228, "epoch": 0.006, "grad_norm": 6.105340957641602, "learning_rate": 4.97202881152461e-05, "loss": 0.7966, "mean_token_accuracy": 0.7954973042011261, "num_tokens": 7814594.0, "step": 750 }, { "entropy": 0.7209940195083618, "epoch": 0.00608, "grad_norm": 2.8363595008850098, "learning_rate": 4.971628651460584e-05, "loss": 0.717, "mean_token_accuracy": 0.7786260306835174, "num_tokens": 7976099.0, "step": 760 }, { "entropy": 0.745957887172699, "epoch": 0.00616, "grad_norm": 5.3391923904418945, "learning_rate": 4.971228491396559e-05, "loss": 0.751, "mean_token_accuracy": 0.7903479695320129, "num_tokens": 8047763.0, "step": 770 }, { "entropy": 0.7531169831752778, "epoch": 0.00624, "grad_norm": 2.473841667175293, "learning_rate": 4.9708283313325334e-05, "loss": 0.7618, "mean_token_accuracy": 0.7866666913032532, "num_tokens": 8140677.0, "step": 780 }, { "entropy": 0.7598116397857666, "epoch": 0.00632, "grad_norm": 3.589296817779541, "learning_rate": 4.970428171268508e-05, "loss": 0.7545, "mean_token_accuracy": 0.7788083016872406, "num_tokens": 8284690.0, "step": 790 }, { "entropy": 0.6526519536972046, "epoch": 0.0064, "grad_norm": 6.093932628631592, "learning_rate": 4.970028011204482e-05, "loss": 0.6557, "mean_token_accuracy": 0.8204808890819549, "num_tokens": 8327189.0, "step": 800 }, { "entropy": 0.7994199633598328, "epoch": 0.00648, "grad_norm": 2.9298038482666016, "learning_rate": 4.9696278511404565e-05, "loss": 0.8009, "mean_token_accuracy": 0.7551233530044555, "num_tokens": 8491029.0, "step": 810 }, { "entropy": 0.6884256780147553, "epoch": 0.00656, "grad_norm": 4.320605754852295, "learning_rate": 4.969227691076431e-05, "loss": 0.6844, "mean_token_accuracy": 0.8071065306663513, "num_tokens": 8572615.0, "step": 820 }, { "entropy": 0.7652777791023254, "epoch": 0.00664, "grad_norm": 2.685849189758301, "learning_rate": 4.968827531012405e-05, "loss": 0.7822, "mean_token_accuracy": 0.7817608416080475, "num_tokens": 8665649.0, "step": 830 }, { "entropy": 0.7237111687660217, "epoch": 0.00672, "grad_norm": 4.8448615074157715, "learning_rate": 4.9684273709483796e-05, "loss": 0.7178, "mean_token_accuracy": 0.7840542316436767, "num_tokens": 8799085.0, "step": 840 }, { "entropy": 0.7867446959018707, "epoch": 0.0068, "grad_norm": 6.984872341156006, "learning_rate": 4.968027210884354e-05, "loss": 0.7781, "mean_token_accuracy": 0.7900590240955353, "num_tokens": 8836595.0, "step": 850 }, { "entropy": 0.7455411076545715, "epoch": 0.00688, "grad_norm": 2.4253122806549072, "learning_rate": 4.9676270508203284e-05, "loss": 0.7474, "mean_token_accuracy": 0.7762879848480224, "num_tokens": 9000275.0, "step": 860 }, { "entropy": 0.6885509073734284, "epoch": 0.00696, "grad_norm": 5.387843132019043, "learning_rate": 4.967226890756303e-05, "loss": 0.6799, "mean_token_accuracy": 0.810466593503952, "num_tokens": 9076663.0, "step": 870 }, { "entropy": 0.7049158453941345, "epoch": 0.00704, "grad_norm": 3.4967427253723145, "learning_rate": 4.966826730692277e-05, "loss": 0.7223, "mean_token_accuracy": 0.794415545463562, "num_tokens": 9170679.0, "step": 880 }, { "entropy": 0.7885334074497223, "epoch": 0.00712, "grad_norm": 3.2929656505584717, "learning_rate": 4.9664265706282515e-05, "loss": 0.7667, "mean_token_accuracy": 0.7730207145214081, "num_tokens": 9313023.0, "step": 890 }, { "entropy": 0.6188335239887237, "epoch": 0.0072, "grad_norm": 6.817894458770752, "learning_rate": 4.966026410564226e-05, "loss": 0.6264, "mean_token_accuracy": 0.8225165724754333, "num_tokens": 9357029.0, "step": 900 }, { "entropy": 0.7180132091045379, "epoch": 0.00728, "grad_norm": 2.7803525924682617, "learning_rate": 4.9656262505002e-05, "loss": 0.7257, "mean_token_accuracy": 0.7764533519744873, "num_tokens": 9520869.0, "step": 910 }, { "entropy": 0.6292182028293609, "epoch": 0.00736, "grad_norm": 5.283628940582275, "learning_rate": 4.9652260904361746e-05, "loss": 0.6186, "mean_token_accuracy": 0.8190020143985748, "num_tokens": 9604565.0, "step": 920 }, { "entropy": 0.7328912258148194, "epoch": 0.00744, "grad_norm": 2.2813527584075928, "learning_rate": 4.964825930372149e-05, "loss": 0.7444, "mean_token_accuracy": 0.7853749692440033, "num_tokens": 9699480.0, "step": 930 }, { "entropy": 0.7622767984867096, "epoch": 0.00752, "grad_norm": 3.1180367469787598, "learning_rate": 4.964425770308124e-05, "loss": 0.7599, "mean_token_accuracy": 0.7733241140842437, "num_tokens": 9838632.0, "step": 940 }, { "entropy": 0.6907345831394196, "epoch": 0.0076, "grad_norm": 6.140064239501953, "learning_rate": 4.964025610244098e-05, "loss": 0.6915, "mean_token_accuracy": 0.8104984223842621, "num_tokens": 9874775.0, "step": 950 }, { "entropy": 0.6869640409946441, "epoch": 0.00768, "grad_norm": 2.5121326446533203, "learning_rate": 4.963625450180072e-05, "loss": 0.6908, "mean_token_accuracy": 0.7847276568412781, "num_tokens": 10038615.0, "step": 960 }, { "entropy": 0.701366513967514, "epoch": 0.00776, "grad_norm": 4.7564802169799805, "learning_rate": 4.9632252901160465e-05, "loss": 0.6896, "mean_token_accuracy": 0.8000771343708039, "num_tokens": 10127943.0, "step": 970 }, { "entropy": 0.7258500456809998, "epoch": 0.00784, "grad_norm": 1.8875713348388672, "learning_rate": 4.9628251300520215e-05, "loss": 0.7488, "mean_token_accuracy": 0.786683076620102, "num_tokens": 10221986.0, "step": 980 }, { "entropy": 0.734013956785202, "epoch": 0.00792, "grad_norm": 4.364891052246094, "learning_rate": 4.962424969987995e-05, "loss": 0.7271, "mean_token_accuracy": 0.780079698562622, "num_tokens": 10366051.0, "step": 990 }, { "entropy": 0.7808390855789185, "epoch": 0.008, "grad_norm": 6.689591884613037, "learning_rate": 4.9620248099239696e-05, "loss": 0.7741, "mean_token_accuracy": 0.792416387796402, "num_tokens": 10402262.0, "step": 1000 }, { "entropy": 0.761152058839798, "epoch": 0.00808, "grad_norm": 2.9037423133850098, "learning_rate": 4.9616246498599446e-05, "loss": 0.7681, "mean_token_accuracy": 0.7667646467685699, "num_tokens": 10565259.0, "step": 1010 }, { "entropy": 0.6892069637775421, "epoch": 0.00816, "grad_norm": 3.593710422515869, "learning_rate": 4.961224489795919e-05, "loss": 0.6938, "mean_token_accuracy": 0.8026678562164307, "num_tokens": 10645820.0, "step": 1020 }, { "entropy": 0.7236086368560791, "epoch": 0.00824, "grad_norm": 2.7833056449890137, "learning_rate": 4.960824329731893e-05, "loss": 0.7219, "mean_token_accuracy": 0.79210125207901, "num_tokens": 10740477.0, "step": 1030 }, { "entropy": 0.7461738169193268, "epoch": 0.00832, "grad_norm": 2.9896528720855713, "learning_rate": 4.960424169667867e-05, "loss": 0.7546, "mean_token_accuracy": 0.7724983751773834, "num_tokens": 10880937.0, "step": 1040 }, { "entropy": 0.6866599202156067, "epoch": 0.0084, "grad_norm": 7.494067192077637, "learning_rate": 4.960024009603842e-05, "loss": 0.6795, "mean_token_accuracy": 0.817442786693573, "num_tokens": 10922627.0, "step": 1050 }, { "entropy": 0.735072934627533, "epoch": 0.00848, "grad_norm": 3.6251606941223145, "learning_rate": 4.9596238495398165e-05, "loss": 0.7404, "mean_token_accuracy": 0.7716236233711242, "num_tokens": 11084374.0, "step": 1060 }, { "entropy": 0.742744529247284, "epoch": 0.00856, "grad_norm": 5.714815616607666, "learning_rate": 4.95922368947579e-05, "loss": 0.7411, "mean_token_accuracy": 0.7980181038379669, "num_tokens": 11152183.0, "step": 1070 }, { "entropy": 0.7223726451396942, "epoch": 0.00864, "grad_norm": 2.1188604831695557, "learning_rate": 4.958823529411765e-05, "loss": 0.729, "mean_token_accuracy": 0.7864938676357269, "num_tokens": 11246766.0, "step": 1080 }, { "entropy": 0.748977643251419, "epoch": 0.00872, "grad_norm": 3.927739143371582, "learning_rate": 4.9584233693477396e-05, "loss": 0.7487, "mean_token_accuracy": 0.7783907055854797, "num_tokens": 11378762.0, "step": 1090 }, { "entropy": 0.7161160290241242, "epoch": 0.0088, "grad_norm": 6.29900598526001, "learning_rate": 4.958023209283714e-05, "loss": 0.7066, "mean_token_accuracy": 0.8100019454956054, "num_tokens": 11417191.0, "step": 1100 }, { "entropy": 0.7269983321428299, "epoch": 0.00888, "grad_norm": 2.8201794624328613, "learning_rate": 4.9576230492196877e-05, "loss": 0.732, "mean_token_accuracy": 0.7746152877807617, "num_tokens": 11581031.0, "step": 1110 }, { "entropy": 0.7155558109283447, "epoch": 0.00896, "grad_norm": 4.480117321014404, "learning_rate": 4.957222889155663e-05, "loss": 0.716, "mean_token_accuracy": 0.7963277220726013, "num_tokens": 11665096.0, "step": 1120 }, { "entropy": 0.7536137193441391, "epoch": 0.00904, "grad_norm": 2.5301883220672607, "learning_rate": 4.956822729091637e-05, "loss": 0.7383, "mean_token_accuracy": 0.7877518594264984, "num_tokens": 11758889.0, "step": 1130 }, { "entropy": 0.6929598808288574, "epoch": 0.00912, "grad_norm": 3.1165215969085693, "learning_rate": 4.9564225690276114e-05, "loss": 0.6986, "mean_token_accuracy": 0.7866612315177918, "num_tokens": 11900252.0, "step": 1140 }, { "entropy": 0.6941762328147888, "epoch": 0.0092, "grad_norm": 8.410130500793457, "learning_rate": 4.956022408963586e-05, "loss": 0.6908, "mean_token_accuracy": 0.8139756202697754, "num_tokens": 11939153.0, "step": 1150 }, { "entropy": 0.7716194272041321, "epoch": 0.00928, "grad_norm": 2.317866563796997, "learning_rate": 4.95562224889956e-05, "loss": 0.7686, "mean_token_accuracy": 0.7645037651062012, "num_tokens": 12102782.0, "step": 1160 }, { "entropy": 0.7660478591918946, "epoch": 0.00936, "grad_norm": 4.6623454093933105, "learning_rate": 4.9552220888355346e-05, "loss": 0.763, "mean_token_accuracy": 0.7846660494804383, "num_tokens": 12180382.0, "step": 1170 }, { "entropy": 0.7505338907241821, "epoch": 0.00944, "grad_norm": 3.286421537399292, "learning_rate": 4.954821928771509e-05, "loss": 0.7465, "mean_token_accuracy": 0.7895946800708771, "num_tokens": 12273827.0, "step": 1180 }, { "entropy": 0.7099164664745331, "epoch": 0.00952, "grad_norm": 3.5322506427764893, "learning_rate": 4.954421768707483e-05, "loss": 0.7123, "mean_token_accuracy": 0.7869354486465454, "num_tokens": 12401449.0, "step": 1190 }, { "entropy": 0.7346357852220535, "epoch": 0.0096, "grad_norm": 8.395556449890137, "learning_rate": 4.954021608643458e-05, "loss": 0.7379, "mean_token_accuracy": 0.8063245594501496, "num_tokens": 12438931.0, "step": 1200 }, { "entropy": 0.6796200692653656, "epoch": 0.00968, "grad_norm": 3.127192497253418, "learning_rate": 4.953621448579432e-05, "loss": 0.6822, "mean_token_accuracy": 0.7860344529151917, "num_tokens": 12602771.0, "step": 1210 }, { "entropy": 0.7569026798009872, "epoch": 0.00976, "grad_norm": 5.136325359344482, "learning_rate": 4.9532212885154064e-05, "loss": 0.7456, "mean_token_accuracy": 0.7889817535877228, "num_tokens": 12678481.0, "step": 1220 }, { "entropy": 0.7308863699436188, "epoch": 0.00984, "grad_norm": 2.705658197402954, "learning_rate": 4.952821128451381e-05, "loss": 0.7384, "mean_token_accuracy": 0.79003786444664, "num_tokens": 12771259.0, "step": 1230 }, { "entropy": 0.6775107085704803, "epoch": 0.00992, "grad_norm": 3.7612521648406982, "learning_rate": 4.952420968387355e-05, "loss": 0.6765, "mean_token_accuracy": 0.7955106258392334, "num_tokens": 12900770.0, "step": 1240 }, { "entropy": 0.7740995317697525, "epoch": 0.01, "grad_norm": 9.39525318145752, "learning_rate": 4.9520208083233295e-05, "loss": 0.7773, "mean_token_accuracy": 0.7942067801952362, "num_tokens": 12940233.0, "step": 1250 }, { "entropy": 0.7462506592273712, "epoch": 0.01008, "grad_norm": 2.7231998443603516, "learning_rate": 4.951620648259304e-05, "loss": 0.7389, "mean_token_accuracy": 0.7747262418270111, "num_tokens": 13101242.0, "step": 1260 }, { "entropy": 0.70780528485775, "epoch": 0.01016, "grad_norm": 5.397933483123779, "learning_rate": 4.951220488195278e-05, "loss": 0.6993, "mean_token_accuracy": 0.8001505613327027, "num_tokens": 13176359.0, "step": 1270 }, { "entropy": 0.7040972828865051, "epoch": 0.01024, "grad_norm": 2.026827096939087, "learning_rate": 4.9508203281312526e-05, "loss": 0.7246, "mean_token_accuracy": 0.7958213329315186, "num_tokens": 13270206.0, "step": 1280 }, { "entropy": 0.7300345540046692, "epoch": 0.01032, "grad_norm": 3.0028018951416016, "learning_rate": 4.950420168067227e-05, "loss": 0.7213, "mean_token_accuracy": 0.7777849435806274, "num_tokens": 13411766.0, "step": 1290 }, { "entropy": 0.7221931219100952, "epoch": 0.0104, "grad_norm": 7.38576078414917, "learning_rate": 4.9500200080032014e-05, "loss": 0.7122, "mean_token_accuracy": 0.8071524739265442, "num_tokens": 13450444.0, "step": 1300 }, { "entropy": 0.7146982789039612, "epoch": 0.01048, "grad_norm": 2.992337703704834, "learning_rate": 4.949619847939176e-05, "loss": 0.7187, "mean_token_accuracy": 0.7787562012672424, "num_tokens": 13614274.0, "step": 1310 }, { "entropy": 0.7884484529495239, "epoch": 0.01056, "grad_norm": 4.436368465423584, "learning_rate": 4.94921968787515e-05, "loss": 0.7833, "mean_token_accuracy": 0.7763263463974, "num_tokens": 13706100.0, "step": 1320 }, { "entropy": 0.7762396812438965, "epoch": 0.01064, "grad_norm": 1.7831989526748657, "learning_rate": 4.948819527811125e-05, "loss": 0.7777, "mean_token_accuracy": 0.7817827343940735, "num_tokens": 13799647.0, "step": 1330 }, { "entropy": 0.7555424451828003, "epoch": 0.01072, "grad_norm": 3.572188377380371, "learning_rate": 4.948419367747099e-05, "loss": 0.7551, "mean_token_accuracy": 0.7775709569454193, "num_tokens": 13940881.0, "step": 1340 }, { "entropy": 0.7365279912948608, "epoch": 0.0108, "grad_norm": 7.071536540985107, "learning_rate": 4.948019207683073e-05, "loss": 0.7466, "mean_token_accuracy": 0.8030538141727448, "num_tokens": 13981925.0, "step": 1350 }, { "entropy": 0.7196246266365052, "epoch": 0.01088, "grad_norm": 3.2718727588653564, "learning_rate": 4.9476190476190476e-05, "loss": 0.7241, "mean_token_accuracy": 0.7741975486278534, "num_tokens": 14145283.0, "step": 1360 }, { "entropy": 0.7093834221363068, "epoch": 0.01096, "grad_norm": 4.571151256561279, "learning_rate": 4.947218887555023e-05, "loss": 0.7133, "mean_token_accuracy": 0.7952703475952149, "num_tokens": 14222722.0, "step": 1370 }, { "entropy": 0.7467518150806427, "epoch": 0.01104, "grad_norm": 2.6367878913879395, "learning_rate": 4.9468187274909964e-05, "loss": 0.7491, "mean_token_accuracy": 0.7870042264461518, "num_tokens": 14315024.0, "step": 1380 }, { "entropy": 0.7471884667873383, "epoch": 0.01112, "grad_norm": 4.184637546539307, "learning_rate": 4.946418567426971e-05, "loss": 0.7312, "mean_token_accuracy": 0.7794413864612579, "num_tokens": 14456274.0, "step": 1390 }, { "entropy": 0.674929165840149, "epoch": 0.0112, "grad_norm": 6.463746070861816, "learning_rate": 4.946018407362946e-05, "loss": 0.6881, "mean_token_accuracy": 0.812126898765564, "num_tokens": 14495617.0, "step": 1400 }, { "entropy": 0.6990273177623749, "epoch": 0.01128, "grad_norm": 1.8149659633636475, "learning_rate": 4.94561824729892e-05, "loss": 0.7038, "mean_token_accuracy": 0.7780776798725129, "num_tokens": 14659457.0, "step": 1410 }, { "entropy": 0.6872620403766632, "epoch": 0.01136, "grad_norm": 4.145060062408447, "learning_rate": 4.945218087234894e-05, "loss": 0.6703, "mean_token_accuracy": 0.8063830614089966, "num_tokens": 14743589.0, "step": 1420 }, { "entropy": 0.718813356757164, "epoch": 0.01144, "grad_norm": 2.683178663253784, "learning_rate": 4.944817927170868e-05, "loss": 0.7329, "mean_token_accuracy": 0.7907787919044494, "num_tokens": 14836692.0, "step": 1430 }, { "entropy": 0.7652511656284332, "epoch": 0.01152, "grad_norm": 3.208057165145874, "learning_rate": 4.944417767106843e-05, "loss": 0.7535, "mean_token_accuracy": 0.7727805972099304, "num_tokens": 14984860.0, "step": 1440 }, { "entropy": 0.6588312357664108, "epoch": 0.0116, "grad_norm": 6.1912736892700195, "learning_rate": 4.9440176070428176e-05, "loss": 0.6628, "mean_token_accuracy": 0.8169248461723327, "num_tokens": 15026267.0, "step": 1450 }, { "entropy": 0.7049889147281647, "epoch": 0.01168, "grad_norm": 1.7298328876495361, "learning_rate": 4.943617446978791e-05, "loss": 0.7077, "mean_token_accuracy": 0.7781509518623352, "num_tokens": 15190107.0, "step": 1460 }, { "entropy": 0.7237937122583389, "epoch": 0.01176, "grad_norm": 4.088359832763672, "learning_rate": 4.9432172869147664e-05, "loss": 0.7229, "mean_token_accuracy": 0.7906395673751831, "num_tokens": 15282293.0, "step": 1470 }, { "entropy": 0.7065132141113282, "epoch": 0.01184, "grad_norm": 2.3079020977020264, "learning_rate": 4.942817126850741e-05, "loss": 0.7272, "mean_token_accuracy": 0.7884982287883758, "num_tokens": 15376453.0, "step": 1480 }, { "entropy": 0.738176304101944, "epoch": 0.01192, "grad_norm": 3.2427749633789062, "learning_rate": 4.942416966786715e-05, "loss": 0.7221, "mean_token_accuracy": 0.7828199028968811, "num_tokens": 15513766.0, "step": 1490 }, { "entropy": 0.6627549588680267, "epoch": 0.012, "grad_norm": 6.320982933044434, "learning_rate": 4.942016806722689e-05, "loss": 0.6626, "mean_token_accuracy": 0.8209040105342865, "num_tokens": 15552192.0, "step": 1500 }, { "entropy": 0.6590524137020111, "epoch": 0.01208, "grad_norm": 2.022886276245117, "learning_rate": 4.941616646658664e-05, "loss": 0.6636, "mean_token_accuracy": 0.7882327914237977, "num_tokens": 15716032.0, "step": 1510 }, { "entropy": 0.6801579594612122, "epoch": 0.01216, "grad_norm": 4.665550231933594, "learning_rate": 4.941216486594638e-05, "loss": 0.6875, "mean_token_accuracy": 0.7992910742759705, "num_tokens": 15809767.0, "step": 1520 }, { "entropy": 0.7226155638694763, "epoch": 0.01224, "grad_norm": 1.968995213508606, "learning_rate": 4.9408163265306126e-05, "loss": 0.7526, "mean_token_accuracy": 0.785333651304245, "num_tokens": 15904014.0, "step": 1530 }, { "entropy": 0.75511834025383, "epoch": 0.01232, "grad_norm": 4.094054222106934, "learning_rate": 4.940416166466587e-05, "loss": 0.7362, "mean_token_accuracy": 0.7787286579608917, "num_tokens": 16041225.0, "step": 1540 }, { "entropy": 0.800292718410492, "epoch": 0.0124, "grad_norm": 8.763503074645996, "learning_rate": 4.9400160064025614e-05, "loss": 0.8021, "mean_token_accuracy": 0.7947818219661713, "num_tokens": 16081160.0, "step": 1550 }, { "entropy": 0.7729340255260467, "epoch": 0.01248, "grad_norm": 2.6140806674957275, "learning_rate": 4.939615846338536e-05, "loss": 0.7779, "mean_token_accuracy": 0.76690753698349, "num_tokens": 16244962.0, "step": 1560 }, { "entropy": 0.723165100812912, "epoch": 0.01256, "grad_norm": 4.730212211608887, "learning_rate": 4.93921568627451e-05, "loss": 0.7097, "mean_token_accuracy": 0.8029869019985199, "num_tokens": 16318938.0, "step": 1570 }, { "entropy": 0.7506249964237213, "epoch": 0.01264, "grad_norm": 2.521047353744507, "learning_rate": 4.9388155262104845e-05, "loss": 0.772, "mean_token_accuracy": 0.7822383522987366, "num_tokens": 16411466.0, "step": 1580 }, { "entropy": 0.7211664974689483, "epoch": 0.01272, "grad_norm": 3.3893063068389893, "learning_rate": 4.938415366146459e-05, "loss": 0.7126, "mean_token_accuracy": 0.7902637839317321, "num_tokens": 16533346.0, "step": 1590 }, { "entropy": 0.7015581846237182, "epoch": 0.0128, "grad_norm": 7.284299373626709, "learning_rate": 4.938015206082433e-05, "loss": 0.7154, "mean_token_accuracy": 0.8097581624984741, "num_tokens": 16566438.0, "step": 1600 }, { "entropy": 0.703062242269516, "epoch": 0.01288, "grad_norm": 2.1744017601013184, "learning_rate": 4.9376150460184076e-05, "loss": 0.6995, "mean_token_accuracy": 0.7848415911197663, "num_tokens": 16729403.0, "step": 1610 }, { "entropy": 0.7642398953437806, "epoch": 0.01296, "grad_norm": 4.316586971282959, "learning_rate": 4.937214885954382e-05, "loss": 0.7765, "mean_token_accuracy": 0.7787926197052002, "num_tokens": 16810118.0, "step": 1620 }, { "entropy": 0.8270363807678223, "epoch": 0.01304, "grad_norm": 2.022434711456299, "learning_rate": 4.936814725890356e-05, "loss": 0.8256, "mean_token_accuracy": 0.7727617084980011, "num_tokens": 16904791.0, "step": 1630 }, { "entropy": 0.6576340228319169, "epoch": 0.01312, "grad_norm": 4.238187789916992, "learning_rate": 4.936414565826331e-05, "loss": 0.6537, "mean_token_accuracy": 0.802699065208435, "num_tokens": 17021901.0, "step": 1640 }, { "entropy": 0.7202340602874756, "epoch": 0.0132, "grad_norm": 7.45681619644165, "learning_rate": 4.936014405762305e-05, "loss": 0.7194, "mean_token_accuracy": 0.8085145831108094, "num_tokens": 17056294.0, "step": 1650 }, { "entropy": 0.6882968485355377, "epoch": 0.01328, "grad_norm": 2.806755781173706, "learning_rate": 4.9356142456982794e-05, "loss": 0.6906, "mean_token_accuracy": 0.7876548767089844, "num_tokens": 17218621.0, "step": 1660 }, { "entropy": 0.6853957414627075, "epoch": 0.01336, "grad_norm": 4.023336410522461, "learning_rate": 4.935214085634254e-05, "loss": 0.689, "mean_token_accuracy": 0.8060727477073669, "num_tokens": 17291632.0, "step": 1670 }, { "entropy": 0.7394333422183991, "epoch": 0.01344, "grad_norm": 2.6105690002441406, "learning_rate": 4.934813925570229e-05, "loss": 0.7489, "mean_token_accuracy": 0.7883708477020264, "num_tokens": 17383946.0, "step": 1680 }, { "entropy": 0.7704400241374969, "epoch": 0.01352, "grad_norm": 2.5703344345092773, "learning_rate": 4.9344137655062026e-05, "loss": 0.761, "mean_token_accuracy": 0.7734412789344788, "num_tokens": 17525471.0, "step": 1690 }, { "entropy": 0.739964359998703, "epoch": 0.0136, "grad_norm": 6.99268102645874, "learning_rate": 4.934013605442177e-05, "loss": 0.7379, "mean_token_accuracy": 0.8063668549060822, "num_tokens": 17564246.0, "step": 1700 }, { "entropy": 0.7139234721660614, "epoch": 0.01368, "grad_norm": 2.1861586570739746, "learning_rate": 4.933613445378151e-05, "loss": 0.7237, "mean_token_accuracy": 0.7817049443721771, "num_tokens": 17728086.0, "step": 1710 }, { "entropy": 0.7639852881431579, "epoch": 0.01376, "grad_norm": 4.428254127502441, "learning_rate": 4.9332132853141263e-05, "loss": 0.7571, "mean_token_accuracy": 0.7887823641300201, "num_tokens": 17814943.0, "step": 1720 }, { "entropy": 0.7138378620147705, "epoch": 0.01384, "grad_norm": 2.1559863090515137, "learning_rate": 4.9328131252501e-05, "loss": 0.7183, "mean_token_accuracy": 0.7890248119831085, "num_tokens": 17911277.0, "step": 1730 }, { "entropy": 0.7076490700244904, "epoch": 0.01392, "grad_norm": 2.8085882663726807, "learning_rate": 4.9324129651860744e-05, "loss": 0.701, "mean_token_accuracy": 0.7887916147708893, "num_tokens": 18046257.0, "step": 1740 }, { "entropy": 0.7487620413303375, "epoch": 0.014, "grad_norm": 5.119711875915527, "learning_rate": 4.9320128051220495e-05, "loss": 0.7596, "mean_token_accuracy": 0.8022952497005462, "num_tokens": 18080821.0, "step": 1750 }, { "entropy": 0.6655975699424743, "epoch": 0.01408, "grad_norm": 2.83647084236145, "learning_rate": 4.931612645058024e-05, "loss": 0.6637, "mean_token_accuracy": 0.7882083475589752, "num_tokens": 18244661.0, "step": 1760 }, { "entropy": 0.7466322958469391, "epoch": 0.01416, "grad_norm": 4.299594879150391, "learning_rate": 4.9312124849939975e-05, "loss": 0.732, "mean_token_accuracy": 0.7958033740520477, "num_tokens": 18333615.0, "step": 1770 }, { "entropy": 0.7363991856575012, "epoch": 0.01424, "grad_norm": 2.031769037246704, "learning_rate": 4.930812324929972e-05, "loss": 0.7735, "mean_token_accuracy": 0.7849662661552429, "num_tokens": 18429331.0, "step": 1780 }, { "entropy": 0.8081060856580734, "epoch": 0.01432, "grad_norm": 3.1572766304016113, "learning_rate": 4.930412164865947e-05, "loss": 0.7917, "mean_token_accuracy": 0.7715187788009643, "num_tokens": 18549501.0, "step": 1790 }, { "entropy": 0.7053633213043213, "epoch": 0.0144, "grad_norm": 5.936098575592041, "learning_rate": 4.930012004801921e-05, "loss": 0.7131, "mean_token_accuracy": 0.8098117411136627, "num_tokens": 18584110.0, "step": 1800 }, { "entropy": 0.6939439952373505, "epoch": 0.01448, "grad_norm": 1.8099853992462158, "learning_rate": 4.929611844737895e-05, "loss": 0.7021, "mean_token_accuracy": 0.7775464177131652, "num_tokens": 18747950.0, "step": 1810 }, { "entropy": 0.7038142293691635, "epoch": 0.01456, "grad_norm": 4.3821868896484375, "learning_rate": 4.9292116846738694e-05, "loss": 0.6964, "mean_token_accuracy": 0.8010329008102417, "num_tokens": 18832182.0, "step": 1820 }, { "entropy": 0.6951277077198028, "epoch": 0.01464, "grad_norm": 4.193861961364746, "learning_rate": 4.9288115246098444e-05, "loss": 0.7112, "mean_token_accuracy": 0.7908429563045501, "num_tokens": 18927348.0, "step": 1830 }, { "entropy": 0.7288851082324982, "epoch": 0.01472, "grad_norm": 4.878921031951904, "learning_rate": 4.928411364545819e-05, "loss": 0.7261, "mean_token_accuracy": 0.787833571434021, "num_tokens": 19043684.0, "step": 1840 }, { "entropy": 0.7665198028087616, "epoch": 0.0148, "grad_norm": 7.43062162399292, "learning_rate": 4.9280112044817925e-05, "loss": 0.7665, "mean_token_accuracy": 0.8058547258377076, "num_tokens": 19076675.0, "step": 1850 }, { "entropy": 0.7143730640411377, "epoch": 0.01488, "grad_norm": 1.9996775388717651, "learning_rate": 4.9276110444177675e-05, "loss": 0.7149, "mean_token_accuracy": 0.7768247365951538, "num_tokens": 19239324.0, "step": 1860 }, { "entropy": 0.7429340839385986, "epoch": 0.01496, "grad_norm": 4.707919597625732, "learning_rate": 4.927210884353742e-05, "loss": 0.7339, "mean_token_accuracy": 0.7924903869628906, "num_tokens": 19311219.0, "step": 1870 }, { "entropy": 0.7730087876319885, "epoch": 0.01504, "grad_norm": 3.031806707382202, "learning_rate": 4.926810724289716e-05, "loss": 0.7906, "mean_token_accuracy": 0.7777944505214691, "num_tokens": 19403593.0, "step": 1880 }, { "entropy": 0.7335931479930877, "epoch": 0.01512, "grad_norm": 3.2720248699188232, "learning_rate": 4.92641056422569e-05, "loss": 0.729, "mean_token_accuracy": 0.7777323126792908, "num_tokens": 19551725.0, "step": 1890 }, { "entropy": 0.686473160982132, "epoch": 0.0152, "grad_norm": 6.55836296081543, "learning_rate": 4.926010404161665e-05, "loss": 0.6719, "mean_token_accuracy": 0.8170666694641113, "num_tokens": 19597283.0, "step": 1900 }, { "entropy": 0.6349276721477508, "epoch": 0.01528, "grad_norm": 2.0298166275024414, "learning_rate": 4.9256102440976394e-05, "loss": 0.6433, "mean_token_accuracy": 0.794565212726593, "num_tokens": 19761123.0, "step": 1910 }, { "entropy": 0.6913957893848419, "epoch": 0.01536, "grad_norm": 5.493407249450684, "learning_rate": 4.925210084033614e-05, "loss": 0.68, "mean_token_accuracy": 0.8051874935626984, "num_tokens": 19845268.0, "step": 1920 }, { "entropy": 0.7211273014545441, "epoch": 0.01544, "grad_norm": 2.2753801345825195, "learning_rate": 4.924809923969588e-05, "loss": 0.7522, "mean_token_accuracy": 0.7877909958362579, "num_tokens": 19940344.0, "step": 1930 }, { "entropy": 0.7095130741596222, "epoch": 0.01552, "grad_norm": 3.1009318828582764, "learning_rate": 4.9244097639055625e-05, "loss": 0.6947, "mean_token_accuracy": 0.7921667456626892, "num_tokens": 20065514.0, "step": 1940 }, { "entropy": 0.7691795051097869, "epoch": 0.0156, "grad_norm": 8.907858848571777, "learning_rate": 4.924009603841537e-05, "loss": 0.7852, "mean_token_accuracy": 0.7939061284065246, "num_tokens": 20100674.0, "step": 1950 }, { "entropy": 0.6877204596996307, "epoch": 0.01568, "grad_norm": 3.0445213317871094, "learning_rate": 4.923609443777511e-05, "loss": 0.6855, "mean_token_accuracy": 0.7866174459457398, "num_tokens": 20261769.0, "step": 1960 }, { "entropy": 0.6305505692958832, "epoch": 0.01576, "grad_norm": 4.121096134185791, "learning_rate": 4.9232092837134856e-05, "loss": 0.6167, "mean_token_accuracy": 0.8215989947319031, "num_tokens": 20333793.0, "step": 1970 }, { "entropy": 0.7370203793048858, "epoch": 0.01584, "grad_norm": 3.1083943843841553, "learning_rate": 4.92280912364946e-05, "loss": 0.7722, "mean_token_accuracy": 0.7815394639968872, "num_tokens": 20427462.0, "step": 1980 }, { "entropy": 0.7156701743602752, "epoch": 0.01592, "grad_norm": 2.2496771812438965, "learning_rate": 4.9224089635854344e-05, "loss": 0.6996, "mean_token_accuracy": 0.7832954943180084, "num_tokens": 20572521.0, "step": 1990 }, { "entropy": 0.6569605737924575, "epoch": 0.016, "grad_norm": 5.435658931732178, "learning_rate": 4.922008803521409e-05, "loss": 0.6554, "mean_token_accuracy": 0.8204359710216522, "num_tokens": 20615619.0, "step": 2000 }, { "entropy": 0.6849721252918244, "epoch": 0.01608, "grad_norm": 2.661966323852539, "learning_rate": 4.921608643457383e-05, "loss": 0.6933, "mean_token_accuracy": 0.7850632250308991, "num_tokens": 20779315.0, "step": 2010 }, { "entropy": 0.7483791887760163, "epoch": 0.01616, "grad_norm": 3.757647752761841, "learning_rate": 4.9212084833933575e-05, "loss": 0.7564, "mean_token_accuracy": 0.7874340236186981, "num_tokens": 20862068.0, "step": 2020 }, { "entropy": 0.731062388420105, "epoch": 0.01624, "grad_norm": 2.0911216735839844, "learning_rate": 4.920808323329332e-05, "loss": 0.7207, "mean_token_accuracy": 0.7994942307472229, "num_tokens": 20953430.0, "step": 2030 }, { "entropy": 0.7369845032691955, "epoch": 0.01632, "grad_norm": 3.413022756576538, "learning_rate": 4.920408163265306e-05, "loss": 0.732, "mean_token_accuracy": 0.7809330284595489, "num_tokens": 21087609.0, "step": 2040 }, { "entropy": 0.7195231497287751, "epoch": 0.0164, "grad_norm": 8.068056106567383, "learning_rate": 4.9200080032012806e-05, "loss": 0.731, "mean_token_accuracy": 0.8029634356498718, "num_tokens": 21126570.0, "step": 2050 }, { "entropy": 0.7511311948299408, "epoch": 0.01648, "grad_norm": 1.9743188619613647, "learning_rate": 4.919607843137255e-05, "loss": 0.7481, "mean_token_accuracy": 0.7726184666156769, "num_tokens": 21290410.0, "step": 2060 }, { "entropy": 0.7849664807319641, "epoch": 0.01656, "grad_norm": 3.858701467514038, "learning_rate": 4.91920768307323e-05, "loss": 0.7862, "mean_token_accuracy": 0.7788618803024292, "num_tokens": 21388119.0, "step": 2070 }, { "entropy": 0.7268196225166321, "epoch": 0.01664, "grad_norm": 1.9189808368682861, "learning_rate": 4.918807523009204e-05, "loss": 0.7318, "mean_token_accuracy": 0.7895957171916962, "num_tokens": 21482844.0, "step": 2080 }, { "entropy": 0.7474613308906555, "epoch": 0.01672, "grad_norm": 2.665999412536621, "learning_rate": 4.918407362945178e-05, "loss": 0.7333, "mean_token_accuracy": 0.7799427390098572, "num_tokens": 21621012.0, "step": 2090 }, { "entropy": 0.7403556883335114, "epoch": 0.0168, "grad_norm": 6.23445463180542, "learning_rate": 4.9180072028811525e-05, "loss": 0.7536, "mean_token_accuracy": 0.8058221399784088, "num_tokens": 21657414.0, "step": 2100 }, { "entropy": 0.6868530333042144, "epoch": 0.01688, "grad_norm": 2.0600368976593018, "learning_rate": 4.9176070428171275e-05, "loss": 0.6903, "mean_token_accuracy": 0.7830300569534302, "num_tokens": 21821254.0, "step": 2110 }, { "entropy": 0.7181076884269715, "epoch": 0.01696, "grad_norm": 4.331672668457031, "learning_rate": 4.917206882753101e-05, "loss": 0.7152, "mean_token_accuracy": 0.797038197517395, "num_tokens": 21906806.0, "step": 2120 }, { "entropy": 0.7748300194740295, "epoch": 0.01704, "grad_norm": 2.0349488258361816, "learning_rate": 4.9168067226890756e-05, "loss": 0.7885, "mean_token_accuracy": 0.7821911692619323, "num_tokens": 21999844.0, "step": 2130 }, { "entropy": 0.7976775884628295, "epoch": 0.01712, "grad_norm": 3.306980848312378, "learning_rate": 4.9164065626250506e-05, "loss": 0.78, "mean_token_accuracy": 0.7748634338378906, "num_tokens": 22133909.0, "step": 2140 }, { "entropy": 0.7229164898395538, "epoch": 0.0172, "grad_norm": 7.1719889640808105, "learning_rate": 4.916006402561025e-05, "loss": 0.7364, "mean_token_accuracy": 0.8085440456867218, "num_tokens": 22168984.0, "step": 2150 }, { "entropy": 0.6932688474655151, "epoch": 0.01728, "grad_norm": 2.3488235473632812, "learning_rate": 4.915606242496999e-05, "loss": 0.6916, "mean_token_accuracy": 0.7855251431465149, "num_tokens": 22331683.0, "step": 2160 }, { "entropy": 0.697521761059761, "epoch": 0.01736, "grad_norm": 5.013505458831787, "learning_rate": 4.915206082432973e-05, "loss": 0.684, "mean_token_accuracy": 0.8082835018634796, "num_tokens": 22404703.0, "step": 2170 }, { "entropy": 0.7458869874477386, "epoch": 0.01744, "grad_norm": 1.9726402759552002, "learning_rate": 4.914805922368948e-05, "loss": 0.7599, "mean_token_accuracy": 0.7834469258785248, "num_tokens": 22497865.0, "step": 2180 }, { "entropy": 0.7242090463638305, "epoch": 0.01752, "grad_norm": 4.543374538421631, "learning_rate": 4.9144057623049225e-05, "loss": 0.7116, "mean_token_accuracy": 0.7853207647800445, "num_tokens": 22641553.0, "step": 2190 }, { "entropy": 0.776950192451477, "epoch": 0.0176, "grad_norm": 7.255126476287842, "learning_rate": 4.914005602240896e-05, "loss": 0.7949, "mean_token_accuracy": 0.790094393491745, "num_tokens": 22677959.0, "step": 2200 }, { "entropy": 0.7580593705177308, "epoch": 0.01768, "grad_norm": 2.240267753601074, "learning_rate": 4.913605442176871e-05, "loss": 0.7552, "mean_token_accuracy": 0.7689477622509002, "num_tokens": 22841292.0, "step": 2210 }, { "entropy": 0.6655529499053955, "epoch": 0.01776, "grad_norm": 3.5556223392486572, "learning_rate": 4.9132052821128456e-05, "loss": 0.6654, "mean_token_accuracy": 0.8058184862136841, "num_tokens": 22929268.0, "step": 2220 }, { "entropy": 0.6898381233215332, "epoch": 0.01784, "grad_norm": 2.176741600036621, "learning_rate": 4.91280512204882e-05, "loss": 0.6942, "mean_token_accuracy": 0.7968876838684082, "num_tokens": 23024515.0, "step": 2230 }, { "entropy": 0.7187251031398774, "epoch": 0.01792, "grad_norm": 2.631596565246582, "learning_rate": 4.9124049619847937e-05, "loss": 0.7082, "mean_token_accuracy": 0.7858498990535736, "num_tokens": 23162204.0, "step": 2240 }, { "entropy": 0.681105625629425, "epoch": 0.018, "grad_norm": 6.879893779754639, "learning_rate": 4.912004801920769e-05, "loss": 0.6941, "mean_token_accuracy": 0.8159716725349426, "num_tokens": 23196886.0, "step": 2250 }, { "entropy": 0.7151732325553894, "epoch": 0.01808, "grad_norm": 2.35425066947937, "learning_rate": 4.911604641856743e-05, "loss": 0.7153, "mean_token_accuracy": 0.7767769932746887, "num_tokens": 23360726.0, "step": 2260 }, { "entropy": 0.7184282124042511, "epoch": 0.01816, "grad_norm": 3.5283377170562744, "learning_rate": 4.9112044817927174e-05, "loss": 0.7291, "mean_token_accuracy": 0.790639591217041, "num_tokens": 23443915.0, "step": 2270 }, { "entropy": 0.7312062859535218, "epoch": 0.01824, "grad_norm": 2.254810094833374, "learning_rate": 4.910804321728692e-05, "loss": 0.7449, "mean_token_accuracy": 0.791928231716156, "num_tokens": 23536834.0, "step": 2280 }, { "entropy": 0.7038416147232056, "epoch": 0.01832, "grad_norm": 4.299782752990723, "learning_rate": 4.910404161664666e-05, "loss": 0.6855, "mean_token_accuracy": 0.7918828725814819, "num_tokens": 23673329.0, "step": 2290 }, { "entropy": 0.7426240682601929, "epoch": 0.0184, "grad_norm": 6.186606407165527, "learning_rate": 4.9100040016006406e-05, "loss": 0.7655, "mean_token_accuracy": 0.7969763696193695, "num_tokens": 23713515.0, "step": 2300 }, { "entropy": 0.698635196685791, "epoch": 0.01848, "grad_norm": 2.2506041526794434, "learning_rate": 4.909603841536615e-05, "loss": 0.7044, "mean_token_accuracy": 0.7821507155895233, "num_tokens": 23877355.0, "step": 2310 }, { "entropy": 0.7054764240980148, "epoch": 0.01856, "grad_norm": 4.311376571655273, "learning_rate": 4.909203681472589e-05, "loss": 0.686, "mean_token_accuracy": 0.8048795104026795, "num_tokens": 23952031.0, "step": 2320 }, { "entropy": 0.6941352009773254, "epoch": 0.01864, "grad_norm": 2.3210690021514893, "learning_rate": 4.908803521408564e-05, "loss": 0.7273, "mean_token_accuracy": 0.7939295589923858, "num_tokens": 24045049.0, "step": 2330 }, { "entropy": 0.7633919000625611, "epoch": 0.01872, "grad_norm": 2.398254871368408, "learning_rate": 4.908403361344538e-05, "loss": 0.7441, "mean_token_accuracy": 0.7799079835414886, "num_tokens": 24180895.0, "step": 2340 }, { "entropy": 0.7836434125900269, "epoch": 0.0188, "grad_norm": 6.497384548187256, "learning_rate": 4.9080032012805124e-05, "loss": 0.7968, "mean_token_accuracy": 0.7929092228412629, "num_tokens": 24217877.0, "step": 2350 }, { "entropy": 0.7242812275886535, "epoch": 0.01888, "grad_norm": 2.940361261367798, "learning_rate": 4.907603041216487e-05, "loss": 0.7193, "mean_token_accuracy": 0.7759870767593384, "num_tokens": 24381384.0, "step": 2360 }, { "entropy": 0.6728922843933105, "epoch": 0.01896, "grad_norm": 2.941958427429199, "learning_rate": 4.907202881152461e-05, "loss": 0.6612, "mean_token_accuracy": 0.8060136198997497, "num_tokens": 24471732.0, "step": 2370 }, { "entropy": 0.7287032902240753, "epoch": 0.01904, "grad_norm": 1.969248652458191, "learning_rate": 4.9068027210884355e-05, "loss": 0.7684, "mean_token_accuracy": 0.7839639842510223, "num_tokens": 24565236.0, "step": 2380 }, { "entropy": 0.7614886283874511, "epoch": 0.01912, "grad_norm": 3.4235217571258545, "learning_rate": 4.90640256102441e-05, "loss": 0.7371, "mean_token_accuracy": 0.7780382692813873, "num_tokens": 24702908.0, "step": 2390 }, { "entropy": 0.7511681735515594, "epoch": 0.0192, "grad_norm": 6.10679292678833, "learning_rate": 4.906002400960384e-05, "loss": 0.7483, "mean_token_accuracy": 0.8016413271427154, "num_tokens": 24740793.0, "step": 2400 }, { "entropy": 0.6858961641788482, "epoch": 0.01928, "grad_norm": 3.158172369003296, "learning_rate": 4.9056022408963586e-05, "loss": 0.6954, "mean_token_accuracy": 0.7834080696105957, "num_tokens": 24904227.0, "step": 2410 }, { "entropy": 0.6956956088542938, "epoch": 0.01936, "grad_norm": 3.4067583084106445, "learning_rate": 4.905202080832333e-05, "loss": 0.6922, "mean_token_accuracy": 0.8060825347900391, "num_tokens": 24973767.0, "step": 2420 }, { "entropy": 0.768239575624466, "epoch": 0.01944, "grad_norm": 1.8181655406951904, "learning_rate": 4.9048019207683074e-05, "loss": 0.7734, "mean_token_accuracy": 0.7814831733703613, "num_tokens": 25066171.0, "step": 2430 }, { "entropy": 0.6940690398216247, "epoch": 0.01952, "grad_norm": 2.8333563804626465, "learning_rate": 4.904401760704282e-05, "loss": 0.6821, "mean_token_accuracy": 0.7886664032936096, "num_tokens": 25206580.0, "step": 2440 }, { "entropy": 0.7463361203670502, "epoch": 0.0196, "grad_norm": 7.173440933227539, "learning_rate": 4.904001600640256e-05, "loss": 0.7378, "mean_token_accuracy": 0.8018305897712708, "num_tokens": 25246132.0, "step": 2450 }, { "entropy": 0.6432995855808258, "epoch": 0.01968, "grad_norm": 1.690577745437622, "learning_rate": 4.903601440576231e-05, "loss": 0.6554, "mean_token_accuracy": 0.7922752797603607, "num_tokens": 25409972.0, "step": 2460 }, { "entropy": 0.7037009119987487, "epoch": 0.01976, "grad_norm": 3.6172828674316406, "learning_rate": 4.903201280512205e-05, "loss": 0.6906, "mean_token_accuracy": 0.7955339968204498, "num_tokens": 25510580.0, "step": 2470 }, { "entropy": 0.7251846253871918, "epoch": 0.01984, "grad_norm": 2.31469988822937, "learning_rate": 4.902801120448179e-05, "loss": 0.7284, "mean_token_accuracy": 0.793659770488739, "num_tokens": 25605585.0, "step": 2480 }, { "entropy": 0.7120417952537537, "epoch": 0.01992, "grad_norm": 2.507295846939087, "learning_rate": 4.9024009603841536e-05, "loss": 0.7098, "mean_token_accuracy": 0.7843227505683898, "num_tokens": 25749655.0, "step": 2490 }, { "entropy": 0.7418233633041382, "epoch": 0.02, "grad_norm": 5.769105434417725, "learning_rate": 4.902000800320129e-05, "loss": 0.7354, "mean_token_accuracy": 0.8019288778305054, "num_tokens": 25793850.0, "step": 2500 }, { "entropy": 0.6791086673736573, "epoch": 0.02008, "grad_norm": 2.8122305870056152, "learning_rate": 4.9016006402561024e-05, "loss": 0.6853, "mean_token_accuracy": 0.7893032848834991, "num_tokens": 25957410.0, "step": 2510 }, { "entropy": 0.6701219260692597, "epoch": 0.02016, "grad_norm": 3.9957923889160156, "learning_rate": 4.901200480192077e-05, "loss": 0.673, "mean_token_accuracy": 0.8048186659812927, "num_tokens": 26043373.0, "step": 2520 }, { "entropy": 0.6848908245563508, "epoch": 0.02024, "grad_norm": 2.7058675289154053, "learning_rate": 4.900800320128052e-05, "loss": 0.688, "mean_token_accuracy": 0.8027214646339417, "num_tokens": 26136151.0, "step": 2530 }, { "entropy": 0.7172627925872803, "epoch": 0.02032, "grad_norm": 3.0802907943725586, "learning_rate": 4.900400160064026e-05, "loss": 0.705, "mean_token_accuracy": 0.7854557275772095, "num_tokens": 26264680.0, "step": 2540 }, { "entropy": 0.7340508431196213, "epoch": 0.0204, "grad_norm": 6.354677200317383, "learning_rate": 4.9e-05, "loss": 0.7561, "mean_token_accuracy": 0.7999424755573272, "num_tokens": 26301463.0, "step": 2550 }, { "entropy": 0.7147069990634918, "epoch": 0.02048, "grad_norm": 2.7016654014587402, "learning_rate": 4.899599839935974e-05, "loss": 0.7189, "mean_token_accuracy": 0.7764289319515228, "num_tokens": 26465303.0, "step": 2560 }, { "entropy": 0.6914149463176728, "epoch": 0.02056, "grad_norm": 4.442059516906738, "learning_rate": 4.899199679871949e-05, "loss": 0.6976, "mean_token_accuracy": 0.8001210272312165, "num_tokens": 26553245.0, "step": 2570 }, { "entropy": 0.7528919696807861, "epoch": 0.02064, "grad_norm": 1.7540302276611328, "learning_rate": 4.8987995198079236e-05, "loss": 0.7443, "mean_token_accuracy": 0.7858653366565704, "num_tokens": 26645925.0, "step": 2580 }, { "entropy": 0.8117323875427246, "epoch": 0.02072, "grad_norm": 2.7747199535369873, "learning_rate": 4.898399359743897e-05, "loss": 0.7987, "mean_token_accuracy": 0.7667474687099457, "num_tokens": 26785641.0, "step": 2590 }, { "entropy": 0.6699220180511475, "epoch": 0.0208, "grad_norm": 8.674081802368164, "learning_rate": 4.8979991996798724e-05, "loss": 0.6717, "mean_token_accuracy": 0.8168933928012848, "num_tokens": 26826274.0, "step": 2600 }, { "entropy": 0.7324708998203278, "epoch": 0.02088, "grad_norm": 2.3795669078826904, "learning_rate": 4.897599039615847e-05, "loss": 0.7425, "mean_token_accuracy": 0.7721954166889191, "num_tokens": 26988341.0, "step": 2610 }, { "entropy": 0.8171143531799316, "epoch": 0.02096, "grad_norm": 4.651123523712158, "learning_rate": 4.897198879551821e-05, "loss": 0.8099, "mean_token_accuracy": 0.7809204757213593, "num_tokens": 27061412.0, "step": 2620 }, { "entropy": 0.7232203125953675, "epoch": 0.02104, "grad_norm": 2.4803590774536133, "learning_rate": 4.896798719487795e-05, "loss": 0.7209, "mean_token_accuracy": 0.788596099615097, "num_tokens": 27156066.0, "step": 2630 }, { "entropy": 0.8008165121078491, "epoch": 0.02112, "grad_norm": 2.668334722518921, "learning_rate": 4.89639855942377e-05, "loss": 0.7941, "mean_token_accuracy": 0.7654213786125184, "num_tokens": 27293182.0, "step": 2640 }, { "entropy": 0.7301748394966125, "epoch": 0.0212, "grad_norm": 6.499892711639404, "learning_rate": 4.895998399359744e-05, "loss": 0.7335, "mean_token_accuracy": 0.8105039417743682, "num_tokens": 27331584.0, "step": 2650 }, { "entropy": 0.7285056471824646, "epoch": 0.02128, "grad_norm": 2.383312225341797, "learning_rate": 4.8955982392957186e-05, "loss": 0.7334, "mean_token_accuracy": 0.7758439242839813, "num_tokens": 27495319.0, "step": 2660 }, { "entropy": 0.749692514538765, "epoch": 0.02136, "grad_norm": 3.9349417686462402, "learning_rate": 4.895198079231693e-05, "loss": 0.7439, "mean_token_accuracy": 0.7956937670707702, "num_tokens": 27572000.0, "step": 2670 }, { "entropy": 0.7304976582527161, "epoch": 0.02144, "grad_norm": 2.9061594009399414, "learning_rate": 4.8947979191676673e-05, "loss": 0.7529, "mean_token_accuracy": 0.7874346435070038, "num_tokens": 27665891.0, "step": 2680 }, { "entropy": 0.7112353265285491, "epoch": 0.02152, "grad_norm": 3.104005813598633, "learning_rate": 4.894397759103642e-05, "loss": 0.7079, "mean_token_accuracy": 0.786689031124115, "num_tokens": 27804174.0, "step": 2690 }, { "entropy": 0.6914333999156952, "epoch": 0.0216, "grad_norm": 5.114171028137207, "learning_rate": 4.893997599039616e-05, "loss": 0.6949, "mean_token_accuracy": 0.8134482145309448, "num_tokens": 27842953.0, "step": 2700 }, { "entropy": 0.6572482228279114, "epoch": 0.02168, "grad_norm": 2.3616247177124023, "learning_rate": 4.8935974389755905e-05, "loss": 0.6558, "mean_token_accuracy": 0.7935986876487732, "num_tokens": 28005621.0, "step": 2710 }, { "entropy": 0.6790615618228912, "epoch": 0.02176, "grad_norm": 5.170160293579102, "learning_rate": 4.893197278911565e-05, "loss": 0.6736, "mean_token_accuracy": 0.8081604897975921, "num_tokens": 28075384.0, "step": 2720 }, { "entropy": 0.6716658890247345, "epoch": 0.02184, "grad_norm": 2.261577606201172, "learning_rate": 4.892797118847539e-05, "loss": 0.6842, "mean_token_accuracy": 0.8012938499450684, "num_tokens": 28169005.0, "step": 2730 }, { "entropy": 0.8217464864253998, "epoch": 0.02192, "grad_norm": 2.8059489727020264, "learning_rate": 4.8923969587835136e-05, "loss": 0.7991, "mean_token_accuracy": 0.7674567878246308, "num_tokens": 28310257.0, "step": 2740 }, { "entropy": 0.6272239625453949, "epoch": 0.022, "grad_norm": 7.06040096282959, "learning_rate": 4.891996798719488e-05, "loss": 0.6373, "mean_token_accuracy": 0.8243892788887024, "num_tokens": 28346296.0, "step": 2750 }, { "entropy": 0.7363572716712952, "epoch": 0.02208, "grad_norm": 2.3114430904388428, "learning_rate": 4.891596638655462e-05, "loss": 0.7439, "mean_token_accuracy": 0.7719956040382385, "num_tokens": 28510136.0, "step": 2760 }, { "entropy": 0.7002906262874603, "epoch": 0.02216, "grad_norm": 4.178305149078369, "learning_rate": 4.891196478591437e-05, "loss": 0.6958, "mean_token_accuracy": 0.7976906716823577, "num_tokens": 28599782.0, "step": 2770 }, { "entropy": 0.7178764700889587, "epoch": 0.02224, "grad_norm": 2.040062189102173, "learning_rate": 4.890796318527411e-05, "loss": 0.7321, "mean_token_accuracy": 0.7957144975662231, "num_tokens": 28693456.0, "step": 2780 }, { "entropy": 0.8076128542423249, "epoch": 0.02232, "grad_norm": 2.596829414367676, "learning_rate": 4.8903961584633854e-05, "loss": 0.797, "mean_token_accuracy": 0.7638094544410705, "num_tokens": 28833760.0, "step": 2790 }, { "entropy": 0.6698639750480652, "epoch": 0.0224, "grad_norm": 7.673576831817627, "learning_rate": 4.88999599839936e-05, "loss": 0.6772, "mean_token_accuracy": 0.8173590838909149, "num_tokens": 28873214.0, "step": 2800 }, { "entropy": 0.6763679146766662, "epoch": 0.02248, "grad_norm": 2.149092197418213, "learning_rate": 4.889595838335335e-05, "loss": 0.6758, "mean_token_accuracy": 0.7878786087036133, "num_tokens": 29037054.0, "step": 2810 }, { "entropy": 0.6948373436927795, "epoch": 0.02256, "grad_norm": 3.5476572513580322, "learning_rate": 4.8891956782713085e-05, "loss": 0.6914, "mean_token_accuracy": 0.801122635602951, "num_tokens": 29124490.0, "step": 2820 }, { "entropy": 0.7224424839019775, "epoch": 0.02264, "grad_norm": 2.1704185009002686, "learning_rate": 4.888795518207283e-05, "loss": 0.744, "mean_token_accuracy": 0.789102065563202, "num_tokens": 29216818.0, "step": 2830 }, { "entropy": 0.725955355167389, "epoch": 0.02272, "grad_norm": 4.27885627746582, "learning_rate": 4.888395358143257e-05, "loss": 0.7187, "mean_token_accuracy": 0.7847002744674683, "num_tokens": 29344602.0, "step": 2840 }, { "entropy": 0.6338236063718796, "epoch": 0.0228, "grad_norm": 7.099063396453857, "learning_rate": 4.887995198079232e-05, "loss": 0.6311, "mean_token_accuracy": 0.8253203451633453, "num_tokens": 29384090.0, "step": 2850 }, { "entropy": 0.7195428550243378, "epoch": 0.02288, "grad_norm": 2.8732879161834717, "learning_rate": 4.887595038015206e-05, "loss": 0.7193, "mean_token_accuracy": 0.7814423501491546, "num_tokens": 29547930.0, "step": 2860 }, { "entropy": 0.7097188770771027, "epoch": 0.02296, "grad_norm": 4.520651817321777, "learning_rate": 4.8871948779511804e-05, "loss": 0.7138, "mean_token_accuracy": 0.7965745449066162, "num_tokens": 29647269.0, "step": 2870 }, { "entropy": 0.7630622118711472, "epoch": 0.02304, "grad_norm": 3.446641445159912, "learning_rate": 4.8867947178871555e-05, "loss": 0.7821, "mean_token_accuracy": 0.7854799807071686, "num_tokens": 29740664.0, "step": 2880 }, { "entropy": 0.7416740119457245, "epoch": 0.02312, "grad_norm": 2.939789056777954, "learning_rate": 4.88639455782313e-05, "loss": 0.7231, "mean_token_accuracy": 0.7825192391872406, "num_tokens": 29880563.0, "step": 2890 }, { "entropy": 0.7019606113433838, "epoch": 0.0232, "grad_norm": 6.802233695983887, "learning_rate": 4.8859943977591035e-05, "loss": 0.7217, "mean_token_accuracy": 0.8121614694595337, "num_tokens": 29913277.0, "step": 2900 }, { "entropy": 0.671284967660904, "epoch": 0.02328, "grad_norm": 2.269779682159424, "learning_rate": 4.885594237695078e-05, "loss": 0.6769, "mean_token_accuracy": 0.7863886237144471, "num_tokens": 30077117.0, "step": 2910 }, { "entropy": 0.7451409816741943, "epoch": 0.02336, "grad_norm": 4.112615585327148, "learning_rate": 4.885194077631053e-05, "loss": 0.738, "mean_token_accuracy": 0.7888492703437805, "num_tokens": 30167518.0, "step": 2920 }, { "entropy": 0.7404127359390259, "epoch": 0.02344, "grad_norm": 2.1681833267211914, "learning_rate": 4.884793917567027e-05, "loss": 0.7389, "mean_token_accuracy": 0.7888350903987884, "num_tokens": 30261583.0, "step": 2930 }, { "entropy": 0.8294391453266143, "epoch": 0.02352, "grad_norm": 3.3588881492614746, "learning_rate": 4.884393757503001e-05, "loss": 0.8175, "mean_token_accuracy": 0.7602410554885864, "num_tokens": 30409155.0, "step": 2940 }, { "entropy": 0.6610238790512085, "epoch": 0.0236, "grad_norm": 5.890703201293945, "learning_rate": 4.8839935974389754e-05, "loss": 0.6676, "mean_token_accuracy": 0.818226981163025, "num_tokens": 30454376.0, "step": 2950 }, { "entropy": 0.6473793029785156, "epoch": 0.02368, "grad_norm": 1.991442322731018, "learning_rate": 4.8835934373749504e-05, "loss": 0.652, "mean_token_accuracy": 0.7931668341159821, "num_tokens": 30618216.0, "step": 2960 }, { "entropy": 0.7378620386123658, "epoch": 0.02376, "grad_norm": 5.117310523986816, "learning_rate": 4.883193277310925e-05, "loss": 0.7359, "mean_token_accuracy": 0.7958198130130768, "num_tokens": 30697313.0, "step": 2970 }, { "entropy": 0.7611470460891724, "epoch": 0.02384, "grad_norm": 2.126880645751953, "learning_rate": 4.8827931172468985e-05, "loss": 0.7833, "mean_token_accuracy": 0.7807770729064941, "num_tokens": 30791575.0, "step": 2980 }, { "entropy": 0.7802484214305878, "epoch": 0.02392, "grad_norm": 3.563014507293701, "learning_rate": 4.8823929571828735e-05, "loss": 0.7663, "mean_token_accuracy": 0.7722851932048798, "num_tokens": 30929193.0, "step": 2990 }, { "entropy": 0.6475292831659317, "epoch": 0.024, "grad_norm": 6.105114936828613, "learning_rate": 4.881992797118848e-05, "loss": 0.6507, "mean_token_accuracy": 0.819639253616333, "num_tokens": 30968250.0, "step": 3000 }, { "entropy": 0.6572730362415313, "epoch": 0.02408, "grad_norm": 2.499779462814331, "learning_rate": 4.881592637054822e-05, "loss": 0.656, "mean_token_accuracy": 0.7930996596813202, "num_tokens": 31132090.0, "step": 3010 }, { "entropy": 0.6799336969852448, "epoch": 0.02416, "grad_norm": 4.459892749786377, "learning_rate": 4.881192476990796e-05, "loss": 0.6794, "mean_token_accuracy": 0.8008711576461792, "num_tokens": 31226979.0, "step": 3020 }, { "entropy": 0.8065119802951812, "epoch": 0.02424, "grad_norm": 2.3651506900787354, "learning_rate": 4.880792316926771e-05, "loss": 0.8286, "mean_token_accuracy": 0.7734039962291718, "num_tokens": 31321048.0, "step": 3030 }, { "entropy": 0.7449450492858887, "epoch": 0.02432, "grad_norm": 3.7420153617858887, "learning_rate": 4.8803921568627454e-05, "loss": 0.7263, "mean_token_accuracy": 0.7828940451145172, "num_tokens": 31465146.0, "step": 3040 }, { "entropy": 0.7828267335891723, "epoch": 0.0244, "grad_norm": 5.264736175537109, "learning_rate": 4.87999199679872e-05, "loss": 0.7757, "mean_token_accuracy": 0.7957457423210144, "num_tokens": 31504716.0, "step": 3050 }, { "entropy": 0.6328795552253723, "epoch": 0.02448, "grad_norm": 2.640345573425293, "learning_rate": 4.879591836734694e-05, "loss": 0.6389, "mean_token_accuracy": 0.7967940986156463, "num_tokens": 31668556.0, "step": 3060 }, { "entropy": 0.6620375990867615, "epoch": 0.02456, "grad_norm": 3.3788366317749023, "learning_rate": 4.8791916766706685e-05, "loss": 0.6548, "mean_token_accuracy": 0.8049805700778961, "num_tokens": 31766079.0, "step": 3070 }, { "entropy": 0.6728892624378204, "epoch": 0.02464, "grad_norm": 1.9122313261032104, "learning_rate": 4.878791516606643e-05, "loss": 0.6905, "mean_token_accuracy": 0.7992009162902832, "num_tokens": 31862077.0, "step": 3080 }, { "entropy": 0.7081407994031906, "epoch": 0.02472, "grad_norm": 3.080549478530884, "learning_rate": 4.878391356542617e-05, "loss": 0.6794, "mean_token_accuracy": 0.7939058363437652, "num_tokens": 31994728.0, "step": 3090 }, { "entropy": 0.6898452281951905, "epoch": 0.0248, "grad_norm": 6.533700942993164, "learning_rate": 4.8779911964785916e-05, "loss": 0.6937, "mean_token_accuracy": 0.8115132212638855, "num_tokens": 32029753.0, "step": 3100 }, { "entropy": 0.7245996296405792, "epoch": 0.02488, "grad_norm": 2.109569787979126, "learning_rate": 4.877591036414566e-05, "loss": 0.7373, "mean_token_accuracy": 0.7784621775150299, "num_tokens": 32189868.0, "step": 3110 }, { "entropy": 0.7425740182399749, "epoch": 0.02496, "grad_norm": 3.9410858154296875, "learning_rate": 4.8771908763505404e-05, "loss": 0.7372, "mean_token_accuracy": 0.7936085939407349, "num_tokens": 32264529.0, "step": 3120 }, { "entropy": 0.7893040597438812, "epoch": 0.02504, "grad_norm": 2.5522754192352295, "learning_rate": 4.876790716286515e-05, "loss": 0.8026, "mean_token_accuracy": 0.7737445831298828, "num_tokens": 32359283.0, "step": 3130 }, { "entropy": 0.7763759613037109, "epoch": 0.02512, "grad_norm": 2.753666400909424, "learning_rate": 4.876390556222489e-05, "loss": 0.7671, "mean_token_accuracy": 0.7742923498153687, "num_tokens": 32502594.0, "step": 3140 }, { "entropy": 0.7397145926952362, "epoch": 0.0252, "grad_norm": 6.098538875579834, "learning_rate": 4.8759903961584635e-05, "loss": 0.725, "mean_token_accuracy": 0.8029037415981293, "num_tokens": 32543146.0, "step": 3150 }, { "entropy": 0.6883262097835541, "epoch": 0.02528, "grad_norm": 3.274611473083496, "learning_rate": 4.875590236094438e-05, "loss": 0.7019, "mean_token_accuracy": 0.7838560700416565, "num_tokens": 32706183.0, "step": 3160 }, { "entropy": 0.7625058650970459, "epoch": 0.02536, "grad_norm": 5.222766399383545, "learning_rate": 4.875190076030412e-05, "loss": 0.7552, "mean_token_accuracy": 0.7933647990226745, "num_tokens": 32777132.0, "step": 3170 }, { "entropy": 0.653196144104004, "epoch": 0.02544, "grad_norm": 1.8034346103668213, "learning_rate": 4.8747899159663866e-05, "loss": 0.648, "mean_token_accuracy": 0.8098230123519897, "num_tokens": 32869457.0, "step": 3180 }, { "entropy": 0.6989150166511535, "epoch": 0.02552, "grad_norm": 2.672744035720825, "learning_rate": 4.874389755902361e-05, "loss": 0.709, "mean_token_accuracy": 0.7881196320056916, "num_tokens": 33009439.0, "step": 3190 }, { "entropy": 0.7803011178970337, "epoch": 0.0256, "grad_norm": 5.969078540802002, "learning_rate": 4.873989595838336e-05, "loss": 0.7557, "mean_token_accuracy": 0.7983561754226685, "num_tokens": 33047225.0, "step": 3200 }, { "entropy": 0.6770975112915039, "epoch": 0.02568, "grad_norm": 2.050699234008789, "learning_rate": 4.87358943577431e-05, "loss": 0.6801, "mean_token_accuracy": 0.7836529076099396, "num_tokens": 33211065.0, "step": 3210 }, { "entropy": 0.7397586941719055, "epoch": 0.02576, "grad_norm": 3.4407095909118652, "learning_rate": 4.873189275710284e-05, "loss": 0.7374, "mean_token_accuracy": 0.7875098466873169, "num_tokens": 33300495.0, "step": 3220 }, { "entropy": 0.7095887184143066, "epoch": 0.02584, "grad_norm": 1.7816894054412842, "learning_rate": 4.8727891156462584e-05, "loss": 0.7256, "mean_token_accuracy": 0.7961576044559479, "num_tokens": 33393665.0, "step": 3230 }, { "entropy": 0.7202504575252533, "epoch": 0.02592, "grad_norm": 4.0221848487854, "learning_rate": 4.8723889555822335e-05, "loss": 0.7123, "mean_token_accuracy": 0.7841461598873138, "num_tokens": 33541797.0, "step": 3240 }, { "entropy": 0.7046086490154266, "epoch": 0.026, "grad_norm": 6.3341169357299805, "learning_rate": 4.871988795518207e-05, "loss": 0.7078, "mean_token_accuracy": 0.8143023610115051, "num_tokens": 33582516.0, "step": 3250 }, { "entropy": 0.6845048606395722, "epoch": 0.02608, "grad_norm": 1.6917880773544312, "learning_rate": 4.8715886354541816e-05, "loss": 0.6904, "mean_token_accuracy": 0.7830972254276276, "num_tokens": 33746356.0, "step": 3260 }, { "entropy": 0.6578941166400909, "epoch": 0.02616, "grad_norm": 3.6678435802459717, "learning_rate": 4.8711884753901566e-05, "loss": 0.6527, "mean_token_accuracy": 0.8045732378959656, "num_tokens": 33840648.0, "step": 3270 }, { "entropy": 0.6879896283149719, "epoch": 0.02624, "grad_norm": 2.918776512145996, "learning_rate": 4.870788315326131e-05, "loss": 0.7107, "mean_token_accuracy": 0.7949449300765992, "num_tokens": 33935500.0, "step": 3280 }, { "entropy": 0.7077608048915863, "epoch": 0.02632, "grad_norm": 4.077775001525879, "learning_rate": 4.870388155262105e-05, "loss": 0.687, "mean_token_accuracy": 0.7908284306526184, "num_tokens": 34073088.0, "step": 3290 }, { "entropy": 0.6874390542507172, "epoch": 0.0264, "grad_norm": 6.449263572692871, "learning_rate": 4.869987995198079e-05, "loss": 0.6878, "mean_token_accuracy": 0.8150155901908874, "num_tokens": 34113666.0, "step": 3300 }, { "entropy": 0.672713303565979, "epoch": 0.02648, "grad_norm": 1.963557243347168, "learning_rate": 4.869587835134054e-05, "loss": 0.6768, "mean_token_accuracy": 0.7838910758495331, "num_tokens": 34277506.0, "step": 3310 }, { "entropy": 0.8219260394573211, "epoch": 0.02656, "grad_norm": 4.043260097503662, "learning_rate": 4.8691876750700285e-05, "loss": 0.8176, "mean_token_accuracy": 0.7791335761547089, "num_tokens": 34351823.0, "step": 3320 }, { "entropy": 0.6958089590072631, "epoch": 0.02664, "grad_norm": 2.1680588722229004, "learning_rate": 4.868787515006002e-05, "loss": 0.7107, "mean_token_accuracy": 0.8021677553653717, "num_tokens": 34443108.0, "step": 3330 }, { "entropy": 0.7094524085521698, "epoch": 0.02672, "grad_norm": 3.2990455627441406, "learning_rate": 4.868387354941977e-05, "loss": 0.69, "mean_token_accuracy": 0.7909932017326355, "num_tokens": 34574270.0, "step": 3340 }, { "entropy": 0.7457550436258316, "epoch": 0.0268, "grad_norm": 6.442076206207275, "learning_rate": 4.8679871948779516e-05, "loss": 0.7592, "mean_token_accuracy": 0.8003584027290345, "num_tokens": 34612673.0, "step": 3350 }, { "entropy": 0.6830656886100769, "epoch": 0.02688, "grad_norm": 1.862870216369629, "learning_rate": 4.867587034813926e-05, "loss": 0.6835, "mean_token_accuracy": 0.7861669838428498, "num_tokens": 34775638.0, "step": 3360 }, { "entropy": 0.6682548671960831, "epoch": 0.02696, "grad_norm": 3.8543732166290283, "learning_rate": 4.8671868747498996e-05, "loss": 0.6583, "mean_token_accuracy": 0.8117214143276215, "num_tokens": 34852805.0, "step": 3370 }, { "entropy": 0.7161305248737335, "epoch": 0.02704, "grad_norm": 2.3962514400482178, "learning_rate": 4.866786714685875e-05, "loss": 0.7341, "mean_token_accuracy": 0.7923914432525635, "num_tokens": 34945830.0, "step": 3380 }, { "entropy": 0.7227741420269013, "epoch": 0.02712, "grad_norm": 2.998955249786377, "learning_rate": 4.866386554621849e-05, "loss": 0.7097, "mean_token_accuracy": 0.787630832195282, "num_tokens": 35076425.0, "step": 3390 }, { "entropy": 0.7278798639774322, "epoch": 0.0272, "grad_norm": 7.6753010749816895, "learning_rate": 4.8659863945578234e-05, "loss": 0.7173, "mean_token_accuracy": 0.8094954490661621, "num_tokens": 35111771.0, "step": 3400 }, { "entropy": 0.6973368495702743, "epoch": 0.02728, "grad_norm": 2.439821720123291, "learning_rate": 4.865586234493798e-05, "loss": 0.7038, "mean_token_accuracy": 0.7852036654949188, "num_tokens": 35274347.0, "step": 3410 }, { "entropy": 0.7263358950614929, "epoch": 0.02736, "grad_norm": 4.342244625091553, "learning_rate": 4.865186074429772e-05, "loss": 0.7253, "mean_token_accuracy": 0.7959439396858216, "num_tokens": 35353606.0, "step": 3420 }, { "entropy": 0.7684261798858643, "epoch": 0.02744, "grad_norm": 2.6469712257385254, "learning_rate": 4.8647859143657466e-05, "loss": 0.7795, "mean_token_accuracy": 0.783309155702591, "num_tokens": 35446578.0, "step": 3430 }, { "entropy": 0.7664063274860382, "epoch": 0.02752, "grad_norm": 3.7330009937286377, "learning_rate": 4.864385754301721e-05, "loss": 0.7603, "mean_token_accuracy": 0.7756162524223328, "num_tokens": 35580122.0, "step": 3440 }, { "entropy": 0.6178101390600205, "epoch": 0.0276, "grad_norm": 6.7943291664123535, "learning_rate": 4.863985594237695e-05, "loss": 0.6177, "mean_token_accuracy": 0.8306280851364136, "num_tokens": 35619584.0, "step": 3450 }, { "entropy": 0.679541540145874, "epoch": 0.02768, "grad_norm": 2.6590065956115723, "learning_rate": 4.86358543417367e-05, "loss": 0.6814, "mean_token_accuracy": 0.7852650165557862, "num_tokens": 35783424.0, "step": 3460 }, { "entropy": 0.6508220255374908, "epoch": 0.02776, "grad_norm": 3.176647186279297, "learning_rate": 4.863185274109644e-05, "loss": 0.6513, "mean_token_accuracy": 0.8100606083869935, "num_tokens": 35873012.0, "step": 3470 }, { "entropy": 0.725865113735199, "epoch": 0.02784, "grad_norm": 2.3940484523773193, "learning_rate": 4.8627851140456184e-05, "loss": 0.7419, "mean_token_accuracy": 0.7874645173549653, "num_tokens": 35966985.0, "step": 3480 }, { "entropy": 0.7799921989440918, "epoch": 0.02792, "grad_norm": 3.377959966659546, "learning_rate": 4.862384953981593e-05, "loss": 0.7581, "mean_token_accuracy": 0.7762013792991638, "num_tokens": 36112603.0, "step": 3490 }, { "entropy": 0.6595322728157044, "epoch": 0.028, "grad_norm": 6.063498497009277, "learning_rate": 4.861984793917567e-05, "loss": 0.6797, "mean_token_accuracy": 0.8124588012695313, "num_tokens": 36157431.0, "step": 3500 }, { "entropy": 0.6499921053647995, "epoch": 0.02808, "grad_norm": 2.0988099575042725, "learning_rate": 4.8615846338535415e-05, "loss": 0.6499, "mean_token_accuracy": 0.7942782163619995, "num_tokens": 36321271.0, "step": 3510 }, { "entropy": 0.7380377978086472, "epoch": 0.02816, "grad_norm": 3.447129011154175, "learning_rate": 4.8611844737895166e-05, "loss": 0.7355, "mean_token_accuracy": 0.7922214150428772, "num_tokens": 36413496.0, "step": 3520 }, { "entropy": 0.7175328254699707, "epoch": 0.02824, "grad_norm": 3.160722255706787, "learning_rate": 4.86078431372549e-05, "loss": 0.7275, "mean_token_accuracy": 0.7924160182476043, "num_tokens": 36505631.0, "step": 3530 }, { "entropy": 0.682248717546463, "epoch": 0.02832, "grad_norm": 2.938002109527588, "learning_rate": 4.8603841536614646e-05, "loss": 0.6702, "mean_token_accuracy": 0.7957422614097596, "num_tokens": 36637532.0, "step": 3540 }, { "entropy": 0.7062933176755906, "epoch": 0.0284, "grad_norm": 6.057903289794922, "learning_rate": 4.859983993597439e-05, "loss": 0.7168, "mean_token_accuracy": 0.8081153273582459, "num_tokens": 36673154.0, "step": 3550 }, { "entropy": 0.6261008560657502, "epoch": 0.02848, "grad_norm": 1.7074294090270996, "learning_rate": 4.859583833533414e-05, "loss": 0.6247, "mean_token_accuracy": 0.8000545024871826, "num_tokens": 36836666.0, "step": 3560 }, { "entropy": 0.6115414768457412, "epoch": 0.02856, "grad_norm": 4.514317512512207, "learning_rate": 4.859183673469388e-05, "loss": 0.6036, "mean_token_accuracy": 0.823433804512024, "num_tokens": 36905601.0, "step": 3570 }, { "entropy": 0.7417728722095489, "epoch": 0.02864, "grad_norm": 1.857154130935669, "learning_rate": 4.858783513405362e-05, "loss": 0.7762, "mean_token_accuracy": 0.7839767694473266, "num_tokens": 36998656.0, "step": 3580 }, { "entropy": 0.7157347142696381, "epoch": 0.02872, "grad_norm": 2.4326133728027344, "learning_rate": 4.858383353341337e-05, "loss": 0.7024, "mean_token_accuracy": 0.7895205080509186, "num_tokens": 37135350.0, "step": 3590 }, { "entropy": 0.7251774847507477, "epoch": 0.0288, "grad_norm": 5.4248151779174805, "learning_rate": 4.8579831932773115e-05, "loss": 0.7187, "mean_token_accuracy": 0.8076128423213959, "num_tokens": 37180927.0, "step": 3600 }, { "entropy": 0.6965439319610596, "epoch": 0.02888, "grad_norm": 1.9491922855377197, "learning_rate": 4.857583033213285e-05, "loss": 0.6994, "mean_token_accuracy": 0.7827735722064972, "num_tokens": 37344767.0, "step": 3610 }, { "entropy": 0.7125351309776307, "epoch": 0.02896, "grad_norm": 3.7980659008026123, "learning_rate": 4.8571828731492596e-05, "loss": 0.7044, "mean_token_accuracy": 0.7997235536575318, "num_tokens": 37431797.0, "step": 3620 }, { "entropy": 0.6712880671024323, "epoch": 0.02904, "grad_norm": 2.534447431564331, "learning_rate": 4.8567827130852347e-05, "loss": 0.6774, "mean_token_accuracy": 0.8011751532554626, "num_tokens": 37525700.0, "step": 3630 }, { "entropy": 0.7325255036354065, "epoch": 0.02912, "grad_norm": 3.2565786838531494, "learning_rate": 4.856382553021209e-05, "loss": 0.7269, "mean_token_accuracy": 0.7789775729179382, "num_tokens": 37660078.0, "step": 3640 }, { "entropy": 0.710261783003807, "epoch": 0.0292, "grad_norm": 6.5651116371154785, "learning_rate": 4.855982392957183e-05, "loss": 0.7057, "mean_token_accuracy": 0.8141268074512482, "num_tokens": 37698439.0, "step": 3650 }, { "entropy": 0.6456060707569122, "epoch": 0.02928, "grad_norm": 2.195666551589966, "learning_rate": 4.855582232893158e-05, "loss": 0.6486, "mean_token_accuracy": 0.796647846698761, "num_tokens": 37861683.0, "step": 3660 }, { "entropy": 0.7833813071250916, "epoch": 0.02936, "grad_norm": 4.223024368286133, "learning_rate": 4.855182072829132e-05, "loss": 0.768, "mean_token_accuracy": 0.7916329801082611, "num_tokens": 37929853.0, "step": 3670 }, { "entropy": 0.7134731113910675, "epoch": 0.02944, "grad_norm": 1.8442155122756958, "learning_rate": 4.8547819127651065e-05, "loss": 0.7279, "mean_token_accuracy": 0.7941513299942017, "num_tokens": 38021615.0, "step": 3680 }, { "entropy": 0.7260631203651429, "epoch": 0.02952, "grad_norm": 3.563992738723755, "learning_rate": 4.85438175270108e-05, "loss": 0.7223, "mean_token_accuracy": 0.7836956024169922, "num_tokens": 38156684.0, "step": 3690 }, { "entropy": 0.7171819806098938, "epoch": 0.0296, "grad_norm": 6.491350173950195, "learning_rate": 4.853981592637055e-05, "loss": 0.7035, "mean_token_accuracy": 0.8122252583503723, "num_tokens": 38196226.0, "step": 3700 }, { "entropy": 0.6663235366344452, "epoch": 0.02968, "grad_norm": 2.224949598312378, "learning_rate": 4.8535814325730296e-05, "loss": 0.6747, "mean_token_accuracy": 0.7887937724590302, "num_tokens": 38359938.0, "step": 3710 }, { "entropy": 0.6514915376901627, "epoch": 0.02976, "grad_norm": 3.802734136581421, "learning_rate": 4.853181272509004e-05, "loss": 0.6506, "mean_token_accuracy": 0.8138774931430817, "num_tokens": 38436829.0, "step": 3720 }, { "entropy": 0.7358646869659424, "epoch": 0.02984, "grad_norm": 1.6645610332489014, "learning_rate": 4.8527811124449784e-05, "loss": 0.7391, "mean_token_accuracy": 0.7931171238422394, "num_tokens": 38529433.0, "step": 3730 }, { "entropy": 0.7030475020408631, "epoch": 0.02992, "grad_norm": 2.732557773590088, "learning_rate": 4.852380952380953e-05, "loss": 0.6933, "mean_token_accuracy": 0.7926405787467956, "num_tokens": 38660938.0, "step": 3740 }, { "entropy": 0.7358006536960602, "epoch": 0.03, "grad_norm": 7.073787212371826, "learning_rate": 4.851980792316927e-05, "loss": 0.7352, "mean_token_accuracy": 0.8079196393489838, "num_tokens": 38696961.0, "step": 3750 }, { "entropy": 0.6882951974868774, "epoch": 0.03008, "grad_norm": 2.02770733833313, "learning_rate": 4.8515806322529015e-05, "loss": 0.6974, "mean_token_accuracy": 0.7818209588527679, "num_tokens": 38860801.0, "step": 3760 }, { "entropy": 0.6735322326421738, "epoch": 0.03016, "grad_norm": 5.068566799163818, "learning_rate": 4.851180472188876e-05, "loss": 0.6724, "mean_token_accuracy": 0.805264002084732, "num_tokens": 38947879.0, "step": 3770 }, { "entropy": 0.7024335145950318, "epoch": 0.03024, "grad_norm": 2.432997465133667, "learning_rate": 4.85078031212485e-05, "loss": 0.7129, "mean_token_accuracy": 0.7991443514823914, "num_tokens": 39042418.0, "step": 3780 }, { "entropy": 0.6967293977737427, "epoch": 0.03032, "grad_norm": 2.9631309509277344, "learning_rate": 4.8503801520608246e-05, "loss": 0.6813, "mean_token_accuracy": 0.7881213784217834, "num_tokens": 39189266.0, "step": 3790 }, { "entropy": 0.7899814724922181, "epoch": 0.0304, "grad_norm": 6.4735636711120605, "learning_rate": 4.849979991996799e-05, "loss": 0.821, "mean_token_accuracy": 0.7900913715362549, "num_tokens": 39235225.0, "step": 3800 }, { "entropy": 0.734240448474884, "epoch": 0.03048, "grad_norm": 2.2725253105163574, "learning_rate": 4.8495798319327733e-05, "loss": 0.7305, "mean_token_accuracy": 0.7751947045326233, "num_tokens": 39399028.0, "step": 3810 }, { "entropy": 0.7426746308803558, "epoch": 0.03056, "grad_norm": 4.523082256317139, "learning_rate": 4.849179671868748e-05, "loss": 0.7402, "mean_token_accuracy": 0.7931177735328674, "num_tokens": 39479570.0, "step": 3820 }, { "entropy": 0.7246798634529114, "epoch": 0.03064, "grad_norm": 2.8490898609161377, "learning_rate": 4.848779511804722e-05, "loss": 0.7423, "mean_token_accuracy": 0.7899782240390778, "num_tokens": 39573115.0, "step": 3830 }, { "entropy": 0.7181077182292939, "epoch": 0.03072, "grad_norm": 4.522273063659668, "learning_rate": 4.8483793517406965e-05, "loss": 0.7034, "mean_token_accuracy": 0.7878127157688141, "num_tokens": 39714319.0, "step": 3840 }, { "entropy": 0.7610641896724701, "epoch": 0.0308, "grad_norm": 6.506425380706787, "learning_rate": 4.847979191676671e-05, "loss": 0.752, "mean_token_accuracy": 0.8017332017421722, "num_tokens": 39756220.0, "step": 3850 }, { "entropy": 0.7390204071998596, "epoch": 0.03088, "grad_norm": 2.0331614017486572, "learning_rate": 4.847579031612645e-05, "loss": 0.7484, "mean_token_accuracy": 0.7690098524093628, "num_tokens": 39919642.0, "step": 3860 }, { "entropy": 0.8063155353069306, "epoch": 0.03096, "grad_norm": 3.9870855808258057, "learning_rate": 4.84717887154862e-05, "loss": 0.7982, "mean_token_accuracy": 0.7799156665802002, "num_tokens": 39994136.0, "step": 3870 }, { "entropy": 0.7078857183456421, "epoch": 0.03104, "grad_norm": 1.7492754459381104, "learning_rate": 4.846778711484594e-05, "loss": 0.7299, "mean_token_accuracy": 0.7900642514228821, "num_tokens": 40087267.0, "step": 3880 }, { "entropy": 0.727300626039505, "epoch": 0.03112, "grad_norm": 2.88659930229187, "learning_rate": 4.846378551420568e-05, "loss": 0.7178, "mean_token_accuracy": 0.7863938927650451, "num_tokens": 40215538.0, "step": 3890 }, { "entropy": 0.7275132179260254, "epoch": 0.0312, "grad_norm": 5.512736797332764, "learning_rate": 4.845978391356543e-05, "loss": 0.7187, "mean_token_accuracy": 0.8095134019851684, "num_tokens": 40248302.0, "step": 3900 }, { "entropy": 0.6612066507339478, "epoch": 0.03128, "grad_norm": 2.621650457382202, "learning_rate": 4.845578231292518e-05, "loss": 0.6714, "mean_token_accuracy": 0.7884220838546753, "num_tokens": 40412142.0, "step": 3910 }, { "entropy": 0.6723111093044281, "epoch": 0.03136, "grad_norm": 4.428321361541748, "learning_rate": 4.8451780712284914e-05, "loss": 0.6668, "mean_token_accuracy": 0.8051137566566468, "num_tokens": 40505973.0, "step": 3920 }, { "entropy": 0.7534540712833404, "epoch": 0.03144, "grad_norm": 2.218877077102661, "learning_rate": 4.844777911164466e-05, "loss": 0.754, "mean_token_accuracy": 0.7886212885379791, "num_tokens": 40600976.0, "step": 3930 }, { "entropy": 0.760883903503418, "epoch": 0.03152, "grad_norm": 2.7209694385528564, "learning_rate": 4.844377751100441e-05, "loss": 0.761, "mean_token_accuracy": 0.77606680393219, "num_tokens": 40737266.0, "step": 3940 }, { "entropy": 0.6924985647201538, "epoch": 0.0316, "grad_norm": 7.654540061950684, "learning_rate": 4.843977591036415e-05, "loss": 0.6935, "mean_token_accuracy": 0.8169920921325684, "num_tokens": 40776094.0, "step": 3950 }, { "entropy": 0.7011850774288177, "epoch": 0.03168, "grad_norm": 1.803519368171692, "learning_rate": 4.843577430972389e-05, "loss": 0.6961, "mean_token_accuracy": 0.7825781583786011, "num_tokens": 40939934.0, "step": 3960 }, { "entropy": 0.7451089143753051, "epoch": 0.03176, "grad_norm": 5.1460113525390625, "learning_rate": 4.843177270908363e-05, "loss": 0.7398, "mean_token_accuracy": 0.7914555907249451, "num_tokens": 41020407.0, "step": 3970 }, { "entropy": 0.7579535603523254, "epoch": 0.03184, "grad_norm": 1.913365125656128, "learning_rate": 4.842777110844338e-05, "loss": 0.7508, "mean_token_accuracy": 0.7894254803657532, "num_tokens": 41114145.0, "step": 3980 }, { "entropy": 0.6519062966108322, "epoch": 0.03192, "grad_norm": 3.785051107406616, "learning_rate": 4.842376950780313e-05, "loss": 0.6535, "mean_token_accuracy": 0.8003473818302155, "num_tokens": 41238196.0, "step": 3990 }, { "entropy": 0.7729546219110489, "epoch": 0.032, "grad_norm": 6.883191108703613, "learning_rate": 4.8419767907162864e-05, "loss": 0.7753, "mean_token_accuracy": 0.797053724527359, "num_tokens": 41273383.0, "step": 4000 }, { "entropy": 0.7560874938964843, "epoch": 0.03208, "grad_norm": 2.3520290851593018, "learning_rate": 4.8415766306522614e-05, "loss": 0.7513, "mean_token_accuracy": 0.7724385797977448, "num_tokens": 41436760.0, "step": 4010 }, { "entropy": 0.7428998827934266, "epoch": 0.03216, "grad_norm": 3.8789923191070557, "learning_rate": 4.841176470588236e-05, "loss": 0.7338, "mean_token_accuracy": 0.7942632615566254, "num_tokens": 41508812.0, "step": 4020 }, { "entropy": 0.7295500576496124, "epoch": 0.03224, "grad_norm": 1.7021645307540894, "learning_rate": 4.84077631052421e-05, "loss": 0.728, "mean_token_accuracy": 0.796038806438446, "num_tokens": 41602170.0, "step": 4030 }, { "entropy": 0.7352355241775512, "epoch": 0.03232, "grad_norm": 3.560145854949951, "learning_rate": 4.840376150460184e-05, "loss": 0.7319, "mean_token_accuracy": 0.7773521304130554, "num_tokens": 41746628.0, "step": 4040 }, { "entropy": 0.6646139353513718, "epoch": 0.0324, "grad_norm": 5.270575046539307, "learning_rate": 4.839975990396159e-05, "loss": 0.6611, "mean_token_accuracy": 0.8228362500667572, "num_tokens": 41787489.0, "step": 4050 }, { "entropy": 0.7477988481521607, "epoch": 0.03248, "grad_norm": 2.024467945098877, "learning_rate": 4.839575830332133e-05, "loss": 0.7564, "mean_token_accuracy": 0.7740779280662536, "num_tokens": 41951329.0, "step": 4060 }, { "entropy": 0.757029938697815, "epoch": 0.03256, "grad_norm": 5.721101760864258, "learning_rate": 4.839175670268108e-05, "loss": 0.7496, "mean_token_accuracy": 0.7857408463954926, "num_tokens": 42034766.0, "step": 4070 }, { "entropy": 0.7335658729076385, "epoch": 0.03264, "grad_norm": 1.5997384786605835, "learning_rate": 4.8387755102040814e-05, "loss": 0.7398, "mean_token_accuracy": 0.7902748763561249, "num_tokens": 42127716.0, "step": 4080 }, { "entropy": 0.7505945801734925, "epoch": 0.03272, "grad_norm": 2.5477771759033203, "learning_rate": 4.8383753501400564e-05, "loss": 0.7388, "mean_token_accuracy": 0.7763095080852509, "num_tokens": 42263581.0, "step": 4090 }, { "entropy": 0.6538977682590484, "epoch": 0.0328, "grad_norm": 5.627225399017334, "learning_rate": 4.837975190076031e-05, "loss": 0.6583, "mean_token_accuracy": 0.8211481034755707, "num_tokens": 42298139.0, "step": 4100 }, { "entropy": 0.6750572323799133, "epoch": 0.03288, "grad_norm": 2.171431541442871, "learning_rate": 4.837575030012005e-05, "loss": 0.6867, "mean_token_accuracy": 0.783176600933075, "num_tokens": 42461979.0, "step": 4110 }, { "entropy": 0.6870258986949921, "epoch": 0.03296, "grad_norm": 4.570237159729004, "learning_rate": 4.8371748699479795e-05, "loss": 0.6797, "mean_token_accuracy": 0.7967105746269226, "num_tokens": 42567730.0, "step": 4120 }, { "entropy": 0.7000396072864532, "epoch": 0.03304, "grad_norm": 1.9192873239517212, "learning_rate": 4.836774709883954e-05, "loss": 0.7129, "mean_token_accuracy": 0.7963673710823059, "num_tokens": 42663165.0, "step": 4130 }, { "entropy": 0.6908801525831223, "epoch": 0.03312, "grad_norm": 3.2977120876312256, "learning_rate": 4.836374549819928e-05, "loss": 0.6855, "mean_token_accuracy": 0.7911885201930999, "num_tokens": 42791087.0, "step": 4140 }, { "entropy": 0.7198667764663697, "epoch": 0.0332, "grad_norm": 6.187575340270996, "learning_rate": 4.8359743897559026e-05, "loss": 0.7114, "mean_token_accuracy": 0.8115277171134949, "num_tokens": 42829935.0, "step": 4150 }, { "entropy": 0.6988134503364563, "epoch": 0.03328, "grad_norm": 2.1023550033569336, "learning_rate": 4.835574229691877e-05, "loss": 0.7039, "mean_token_accuracy": 0.7810164332389832, "num_tokens": 42991121.0, "step": 4160 }, { "entropy": 0.6791294038295745, "epoch": 0.03336, "grad_norm": 3.3575382232666016, "learning_rate": 4.8351740696278514e-05, "loss": 0.6719, "mean_token_accuracy": 0.8090853273868561, "num_tokens": 43061644.0, "step": 4170 }, { "entropy": 0.6739405035972595, "epoch": 0.03344, "grad_norm": 2.1269609928131104, "learning_rate": 4.834773909563826e-05, "loss": 0.6781, "mean_token_accuracy": 0.803870564699173, "num_tokens": 43153633.0, "step": 4180 }, { "entropy": 0.723643833398819, "epoch": 0.03352, "grad_norm": 3.4855921268463135, "learning_rate": 4.8343737494998e-05, "loss": 0.7167, "mean_token_accuracy": 0.7841819107532502, "num_tokens": 43294349.0, "step": 4190 }, { "entropy": 0.7604012846946716, "epoch": 0.0336, "grad_norm": 7.27557897567749, "learning_rate": 4.8339735894357745e-05, "loss": 0.7499, "mean_token_accuracy": 0.8028838396072387, "num_tokens": 43333982.0, "step": 4200 }, { "entropy": 0.7050741910934448, "epoch": 0.03368, "grad_norm": 2.6518402099609375, "learning_rate": 4.833573429371749e-05, "loss": 0.7101, "mean_token_accuracy": 0.7771616995334625, "num_tokens": 43497822.0, "step": 4210 }, { "entropy": 0.7528229594230652, "epoch": 0.03376, "grad_norm": 3.4105799198150635, "learning_rate": 4.833173269307723e-05, "loss": 0.7516, "mean_token_accuracy": 0.7842597544193268, "num_tokens": 43599569.0, "step": 4220 }, { "entropy": 0.7547823965549469, "epoch": 0.03384, "grad_norm": 1.9741036891937256, "learning_rate": 4.8327731092436976e-05, "loss": 0.753, "mean_token_accuracy": 0.7871851980686188, "num_tokens": 43694361.0, "step": 4230 }, { "entropy": 0.7251629829406738, "epoch": 0.03392, "grad_norm": 2.573793888092041, "learning_rate": 4.832372949179672e-05, "loss": 0.7314, "mean_token_accuracy": 0.7754022955894471, "num_tokens": 43838861.0, "step": 4240 }, { "entropy": 0.6809614956378937, "epoch": 0.034, "grad_norm": 7.2496418952941895, "learning_rate": 4.8319727891156464e-05, "loss": 0.6672, "mean_token_accuracy": 0.8237184882164001, "num_tokens": 43878289.0, "step": 4250 }, { "entropy": 0.7209748268127442, "epoch": 0.03408, "grad_norm": 2.2246737480163574, "learning_rate": 4.8315726290516214e-05, "loss": 0.726, "mean_token_accuracy": 0.77633615732193, "num_tokens": 44040795.0, "step": 4260 }, { "entropy": 0.705244642496109, "epoch": 0.03416, "grad_norm": 3.8288378715515137, "learning_rate": 4.831172468987595e-05, "loss": 0.6995, "mean_token_accuracy": 0.799130380153656, "num_tokens": 44119607.0, "step": 4270 }, { "entropy": 0.7947374522686005, "epoch": 0.03424, "grad_norm": 1.8025362491607666, "learning_rate": 4.8307723089235695e-05, "loss": 0.7914, "mean_token_accuracy": 0.7784997463226319, "num_tokens": 44214124.0, "step": 4280 }, { "entropy": 0.7105306148529053, "epoch": 0.03432, "grad_norm": 2.561469078063965, "learning_rate": 4.830372148859544e-05, "loss": 0.711, "mean_token_accuracy": 0.7812679827213287, "num_tokens": 44358512.0, "step": 4290 }, { "entropy": 0.6754411488771439, "epoch": 0.0344, "grad_norm": 4.8965020179748535, "learning_rate": 4.829971988795519e-05, "loss": 0.6702, "mean_token_accuracy": 0.8178167581558228, "num_tokens": 44399480.0, "step": 4300 }, { "entropy": 0.7074183344841003, "epoch": 0.03448, "grad_norm": 2.7199134826660156, "learning_rate": 4.8295718287314926e-05, "loss": 0.7201, "mean_token_accuracy": 0.7789264798164368, "num_tokens": 44563320.0, "step": 4310 }, { "entropy": 0.7025749117136002, "epoch": 0.03456, "grad_norm": 3.949568510055542, "learning_rate": 4.829171668667467e-05, "loss": 0.6857, "mean_token_accuracy": 0.8057377219200135, "num_tokens": 44647555.0, "step": 4320 }, { "entropy": 0.6783758223056793, "epoch": 0.03464, "grad_norm": 2.736252546310425, "learning_rate": 4.828771508603442e-05, "loss": 0.6826, "mean_token_accuracy": 0.8041194200515747, "num_tokens": 44740568.0, "step": 4330 }, { "entropy": 0.6868331491947174, "epoch": 0.03472, "grad_norm": 2.396848201751709, "learning_rate": 4.8283713485394164e-05, "loss": 0.6849, "mean_token_accuracy": 0.7902329981327056, "num_tokens": 44879260.0, "step": 4340 }, { "entropy": 0.7014800488948822, "epoch": 0.0348, "grad_norm": 4.980861663818359, "learning_rate": 4.82797118847539e-05, "loss": 0.7, "mean_token_accuracy": 0.8115508854389191, "num_tokens": 44920130.0, "step": 4350 }, { "entropy": 0.6678951323032379, "epoch": 0.03488, "grad_norm": 1.786534070968628, "learning_rate": 4.8275710284113644e-05, "loss": 0.6721, "mean_token_accuracy": 0.788837319612503, "num_tokens": 45083970.0, "step": 4360 }, { "entropy": 0.6756253361701965, "epoch": 0.03496, "grad_norm": 3.4787232875823975, "learning_rate": 4.8271708683473395e-05, "loss": 0.6708, "mean_token_accuracy": 0.8072477161884308, "num_tokens": 45177676.0, "step": 4370 }, { "entropy": 0.674566650390625, "epoch": 0.03504, "grad_norm": 2.5119190216064453, "learning_rate": 4.826770708283314e-05, "loss": 0.6615, "mean_token_accuracy": 0.8039086818695068, "num_tokens": 45271093.0, "step": 4380 }, { "entropy": 0.7185208559036255, "epoch": 0.03512, "grad_norm": 2.782975196838379, "learning_rate": 4.8263705482192876e-05, "loss": 0.7153, "mean_token_accuracy": 0.7852619528770447, "num_tokens": 45411664.0, "step": 4390 }, { "entropy": 0.5964642763137817, "epoch": 0.0352, "grad_norm": 6.003465175628662, "learning_rate": 4.8259703881552626e-05, "loss": 0.6002, "mean_token_accuracy": 0.8333630979061126, "num_tokens": 45450148.0, "step": 4400 }, { "entropy": 0.6767998158931732, "epoch": 0.03528, "grad_norm": 2.1037514209747314, "learning_rate": 4.825570228091237e-05, "loss": 0.6802, "mean_token_accuracy": 0.7850329875946045, "num_tokens": 45613988.0, "step": 4410 }, { "entropy": 0.8024369001388549, "epoch": 0.03536, "grad_norm": 4.441103458404541, "learning_rate": 4.8251700680272114e-05, "loss": 0.8006, "mean_token_accuracy": 0.776738840341568, "num_tokens": 45709516.0, "step": 4420 }, { "entropy": 0.7283270180225372, "epoch": 0.03544, "grad_norm": 1.8646516799926758, "learning_rate": 4.824769907963185e-05, "loss": 0.7302, "mean_token_accuracy": 0.796229487657547, "num_tokens": 45803071.0, "step": 4430 }, { "entropy": 0.6535360932350158, "epoch": 0.03552, "grad_norm": 2.634823799133301, "learning_rate": 4.82436974789916e-05, "loss": 0.6553, "mean_token_accuracy": 0.8016288220882416, "num_tokens": 45914857.0, "step": 4440 }, { "entropy": 0.7039425611495972, "epoch": 0.0356, "grad_norm": 6.097160339355469, "learning_rate": 4.8239695878351345e-05, "loss": 0.6834, "mean_token_accuracy": 0.8163934528827668, "num_tokens": 45944484.0, "step": 4450 }, { "entropy": 0.6736361086368561, "epoch": 0.03568, "grad_norm": 5.742124557495117, "learning_rate": 4.823569427771109e-05, "loss": 0.6702, "mean_token_accuracy": 0.7889838874340057, "num_tokens": 46108324.0, "step": 4460 }, { "entropy": 0.6488422453403473, "epoch": 0.03576, "grad_norm": 4.090909004211426, "learning_rate": 4.823169267707083e-05, "loss": 0.6581, "mean_token_accuracy": 0.8091081261634827, "num_tokens": 46186708.0, "step": 4470 }, { "entropy": 0.7735714733600616, "epoch": 0.03584, "grad_norm": 1.8699606657028198, "learning_rate": 4.8227691076430576e-05, "loss": 0.7639, "mean_token_accuracy": 0.7853198111057281, "num_tokens": 46281468.0, "step": 4480 }, { "entropy": 0.7051559507846832, "epoch": 0.03592, "grad_norm": 3.28519344329834, "learning_rate": 4.822368947579032e-05, "loss": 0.7185, "mean_token_accuracy": 0.7809759974479675, "num_tokens": 46430824.0, "step": 4490 }, { "entropy": 0.6643428564071655, "epoch": 0.036, "grad_norm": 6.997048377990723, "learning_rate": 4.821968787515006e-05, "loss": 0.6539, "mean_token_accuracy": 0.8224227130413055, "num_tokens": 46469904.0, "step": 4500 }, { "entropy": 0.6474813461303711, "epoch": 0.03608, "grad_norm": 1.9997632503509521, "learning_rate": 4.821568627450981e-05, "loss": 0.6504, "mean_token_accuracy": 0.7927515864372253, "num_tokens": 46633744.0, "step": 4510 }, { "entropy": 0.7283110558986664, "epoch": 0.03616, "grad_norm": 2.9958043098449707, "learning_rate": 4.821168467386955e-05, "loss": 0.7404, "mean_token_accuracy": 0.7891905963420868, "num_tokens": 46725347.0, "step": 4520 }, { "entropy": 0.7485415399074554, "epoch": 0.03624, "grad_norm": 1.9156650304794312, "learning_rate": 4.8207683073229294e-05, "loss": 0.7367, "mean_token_accuracy": 0.7906634330749511, "num_tokens": 46819624.0, "step": 4530 }, { "entropy": 0.6946033298969269, "epoch": 0.03632, "grad_norm": 3.018332004547119, "learning_rate": 4.820368147258904e-05, "loss": 0.6894, "mean_token_accuracy": 0.7885665714740753, "num_tokens": 46961580.0, "step": 4540 }, { "entropy": 0.6823936581611634, "epoch": 0.0364, "grad_norm": 5.8464884757995605, "learning_rate": 4.819967987194878e-05, "loss": 0.6976, "mean_token_accuracy": 0.8147928476333618, "num_tokens": 46999460.0, "step": 4550 }, { "entropy": 0.6690657556056976, "epoch": 0.03648, "grad_norm": 1.7365763187408447, "learning_rate": 4.8195678271308525e-05, "loss": 0.6685, "mean_token_accuracy": 0.7922437191009521, "num_tokens": 47163066.0, "step": 4560 }, { "entropy": 0.7214426249265671, "epoch": 0.03656, "grad_norm": 4.218018054962158, "learning_rate": 4.819167667066827e-05, "loss": 0.7177, "mean_token_accuracy": 0.8043698966503143, "num_tokens": 47235879.0, "step": 4570 }, { "entropy": 0.7800520479679107, "epoch": 0.03664, "grad_norm": 2.1298863887786865, "learning_rate": 4.818767507002801e-05, "loss": 0.7869, "mean_token_accuracy": 0.7830573737621307, "num_tokens": 47328985.0, "step": 4580 }, { "entropy": 0.7989256918430329, "epoch": 0.03672, "grad_norm": 3.202766180038452, "learning_rate": 4.818367346938776e-05, "loss": 0.7838, "mean_token_accuracy": 0.7685755074024201, "num_tokens": 47469960.0, "step": 4590 }, { "entropy": 0.6794513881206512, "epoch": 0.0368, "grad_norm": 5.595240116119385, "learning_rate": 4.81796718687475e-05, "loss": 0.6789, "mean_token_accuracy": 0.8154929757118226, "num_tokens": 47510770.0, "step": 4600 }, { "entropy": 0.7068628013134003, "epoch": 0.03688, "grad_norm": 2.1805291175842285, "learning_rate": 4.8175670268107244e-05, "loss": 0.7141, "mean_token_accuracy": 0.7769724130630493, "num_tokens": 47674610.0, "step": 4610 }, { "entropy": 0.6588216036558151, "epoch": 0.03696, "grad_norm": 3.44203519821167, "learning_rate": 4.817166866746699e-05, "loss": 0.6553, "mean_token_accuracy": 0.8137295186519623, "num_tokens": 47755708.0, "step": 4620 }, { "entropy": 0.7147222816944122, "epoch": 0.03704, "grad_norm": 2.7118098735809326, "learning_rate": 4.816766706682673e-05, "loss": 0.7271, "mean_token_accuracy": 0.797221964597702, "num_tokens": 47849586.0, "step": 4630 }, { "entropy": 0.7299727499485016, "epoch": 0.03712, "grad_norm": 2.6941349506378174, "learning_rate": 4.8163665466186475e-05, "loss": 0.7267, "mean_token_accuracy": 0.778790819644928, "num_tokens": 47994231.0, "step": 4640 }, { "entropy": 0.6290138840675354, "epoch": 0.0372, "grad_norm": 6.009731769561768, "learning_rate": 4.8159663865546226e-05, "loss": 0.6119, "mean_token_accuracy": 0.8267909169197083, "num_tokens": 48038738.0, "step": 4650 }, { "entropy": 0.6618703007698059, "epoch": 0.03728, "grad_norm": 1.861856460571289, "learning_rate": 4.815566226490596e-05, "loss": 0.6689, "mean_token_accuracy": 0.7893316924571991, "num_tokens": 48202492.0, "step": 4660 }, { "entropy": 0.5738766103982925, "epoch": 0.03736, "grad_norm": 3.881251335144043, "learning_rate": 4.8151660664265706e-05, "loss": 0.5769, "mean_token_accuracy": 0.8284800469875335, "num_tokens": 48283636.0, "step": 4670 }, { "entropy": 0.7025204062461853, "epoch": 0.03744, "grad_norm": 3.060230016708374, "learning_rate": 4.814765906362545e-05, "loss": 0.7039, "mean_token_accuracy": 0.7952620327472687, "num_tokens": 48378176.0, "step": 4680 }, { "entropy": 0.7900924503803253, "epoch": 0.03752, "grad_norm": 2.330845832824707, "learning_rate": 4.81436574629852e-05, "loss": 0.7824, "mean_token_accuracy": 0.7624780595302582, "num_tokens": 48536350.0, "step": 4690 }, { "entropy": 0.6602253139019012, "epoch": 0.0376, "grad_norm": 6.082082271575928, "learning_rate": 4.813965586234494e-05, "loss": 0.6717, "mean_token_accuracy": 0.8185149967670441, "num_tokens": 48580190.0, "step": 4700 }, { "entropy": 0.6668957054615021, "epoch": 0.03768, "grad_norm": 2.1078858375549316, "learning_rate": 4.813565426170468e-05, "loss": 0.6645, "mean_token_accuracy": 0.787756472826004, "num_tokens": 48744030.0, "step": 4710 }, { "entropy": 0.6659129023551941, "epoch": 0.03776, "grad_norm": 4.908674716949463, "learning_rate": 4.813165266106443e-05, "loss": 0.6691, "mean_token_accuracy": 0.8062359929084778, "num_tokens": 48830618.0, "step": 4720 }, { "entropy": 0.7534015774726868, "epoch": 0.03784, "grad_norm": 2.9099864959716797, "learning_rate": 4.8127651060424175e-05, "loss": 0.7526, "mean_token_accuracy": 0.7845961570739746, "num_tokens": 48925036.0, "step": 4730 }, { "entropy": 0.7545804917812348, "epoch": 0.03792, "grad_norm": 3.8165969848632812, "learning_rate": 4.812364945978391e-05, "loss": 0.745, "mean_token_accuracy": 0.7769435763359069, "num_tokens": 49071398.0, "step": 4740 }, { "entropy": 0.7477825194597244, "epoch": 0.038, "grad_norm": 8.170304298400879, "learning_rate": 4.8119647859143656e-05, "loss": 0.7404, "mean_token_accuracy": 0.8007571458816528, "num_tokens": 49109903.0, "step": 4750 }, { "entropy": 0.6792748272418976, "epoch": 0.03808, "grad_norm": 1.9548534154891968, "learning_rate": 4.8115646258503407e-05, "loss": 0.6824, "mean_token_accuracy": 0.785931134223938, "num_tokens": 49273052.0, "step": 4760 }, { "entropy": 0.6246152102947236, "epoch": 0.03816, "grad_norm": 3.428365707397461, "learning_rate": 4.811164465786315e-05, "loss": 0.6149, "mean_token_accuracy": 0.8182383477687836, "num_tokens": 49360454.0, "step": 4770 }, { "entropy": 0.7190926134586334, "epoch": 0.03824, "grad_norm": 1.724233627319336, "learning_rate": 4.810764305722289e-05, "loss": 0.7312, "mean_token_accuracy": 0.7939857065677642, "num_tokens": 49453385.0, "step": 4780 }, { "entropy": 0.7187237739562988, "epoch": 0.03832, "grad_norm": 3.537163019180298, "learning_rate": 4.810364145658264e-05, "loss": 0.7097, "mean_token_accuracy": 0.7843483090400696, "num_tokens": 49597102.0, "step": 4790 }, { "entropy": 0.6479239463806152, "epoch": 0.0384, "grad_norm": 7.896326065063477, "learning_rate": 4.809963985594238e-05, "loss": 0.6636, "mean_token_accuracy": 0.8223153710365295, "num_tokens": 49639300.0, "step": 4800 }, { "entropy": 0.6836530864238739, "epoch": 0.03848, "grad_norm": 1.8153437376022339, "learning_rate": 4.8095638255302125e-05, "loss": 0.6833, "mean_token_accuracy": 0.7845933198928833, "num_tokens": 49803140.0, "step": 4810 }, { "entropy": 0.7308858871459961, "epoch": 0.03856, "grad_norm": 3.1822288036346436, "learning_rate": 4.809163665466186e-05, "loss": 0.7255, "mean_token_accuracy": 0.7914009153842926, "num_tokens": 49891994.0, "step": 4820 }, { "entropy": 0.8129294335842132, "epoch": 0.03864, "grad_norm": 2.476325273513794, "learning_rate": 4.808763505402161e-05, "loss": 0.8227, "mean_token_accuracy": 0.7728870511054993, "num_tokens": 49987070.0, "step": 4830 }, { "entropy": 0.760396808385849, "epoch": 0.03872, "grad_norm": 2.9942209720611572, "learning_rate": 4.8083633453381356e-05, "loss": 0.7462, "mean_token_accuracy": 0.7763717949390412, "num_tokens": 50125735.0, "step": 4840 }, { "entropy": 0.6375310391187667, "epoch": 0.0388, "grad_norm": 5.826013565063477, "learning_rate": 4.80796318527411e-05, "loss": 0.6286, "mean_token_accuracy": 0.8284865975379944, "num_tokens": 50165195.0, "step": 4850 }, { "entropy": 0.6523686110973358, "epoch": 0.03888, "grad_norm": 2.5750935077667236, "learning_rate": 4.8075630252100844e-05, "loss": 0.6587, "mean_token_accuracy": 0.796226191520691, "num_tokens": 50329035.0, "step": 4860 }, { "entropy": 0.6947959065437317, "epoch": 0.03896, "grad_norm": 4.130143165588379, "learning_rate": 4.807162865146059e-05, "loss": 0.6931, "mean_token_accuracy": 0.7990310490131378, "num_tokens": 50426270.0, "step": 4870 }, { "entropy": 0.7425068199634552, "epoch": 0.03904, "grad_norm": 2.3629627227783203, "learning_rate": 4.806762705082033e-05, "loss": 0.7566, "mean_token_accuracy": 0.7902230679988861, "num_tokens": 50521265.0, "step": 4880 }, { "entropy": 0.733547306060791, "epoch": 0.03912, "grad_norm": 2.8247668743133545, "learning_rate": 4.8063625450180075e-05, "loss": 0.7287, "mean_token_accuracy": 0.7806491851806641, "num_tokens": 50655071.0, "step": 4890 }, { "entropy": 0.8036587059497833, "epoch": 0.0392, "grad_norm": 7.1645426750183105, "learning_rate": 4.805962384953982e-05, "loss": 0.7887, "mean_token_accuracy": 0.7940803170204163, "num_tokens": 50691124.0, "step": 4900 }, { "entropy": 0.7113596975803376, "epoch": 0.03928, "grad_norm": 1.6861546039581299, "learning_rate": 4.805562224889956e-05, "loss": 0.7134, "mean_token_accuracy": 0.7802210628986359, "num_tokens": 50854964.0, "step": 4910 }, { "entropy": 0.6617418527603149, "epoch": 0.03936, "grad_norm": 4.042732238769531, "learning_rate": 4.8051620648259306e-05, "loss": 0.6577, "mean_token_accuracy": 0.8053695559501648, "num_tokens": 50948434.0, "step": 4920 }, { "entropy": 0.6786246240139008, "epoch": 0.03944, "grad_norm": 2.597411632537842, "learning_rate": 4.804761904761905e-05, "loss": 0.6825, "mean_token_accuracy": 0.8017117619514466, "num_tokens": 51043449.0, "step": 4930 }, { "entropy": 0.6755678713321686, "epoch": 0.03952, "grad_norm": 2.960108757019043, "learning_rate": 4.804361744697879e-05, "loss": 0.6696, "mean_token_accuracy": 0.7943134248256684, "num_tokens": 51189043.0, "step": 4940 }, { "entropy": 0.7050810605287552, "epoch": 0.0396, "grad_norm": 4.861456871032715, "learning_rate": 4.803961584633854e-05, "loss": 0.697, "mean_token_accuracy": 0.8103299260139465, "num_tokens": 51234857.0, "step": 4950 }, { "entropy": 0.7358559042215347, "epoch": 0.03968, "grad_norm": 2.393392562866211, "learning_rate": 4.803561424569828e-05, "loss": 0.7365, "mean_token_accuracy": 0.7783062160015106, "num_tokens": 51396203.0, "step": 4960 }, { "entropy": 0.7001936197280884, "epoch": 0.03976, "grad_norm": 3.2671849727630615, "learning_rate": 4.8031612645058025e-05, "loss": 0.691, "mean_token_accuracy": 0.804521256685257, "num_tokens": 51468184.0, "step": 4970 }, { "entropy": 0.7745805561542511, "epoch": 0.03984, "grad_norm": 2.1836278438568115, "learning_rate": 4.802761104441777e-05, "loss": 0.7943, "mean_token_accuracy": 0.778523737192154, "num_tokens": 51562402.0, "step": 4980 }, { "entropy": 0.7546119332313538, "epoch": 0.03992, "grad_norm": 2.9069855213165283, "learning_rate": 4.802360944377751e-05, "loss": 0.7515, "mean_token_accuracy": 0.7731499969959259, "num_tokens": 51694676.0, "step": 4990 }, { "entropy": 0.6496682941913605, "epoch": 0.04, "grad_norm": 6.467473983764648, "learning_rate": 4.801960784313726e-05, "loss": 0.6622, "mean_token_accuracy": 0.8228402972221375, "num_tokens": 51732326.0, "step": 5000 }, { "entropy": 0.7130129218101502, "epoch": 0.04008, "grad_norm": 1.6345436573028564, "learning_rate": 4.8015606242497e-05, "loss": 0.7161, "mean_token_accuracy": 0.7785465836524963, "num_tokens": 51896159.0, "step": 5010 }, { "entropy": 0.692997682094574, "epoch": 0.04016, "grad_norm": 3.1146421432495117, "learning_rate": 4.801160464185674e-05, "loss": 0.68, "mean_token_accuracy": 0.8053612291812897, "num_tokens": 51983431.0, "step": 5020 }, { "entropy": 0.7284348666667938, "epoch": 0.04024, "grad_norm": 2.49003529548645, "learning_rate": 4.800760304121649e-05, "loss": 0.7366, "mean_token_accuracy": 0.7906265377998352, "num_tokens": 52076597.0, "step": 5030 }, { "entropy": 0.7132462739944458, "epoch": 0.04032, "grad_norm": 2.620363712310791, "learning_rate": 4.800360144057624e-05, "loss": 0.7091, "mean_token_accuracy": 0.7854506134986877, "num_tokens": 52212478.0, "step": 5040 }, { "entropy": 0.6811992079019547, "epoch": 0.0404, "grad_norm": 7.278005123138428, "learning_rate": 4.7999599839935974e-05, "loss": 0.6586, "mean_token_accuracy": 0.8241325855255127, "num_tokens": 52249347.0, "step": 5050 }, { "entropy": 0.7310978055000306, "epoch": 0.04048, "grad_norm": 1.9576667547225952, "learning_rate": 4.799559823929572e-05, "loss": 0.7413, "mean_token_accuracy": 0.7700537323951722, "num_tokens": 52413187.0, "step": 5060 }, { "entropy": 0.704056191444397, "epoch": 0.04056, "grad_norm": 3.530623197555542, "learning_rate": 4.799159663865547e-05, "loss": 0.6972, "mean_token_accuracy": 0.7981068551540375, "num_tokens": 52513920.0, "step": 5070 }, { "entropy": 0.719338321685791, "epoch": 0.04064, "grad_norm": 2.1747210025787354, "learning_rate": 4.798759503801521e-05, "loss": 0.7477, "mean_token_accuracy": 0.7868665754795074, "num_tokens": 52610083.0, "step": 5080 }, { "entropy": 0.7136290073394775, "epoch": 0.04072, "grad_norm": 3.1477231979370117, "learning_rate": 4.798359343737495e-05, "loss": 0.703, "mean_token_accuracy": 0.7866449236869812, "num_tokens": 52745689.0, "step": 5090 }, { "entropy": 0.7336921870708466, "epoch": 0.0408, "grad_norm": 5.825625896453857, "learning_rate": 4.797959183673469e-05, "loss": 0.7302, "mean_token_accuracy": 0.8068737685680389, "num_tokens": 52783379.0, "step": 5100 }, { "entropy": 0.7015091180801392, "epoch": 0.04088, "grad_norm": 1.809576153755188, "learning_rate": 4.797559023609444e-05, "loss": 0.7081, "mean_token_accuracy": 0.7779910087585449, "num_tokens": 52947023.0, "step": 5110 }, { "entropy": 0.7506760478019714, "epoch": 0.04096, "grad_norm": 3.648249626159668, "learning_rate": 4.797158863545419e-05, "loss": 0.7339, "mean_token_accuracy": 0.7933019399642944, "num_tokens": 53020847.0, "step": 5120 }, { "entropy": 0.7648399591445922, "epoch": 0.04104, "grad_norm": 2.229670763015747, "learning_rate": 4.7967587034813924e-05, "loss": 0.7839, "mean_token_accuracy": 0.78153036236763, "num_tokens": 53113542.0, "step": 5130 }, { "entropy": 0.7571798264980316, "epoch": 0.04112, "grad_norm": 3.171285390853882, "learning_rate": 4.796358543417367e-05, "loss": 0.7527, "mean_token_accuracy": 0.7738554000854492, "num_tokens": 53255023.0, "step": 5140 }, { "entropy": 0.7452671587467193, "epoch": 0.0412, "grad_norm": 7.154531478881836, "learning_rate": 4.795958383353342e-05, "loss": 0.7376, "mean_token_accuracy": 0.7964910387992858, "num_tokens": 53294137.0, "step": 5150 }, { "entropy": 0.6922197461128234, "epoch": 0.04128, "grad_norm": 2.705259323120117, "learning_rate": 4.795558223289316e-05, "loss": 0.697, "mean_token_accuracy": 0.7870997369289399, "num_tokens": 53453835.0, "step": 5160 }, { "entropy": 0.7898511171340943, "epoch": 0.04136, "grad_norm": 3.678938865661621, "learning_rate": 4.79515806322529e-05, "loss": 0.7795, "mean_token_accuracy": 0.78216592669487, "num_tokens": 53525150.0, "step": 5170 }, { "entropy": 0.6734453797340393, "epoch": 0.04144, "grad_norm": 1.8744237422943115, "learning_rate": 4.794757903161265e-05, "loss": 0.6831, "mean_token_accuracy": 0.8048866808414459, "num_tokens": 53616780.0, "step": 5180 }, { "entropy": 0.698311197757721, "epoch": 0.04152, "grad_norm": 2.6434710025787354, "learning_rate": 4.794357743097239e-05, "loss": 0.686, "mean_token_accuracy": 0.791432774066925, "num_tokens": 53757352.0, "step": 5190 }, { "entropy": 0.6963941752910614, "epoch": 0.0416, "grad_norm": 5.189845561981201, "learning_rate": 4.793957583033214e-05, "loss": 0.7017, "mean_token_accuracy": 0.8066257953643798, "num_tokens": 53799753.0, "step": 5200 }, { "entropy": 0.7037115216255188, "epoch": 0.04168, "grad_norm": 3.3147482872009277, "learning_rate": 4.7935574229691874e-05, "loss": 0.705, "mean_token_accuracy": 0.7799340486526489, "num_tokens": 53963593.0, "step": 5210 }, { "entropy": 0.6602469205856323, "epoch": 0.04176, "grad_norm": 5.270522117614746, "learning_rate": 4.7931572629051624e-05, "loss": 0.6547, "mean_token_accuracy": 0.8078347742557526, "num_tokens": 54064328.0, "step": 5220 }, { "entropy": 0.7397026538848877, "epoch": 0.04184, "grad_norm": 2.158170223236084, "learning_rate": 4.792757102841137e-05, "loss": 0.7451, "mean_token_accuracy": 0.7849696040153503, "num_tokens": 54160216.0, "step": 5230 }, { "entropy": 0.7325901985168457, "epoch": 0.04192, "grad_norm": 3.2324862480163574, "learning_rate": 4.792356942777111e-05, "loss": 0.7242, "mean_token_accuracy": 0.7783321440219879, "num_tokens": 54303773.0, "step": 5240 }, { "entropy": 0.7160784602165222, "epoch": 0.042, "grad_norm": 5.491610050201416, "learning_rate": 4.7919567827130855e-05, "loss": 0.7298, "mean_token_accuracy": 0.8069950699806213, "num_tokens": 54343810.0, "step": 5250 }, { "entropy": 0.6586057662963867, "epoch": 0.04208, "grad_norm": 2.30183482170105, "learning_rate": 4.79155662264906e-05, "loss": 0.658, "mean_token_accuracy": 0.7893014192581177, "num_tokens": 54507650.0, "step": 5260 }, { "entropy": 0.7501360476016998, "epoch": 0.04216, "grad_norm": 3.9700093269348145, "learning_rate": 4.791156462585034e-05, "loss": 0.745, "mean_token_accuracy": 0.791434383392334, "num_tokens": 54599766.0, "step": 5270 }, { "entropy": 0.7485993504524231, "epoch": 0.04224, "grad_norm": 2.0106658935546875, "learning_rate": 4.7907563025210086e-05, "loss": 0.7641, "mean_token_accuracy": 0.7820076286792755, "num_tokens": 54694002.0, "step": 5280 }, { "entropy": 0.7504442155361175, "epoch": 0.04232, "grad_norm": 2.6643097400665283, "learning_rate": 4.790356142456983e-05, "loss": 0.7334, "mean_token_accuracy": 0.7756803095340729, "num_tokens": 54832595.0, "step": 5290 }, { "entropy": 0.7525924563407898, "epoch": 0.0424, "grad_norm": 5.930164337158203, "learning_rate": 4.7899559823929574e-05, "loss": 0.7585, "mean_token_accuracy": 0.8016638875007629, "num_tokens": 54867279.0, "step": 5300 }, { "entropy": 0.6984841644763946, "epoch": 0.04248, "grad_norm": 1.6568193435668945, "learning_rate": 4.789555822328932e-05, "loss": 0.701, "mean_token_accuracy": 0.7832132518291474, "num_tokens": 55031119.0, "step": 5310 }, { "entropy": 0.7007336109876633, "epoch": 0.04256, "grad_norm": 4.4345269203186035, "learning_rate": 4.789155662264906e-05, "loss": 0.6954, "mean_token_accuracy": 0.7987052023410797, "num_tokens": 55120567.0, "step": 5320 }, { "entropy": 0.726545649766922, "epoch": 0.04264, "grad_norm": 1.7034324407577515, "learning_rate": 4.7887555022008805e-05, "loss": 0.7197, "mean_token_accuracy": 0.7918780207633972, "num_tokens": 55212981.0, "step": 5330 }, { "entropy": 0.6744953334331513, "epoch": 0.04272, "grad_norm": 2.90922474861145, "learning_rate": 4.788355342136855e-05, "loss": 0.6759, "mean_token_accuracy": 0.7912065744400024, "num_tokens": 55353231.0, "step": 5340 }, { "entropy": 0.7527335494756698, "epoch": 0.0428, "grad_norm": 5.516479015350342, "learning_rate": 4.787955182072829e-05, "loss": 0.7518, "mean_token_accuracy": 0.806070750951767, "num_tokens": 55389851.0, "step": 5350 }, { "entropy": 0.7270814418792725, "epoch": 0.04288, "grad_norm": 2.0260982513427734, "learning_rate": 4.7875550220088036e-05, "loss": 0.7308, "mean_token_accuracy": 0.7768125653266906, "num_tokens": 55551163.0, "step": 5360 }, { "entropy": 0.6645094931125641, "epoch": 0.04296, "grad_norm": 4.440942287445068, "learning_rate": 4.787154861944778e-05, "loss": 0.652, "mean_token_accuracy": 0.8122438669204712, "num_tokens": 55618946.0, "step": 5370 }, { "entropy": 0.6987506926059723, "epoch": 0.04304, "grad_norm": 2.2744247913360596, "learning_rate": 4.7867547018807524e-05, "loss": 0.7103, "mean_token_accuracy": 0.7941207587718964, "num_tokens": 55710645.0, "step": 5380 }, { "entropy": 0.6917520880699157, "epoch": 0.04312, "grad_norm": 2.6507632732391357, "learning_rate": 4.7863545418167274e-05, "loss": 0.6905, "mean_token_accuracy": 0.7914841413497925, "num_tokens": 55837312.0, "step": 5390 }, { "entropy": 0.7417845129966736, "epoch": 0.0432, "grad_norm": 10.089064598083496, "learning_rate": 4.785954381752701e-05, "loss": 0.7176, "mean_token_accuracy": 0.8100192248821259, "num_tokens": 55872164.0, "step": 5400 }, { "entropy": 0.6928104221820831, "epoch": 0.04328, "grad_norm": 1.7243930101394653, "learning_rate": 4.7855542216886755e-05, "loss": 0.7, "mean_token_accuracy": 0.7795849919319153, "num_tokens": 56035653.0, "step": 5410 }, { "entropy": 0.6815612435340881, "epoch": 0.04336, "grad_norm": 4.081684112548828, "learning_rate": 4.78515406162465e-05, "loss": 0.6661, "mean_token_accuracy": 0.8086305141448975, "num_tokens": 56111152.0, "step": 5420 }, { "entropy": 0.7400597870349884, "epoch": 0.04344, "grad_norm": 2.269352436065674, "learning_rate": 4.784753901560625e-05, "loss": 0.7591, "mean_token_accuracy": 0.7871508717536926, "num_tokens": 56203535.0, "step": 5430 }, { "entropy": 0.748316478729248, "epoch": 0.04352, "grad_norm": 2.4651806354522705, "learning_rate": 4.7843537414965986e-05, "loss": 0.7531, "mean_token_accuracy": 0.7756399273872375, "num_tokens": 56340508.0, "step": 5440 }, { "entropy": 0.8099813163280487, "epoch": 0.0436, "grad_norm": 5.972700119018555, "learning_rate": 4.783953581432573e-05, "loss": 0.7854, "mean_token_accuracy": 0.7924055755138397, "num_tokens": 56376697.0, "step": 5450 }, { "entropy": 0.6598595440387726, "epoch": 0.04368, "grad_norm": 1.5870208740234375, "learning_rate": 4.783553421368548e-05, "loss": 0.6609, "mean_token_accuracy": 0.7910906255245209, "num_tokens": 56540537.0, "step": 5460 }, { "entropy": 0.6895890265703202, "epoch": 0.04376, "grad_norm": 4.083963394165039, "learning_rate": 4.7831532613045224e-05, "loss": 0.6873, "mean_token_accuracy": 0.8038787841796875, "num_tokens": 56624622.0, "step": 5470 }, { "entropy": 0.696142578125, "epoch": 0.04384, "grad_norm": 1.849357008934021, "learning_rate": 4.782753101240496e-05, "loss": 0.7201, "mean_token_accuracy": 0.796375185251236, "num_tokens": 56717282.0, "step": 5480 }, { "entropy": 0.7595447361469269, "epoch": 0.04392, "grad_norm": 3.520247459411621, "learning_rate": 4.7823529411764704e-05, "loss": 0.7565, "mean_token_accuracy": 0.7720610082149506, "num_tokens": 56858621.0, "step": 5490 }, { "entropy": 0.6652926325798034, "epoch": 0.044, "grad_norm": 5.5743842124938965, "learning_rate": 4.7819527811124455e-05, "loss": 0.6471, "mean_token_accuracy": 0.8215480268001556, "num_tokens": 56895513.0, "step": 5500 }, { "entropy": 0.7007119655609131, "epoch": 0.04408, "grad_norm": 1.9067409038543701, "learning_rate": 4.78155262104842e-05, "loss": 0.7055, "mean_token_accuracy": 0.7801418364048004, "num_tokens": 57059189.0, "step": 5510 }, { "entropy": 0.6839358597993851, "epoch": 0.04416, "grad_norm": 3.204331874847412, "learning_rate": 4.7811524609843936e-05, "loss": 0.681, "mean_token_accuracy": 0.8024631559848785, "num_tokens": 57149129.0, "step": 5520 }, { "entropy": 0.774282819032669, "epoch": 0.04424, "grad_norm": 2.4242193698883057, "learning_rate": 4.7807523009203686e-05, "loss": 0.7712, "mean_token_accuracy": 0.780720341205597, "num_tokens": 57243466.0, "step": 5530 }, { "entropy": 0.7518071234226227, "epoch": 0.04432, "grad_norm": 2.514758586883545, "learning_rate": 4.780352140856343e-05, "loss": 0.7468, "mean_token_accuracy": 0.7702369332313538, "num_tokens": 57387131.0, "step": 5540 }, { "entropy": 0.6591311126947403, "epoch": 0.0444, "grad_norm": 6.438097953796387, "learning_rate": 4.7799519807923173e-05, "loss": 0.6588, "mean_token_accuracy": 0.8219592869281769, "num_tokens": 57430741.0, "step": 5550 }, { "entropy": 0.6329597949981689, "epoch": 0.04448, "grad_norm": 2.0198493003845215, "learning_rate": 4.779551820728291e-05, "loss": 0.6362, "mean_token_accuracy": 0.7961111903190613, "num_tokens": 57594577.0, "step": 5560 }, { "entropy": 0.7296894609928131, "epoch": 0.04456, "grad_norm": 3.819321393966675, "learning_rate": 4.779151660664266e-05, "loss": 0.7287, "mean_token_accuracy": 0.793030059337616, "num_tokens": 57675856.0, "step": 5570 }, { "entropy": 0.7794949233531951, "epoch": 0.04464, "grad_norm": 1.8539412021636963, "learning_rate": 4.7787515006002405e-05, "loss": 0.7942, "mean_token_accuracy": 0.7808453261852264, "num_tokens": 57769030.0, "step": 5580 }, { "entropy": 0.7236875355243683, "epoch": 0.04472, "grad_norm": 3.1004059314727783, "learning_rate": 4.778351340536215e-05, "loss": 0.719, "mean_token_accuracy": 0.7831384837627411, "num_tokens": 57899759.0, "step": 5590 }, { "entropy": 0.7588581323623658, "epoch": 0.0448, "grad_norm": 5.536323070526123, "learning_rate": 4.777951180472189e-05, "loss": 0.7449, "mean_token_accuracy": 0.8084441483020782, "num_tokens": 57930040.0, "step": 5600 }, { "entropy": 0.7234276413917542, "epoch": 0.04488, "grad_norm": 1.5486987829208374, "learning_rate": 4.7775510204081636e-05, "loss": 0.724, "mean_token_accuracy": 0.7765384078025818, "num_tokens": 58093852.0, "step": 5610 }, { "entropy": 0.7167797207832336, "epoch": 0.04496, "grad_norm": 3.321321964263916, "learning_rate": 4.777150860344138e-05, "loss": 0.7079, "mean_token_accuracy": 0.7955447793006897, "num_tokens": 58189941.0, "step": 5620 }, { "entropy": 0.6834010303020477, "epoch": 0.04504, "grad_norm": 2.53576922416687, "learning_rate": 4.776750700280112e-05, "loss": 0.6998, "mean_token_accuracy": 0.7984066009521484, "num_tokens": 58284811.0, "step": 5630 }, { "entropy": 0.7111078202724457, "epoch": 0.04512, "grad_norm": 3.258430004119873, "learning_rate": 4.776350540216087e-05, "loss": 0.7033, "mean_token_accuracy": 0.7864590883255005, "num_tokens": 58424964.0, "step": 5640 }, { "entropy": 0.6768788546323776, "epoch": 0.0452, "grad_norm": 5.7018890380859375, "learning_rate": 4.775950380152061e-05, "loss": 0.6812, "mean_token_accuracy": 0.8172376275062561, "num_tokens": 58465248.0, "step": 5650 }, { "entropy": 0.6810993015766144, "epoch": 0.04528, "grad_norm": 1.9356552362442017, "learning_rate": 4.7755502200880354e-05, "loss": 0.6828, "mean_token_accuracy": 0.7855364263057709, "num_tokens": 58628278.0, "step": 5660 }, { "entropy": 0.7204315692186356, "epoch": 0.04536, "grad_norm": 3.7491800785064697, "learning_rate": 4.77515006002401e-05, "loss": 0.713, "mean_token_accuracy": 0.8018514752388001, "num_tokens": 58697804.0, "step": 5670 }, { "entropy": 0.6813587963581085, "epoch": 0.04544, "grad_norm": 2.146314859390259, "learning_rate": 4.774749899959984e-05, "loss": 0.6906, "mean_token_accuracy": 0.799734252691269, "num_tokens": 58792384.0, "step": 5680 }, { "entropy": 0.8080059349536896, "epoch": 0.04552, "grad_norm": 2.275578498840332, "learning_rate": 4.7743497398959585e-05, "loss": 0.7989, "mean_token_accuracy": 0.7638884782791138, "num_tokens": 58933845.0, "step": 5690 }, { "entropy": 0.7577962338924408, "epoch": 0.0456, "grad_norm": 5.551760673522949, "learning_rate": 4.773949579831933e-05, "loss": 0.7549, "mean_token_accuracy": 0.7987237870693207, "num_tokens": 58974165.0, "step": 5700 }, { "entropy": 0.7140808403491974, "epoch": 0.04568, "grad_norm": 2.752366304397583, "learning_rate": 4.773549419767907e-05, "loss": 0.7162, "mean_token_accuracy": 0.7785723090171814, "num_tokens": 59138005.0, "step": 5710 }, { "entropy": 0.7530312538146973, "epoch": 0.04576, "grad_norm": 3.671849489212036, "learning_rate": 4.7731492597038817e-05, "loss": 0.7458, "mean_token_accuracy": 0.7858708202838898, "num_tokens": 59235787.0, "step": 5720 }, { "entropy": 0.7180475473403931, "epoch": 0.04584, "grad_norm": 2.0311026573181152, "learning_rate": 4.772749099639856e-05, "loss": 0.7156, "mean_token_accuracy": 0.7972632467746734, "num_tokens": 59329184.0, "step": 5730 }, { "entropy": 0.6979123294353485, "epoch": 0.04592, "grad_norm": 3.8089687824249268, "learning_rate": 4.7723489395758304e-05, "loss": 0.6964, "mean_token_accuracy": 0.7927297830581665, "num_tokens": 59456905.0, "step": 5740 }, { "entropy": 0.744138377904892, "epoch": 0.046, "grad_norm": 4.816619873046875, "learning_rate": 4.771948779511805e-05, "loss": 0.75, "mean_token_accuracy": 0.8031122863292695, "num_tokens": 59493568.0, "step": 5750 }, { "entropy": 0.6795098304748535, "epoch": 0.04608, "grad_norm": 2.0757012367248535, "learning_rate": 4.771548619447779e-05, "loss": 0.6815, "mean_token_accuracy": 0.785832941532135, "num_tokens": 59657408.0, "step": 5760 }, { "entropy": 0.7325932085514069, "epoch": 0.04616, "grad_norm": 4.158412933349609, "learning_rate": 4.7711484593837535e-05, "loss": 0.7224, "mean_token_accuracy": 0.7971942543983459, "num_tokens": 59743815.0, "step": 5770 }, { "entropy": 0.7740574836730957, "epoch": 0.04624, "grad_norm": 1.6105356216430664, "learning_rate": 4.7707482993197286e-05, "loss": 0.769, "mean_token_accuracy": 0.7815605938434601, "num_tokens": 59837230.0, "step": 5780 }, { "entropy": 0.7124489188194275, "epoch": 0.04632, "grad_norm": 2.2128701210021973, "learning_rate": 4.770348139255702e-05, "loss": 0.7042, "mean_token_accuracy": 0.7843249022960663, "num_tokens": 59983927.0, "step": 5790 }, { "entropy": 0.7202678799629212, "epoch": 0.0464, "grad_norm": 6.653172969818115, "learning_rate": 4.7699479791916766e-05, "loss": 0.7217, "mean_token_accuracy": 0.8048225164413452, "num_tokens": 60031771.0, "step": 5800 }, { "entropy": 0.6962988138198852, "epoch": 0.04648, "grad_norm": 1.6483711004257202, "learning_rate": 4.769547819127651e-05, "loss": 0.6964, "mean_token_accuracy": 0.7850635051727295, "num_tokens": 60195611.0, "step": 5810 }, { "entropy": 0.7279796838760376, "epoch": 0.04656, "grad_norm": 3.525409460067749, "learning_rate": 4.769147659063626e-05, "loss": 0.7204, "mean_token_accuracy": 0.794351440668106, "num_tokens": 60285784.0, "step": 5820 }, { "entropy": 0.7211357414722442, "epoch": 0.04664, "grad_norm": 2.6057212352752686, "learning_rate": 4.7687474989996e-05, "loss": 0.7364, "mean_token_accuracy": 0.7930623114109039, "num_tokens": 60379017.0, "step": 5830 }, { "entropy": 0.767609179019928, "epoch": 0.04672, "grad_norm": 2.6500983238220215, "learning_rate": 4.768347338935574e-05, "loss": 0.764, "mean_token_accuracy": 0.7730633318424225, "num_tokens": 60519542.0, "step": 5840 }, { "entropy": 0.6854040175676346, "epoch": 0.0468, "grad_norm": 5.837286949157715, "learning_rate": 4.767947178871549e-05, "loss": 0.6787, "mean_token_accuracy": 0.8177229404449463, "num_tokens": 60557760.0, "step": 5850 }, { "entropy": 0.7118045568466187, "epoch": 0.04688, "grad_norm": 2.471794605255127, "learning_rate": 4.7675470188075235e-05, "loss": 0.7198, "mean_token_accuracy": 0.7763861775398254, "num_tokens": 60721600.0, "step": 5860 }, { "entropy": 0.8098539620637893, "epoch": 0.04696, "grad_norm": 3.801581382751465, "learning_rate": 4.767146858743497e-05, "loss": 0.8009, "mean_token_accuracy": 0.7803339183330535, "num_tokens": 60804760.0, "step": 5870 }, { "entropy": 0.7070278465747833, "epoch": 0.04704, "grad_norm": 2.1851189136505127, "learning_rate": 4.7667466986794716e-05, "loss": 0.707, "mean_token_accuracy": 0.7961211502552032, "num_tokens": 60898067.0, "step": 5880 }, { "entropy": 0.7132425546646118, "epoch": 0.04712, "grad_norm": 2.5041563510894775, "learning_rate": 4.7663465386154466e-05, "loss": 0.7082, "mean_token_accuracy": 0.7856746852397919, "num_tokens": 61040394.0, "step": 5890 }, { "entropy": 0.6787249773740769, "epoch": 0.0472, "grad_norm": 5.392166614532471, "learning_rate": 4.765946378551421e-05, "loss": 0.6763, "mean_token_accuracy": 0.8135004758834838, "num_tokens": 61081090.0, "step": 5900 }, { "entropy": 0.6746223151683808, "epoch": 0.04728, "grad_norm": 2.1180338859558105, "learning_rate": 4.765546218487395e-05, "loss": 0.6781, "mean_token_accuracy": 0.7872862815856934, "num_tokens": 61244930.0, "step": 5910 }, { "entropy": 0.6851675808429718, "epoch": 0.04736, "grad_norm": 3.4565746784210205, "learning_rate": 4.76514605842337e-05, "loss": 0.6815, "mean_token_accuracy": 0.8052472651004792, "num_tokens": 61326698.0, "step": 5920 }, { "entropy": 0.7070559978485107, "epoch": 0.04744, "grad_norm": 1.9951765537261963, "learning_rate": 4.764745898359344e-05, "loss": 0.7137, "mean_token_accuracy": 0.7945944845676423, "num_tokens": 61420082.0, "step": 5930 }, { "entropy": 0.7408281147480011, "epoch": 0.04752, "grad_norm": 4.314788341522217, "learning_rate": 4.7643457382953185e-05, "loss": 0.7397, "mean_token_accuracy": 0.7783434927463532, "num_tokens": 61559509.0, "step": 5940 }, { "entropy": 0.7332434594631195, "epoch": 0.0476, "grad_norm": 6.307979106903076, "learning_rate": 4.763945578231292e-05, "loss": 0.7333, "mean_token_accuracy": 0.8049087226390839, "num_tokens": 61598685.0, "step": 5950 }, { "entropy": 0.6695318222045898, "epoch": 0.04768, "grad_norm": 2.0076918601989746, "learning_rate": 4.763545418167267e-05, "loss": 0.6728, "mean_token_accuracy": 0.7858146071434021, "num_tokens": 61762525.0, "step": 5960 }, { "entropy": 0.6911145448684692, "epoch": 0.04776, "grad_norm": 3.078498363494873, "learning_rate": 4.7631452581032416e-05, "loss": 0.6858, "mean_token_accuracy": 0.8002315402030945, "num_tokens": 61858895.0, "step": 5970 }, { "entropy": 0.7213624596595765, "epoch": 0.04784, "grad_norm": 1.543789029121399, "learning_rate": 4.762745098039216e-05, "loss": 0.7249, "mean_token_accuracy": 0.79437096118927, "num_tokens": 61953917.0, "step": 5980 }, { "entropy": 0.683866810798645, "epoch": 0.04792, "grad_norm": 3.7344048023223877, "learning_rate": 4.7623449379751904e-05, "loss": 0.6719, "mean_token_accuracy": 0.7944526910781861, "num_tokens": 62090898.0, "step": 5990 }, { "entropy": 0.7281621932983399, "epoch": 0.048, "grad_norm": 5.158105373382568, "learning_rate": 4.761944777911165e-05, "loss": 0.7333, "mean_token_accuracy": 0.8059649169445038, "num_tokens": 62133420.0, "step": 6000 }, { "entropy": 0.6756906807422638, "epoch": 0.04808, "grad_norm": 2.4537997245788574, "learning_rate": 4.761544617847139e-05, "loss": 0.6772, "mean_token_accuracy": 0.7871381640434265, "num_tokens": 62296945.0, "step": 6010 }, { "entropy": 0.7611733615398407, "epoch": 0.04816, "grad_norm": 4.358886241912842, "learning_rate": 4.7611444577831135e-05, "loss": 0.7546, "mean_token_accuracy": 0.7901750683784485, "num_tokens": 62377523.0, "step": 6020 }, { "entropy": 0.7442758917808533, "epoch": 0.04824, "grad_norm": 1.860619068145752, "learning_rate": 4.760744297719088e-05, "loss": 0.7533, "mean_token_accuracy": 0.7840140700340271, "num_tokens": 62471218.0, "step": 6030 }, { "entropy": 0.7206055343151092, "epoch": 0.04832, "grad_norm": 3.6093692779541016, "learning_rate": 4.760344137655062e-05, "loss": 0.7127, "mean_token_accuracy": 0.7874792337417602, "num_tokens": 62597882.0, "step": 6040 }, { "entropy": 0.702993506193161, "epoch": 0.0484, "grad_norm": 6.491499900817871, "learning_rate": 4.7599439775910366e-05, "loss": 0.7, "mean_token_accuracy": 0.8130348503589631, "num_tokens": 62633874.0, "step": 6050 }, { "entropy": 0.7183920204639435, "epoch": 0.04848, "grad_norm": 2.297117233276367, "learning_rate": 4.759543817527011e-05, "loss": 0.7191, "mean_token_accuracy": 0.7782963991165162, "num_tokens": 62796372.0, "step": 6060 }, { "entropy": 0.7194522798061371, "epoch": 0.04856, "grad_norm": 4.206053256988525, "learning_rate": 4.759143657462985e-05, "loss": 0.713, "mean_token_accuracy": 0.7998963177204133, "num_tokens": 62867241.0, "step": 6070 }, { "entropy": 0.7082658171653747, "epoch": 0.04864, "grad_norm": 1.7576611042022705, "learning_rate": 4.75874349739896e-05, "loss": 0.7204, "mean_token_accuracy": 0.796032041311264, "num_tokens": 62961834.0, "step": 6080 }, { "entropy": 0.7432098627090454, "epoch": 0.04872, "grad_norm": 3.51834774017334, "learning_rate": 4.758343337334934e-05, "loss": 0.7435, "mean_token_accuracy": 0.7765458166599274, "num_tokens": 63110804.0, "step": 6090 }, { "entropy": 0.7385327965021133, "epoch": 0.0488, "grad_norm": 5.73309850692749, "learning_rate": 4.7579431772709084e-05, "loss": 0.7495, "mean_token_accuracy": 0.799317616224289, "num_tokens": 63151335.0, "step": 6100 }, { "entropy": 0.6590437948703766, "epoch": 0.04888, "grad_norm": 1.9406189918518066, "learning_rate": 4.757543017206883e-05, "loss": 0.6586, "mean_token_accuracy": 0.7937897086143494, "num_tokens": 63315175.0, "step": 6110 }, { "entropy": 0.8311211824417114, "epoch": 0.04896, "grad_norm": 3.614868640899658, "learning_rate": 4.757142857142857e-05, "loss": 0.8207, "mean_token_accuracy": 0.7759351253509521, "num_tokens": 63399743.0, "step": 6120 }, { "entropy": 0.7188091039657593, "epoch": 0.04904, "grad_norm": 2.430478811264038, "learning_rate": 4.756742697078832e-05, "loss": 0.7275, "mean_token_accuracy": 0.7940598487854004, "num_tokens": 63491817.0, "step": 6130 }, { "entropy": 0.7442274451255798, "epoch": 0.04912, "grad_norm": 3.225045680999756, "learning_rate": 4.756342537014806e-05, "loss": 0.7341, "mean_token_accuracy": 0.7820181310176849, "num_tokens": 63628217.0, "step": 6140 }, { "entropy": 0.704285079240799, "epoch": 0.0492, "grad_norm": 7.089394569396973, "learning_rate": 4.75594237695078e-05, "loss": 0.6928, "mean_token_accuracy": 0.8142695605754853, "num_tokens": 63663576.0, "step": 6150 }, { "entropy": 0.6901265859603882, "epoch": 0.04928, "grad_norm": 2.360837697982788, "learning_rate": 4.755542216886755e-05, "loss": 0.6891, "mean_token_accuracy": 0.7846055209636689, "num_tokens": 63827416.0, "step": 6160 }, { "entropy": 0.6554663836956024, "epoch": 0.04936, "grad_norm": 4.357183456420898, "learning_rate": 4.75514205682273e-05, "loss": 0.6558, "mean_token_accuracy": 0.8091155230998993, "num_tokens": 63919002.0, "step": 6170 }, { "entropy": 0.6954095780849456, "epoch": 0.04944, "grad_norm": 1.566711664199829, "learning_rate": 4.7547418967587034e-05, "loss": 0.6961, "mean_token_accuracy": 0.7939906001091004, "num_tokens": 64013482.0, "step": 6180 }, { "entropy": 0.7522185474634171, "epoch": 0.04952, "grad_norm": 2.2245733737945557, "learning_rate": 4.754341736694678e-05, "loss": 0.7394, "mean_token_accuracy": 0.7806460976600647, "num_tokens": 64157395.0, "step": 6190 }, { "entropy": 0.7553378194570541, "epoch": 0.0496, "grad_norm": 7.157981872558594, "learning_rate": 4.753941576630653e-05, "loss": 0.7629, "mean_token_accuracy": 0.8002427279949188, "num_tokens": 64199364.0, "step": 6200 }, { "entropy": 0.7281157076358795, "epoch": 0.04968, "grad_norm": 2.161128282546997, "learning_rate": 4.753541416566627e-05, "loss": 0.7256, "mean_token_accuracy": 0.7799462676048279, "num_tokens": 64363204.0, "step": 6210 }, { "entropy": 0.7054681956768036, "epoch": 0.04976, "grad_norm": 3.808588743209839, "learning_rate": 4.753141256502601e-05, "loss": 0.6982, "mean_token_accuracy": 0.7971499025821686, "num_tokens": 64444908.0, "step": 6220 }, { "entropy": 0.754537570476532, "epoch": 0.04984, "grad_norm": 2.6255698204040527, "learning_rate": 4.752741096438575e-05, "loss": 0.7657, "mean_token_accuracy": 0.7878288090229034, "num_tokens": 64538428.0, "step": 6230 }, { "entropy": 0.7068874716758728, "epoch": 0.04992, "grad_norm": 3.47733998298645, "learning_rate": 4.75234093637455e-05, "loss": 0.6943, "mean_token_accuracy": 0.7919233024120331, "num_tokens": 64667674.0, "step": 6240 }, { "entropy": 0.697674161195755, "epoch": 0.05, "grad_norm": 6.031694412231445, "learning_rate": 4.751940776310525e-05, "loss": 0.7058, "mean_token_accuracy": 0.8085933804512024, "num_tokens": 64701927.0, "step": 6250 }, { "entropy": 0.6858376383781433, "epoch": 0.05008, "grad_norm": 1.8817574977874756, "learning_rate": 4.7515406162464984e-05, "loss": 0.6847, "mean_token_accuracy": 0.78398876786232, "num_tokens": 64865767.0, "step": 6260 }, { "entropy": 0.715529453754425, "epoch": 0.05016, "grad_norm": 3.563164710998535, "learning_rate": 4.751140456182473e-05, "loss": 0.704, "mean_token_accuracy": 0.7966909170150757, "num_tokens": 64961957.0, "step": 6270 }, { "entropy": 0.7793729186058045, "epoch": 0.05024, "grad_norm": 2.604830741882324, "learning_rate": 4.750740296118448e-05, "loss": 0.795, "mean_token_accuracy": 0.776188975572586, "num_tokens": 65057244.0, "step": 6280 }, { "entropy": 0.7756605446338654, "epoch": 0.05032, "grad_norm": 3.579352617263794, "learning_rate": 4.750340136054422e-05, "loss": 0.7628, "mean_token_accuracy": 0.7717630743980408, "num_tokens": 65199442.0, "step": 6290 }, { "entropy": 0.6362009942531586, "epoch": 0.0504, "grad_norm": 6.397315979003906, "learning_rate": 4.749939975990396e-05, "loss": 0.6405, "mean_token_accuracy": 0.8256870985031128, "num_tokens": 65243210.0, "step": 6300 }, { "entropy": 0.727130776643753, "epoch": 0.05048, "grad_norm": 2.9544472694396973, "learning_rate": 4.749539815926371e-05, "loss": 0.7233, "mean_token_accuracy": 0.7765754878520965, "num_tokens": 65407050.0, "step": 6310 }, { "entropy": 0.6956623673439026, "epoch": 0.05056, "grad_norm": 3.535574197769165, "learning_rate": 4.749139655862345e-05, "loss": 0.6993, "mean_token_accuracy": 0.7975940227508544, "num_tokens": 65498685.0, "step": 6320 }, { "entropy": 0.7169604957103729, "epoch": 0.05064, "grad_norm": 2.006176471710205, "learning_rate": 4.74873949579832e-05, "loss": 0.7229, "mean_token_accuracy": 0.793713653087616, "num_tokens": 65592395.0, "step": 6330 }, { "entropy": 0.7291745007038116, "epoch": 0.05072, "grad_norm": 3.384451150894165, "learning_rate": 4.7483393357342934e-05, "loss": 0.7229, "mean_token_accuracy": 0.7828900039196014, "num_tokens": 65726993.0, "step": 6340 }, { "entropy": 0.6657244026660919, "epoch": 0.0508, "grad_norm": 5.487250328063965, "learning_rate": 4.7479391756702684e-05, "loss": 0.6784, "mean_token_accuracy": 0.8208582937717438, "num_tokens": 65764738.0, "step": 6350 }, { "entropy": 0.6987995266914367, "epoch": 0.05088, "grad_norm": 1.9082996845245361, "learning_rate": 4.747539015606243e-05, "loss": 0.6935, "mean_token_accuracy": 0.7859089493751525, "num_tokens": 65928522.0, "step": 6360 }, { "entropy": 0.7533423244953156, "epoch": 0.05096, "grad_norm": 3.344332218170166, "learning_rate": 4.747138855542217e-05, "loss": 0.7461, "mean_token_accuracy": 0.7976219415664673, "num_tokens": 66006402.0, "step": 6370 }, { "entropy": 0.7674532830715179, "epoch": 0.05104, "grad_norm": 1.8759117126464844, "learning_rate": 4.7467386954781915e-05, "loss": 0.7838, "mean_token_accuracy": 0.7818169951438904, "num_tokens": 66099833.0, "step": 6380 }, { "entropy": 0.7865933477878571, "epoch": 0.05112, "grad_norm": 2.8087451457977295, "learning_rate": 4.746338535414166e-05, "loss": 0.7644, "mean_token_accuracy": 0.7696732103824615, "num_tokens": 66243861.0, "step": 6390 }, { "entropy": 0.7533000409603119, "epoch": 0.0512, "grad_norm": 6.1926679611206055, "learning_rate": 4.74593837535014e-05, "loss": 0.7596, "mean_token_accuracy": 0.798945140838623, "num_tokens": 66282037.0, "step": 6400 }, { "entropy": 0.6415381133556366, "epoch": 0.05128, "grad_norm": 1.6081185340881348, "learning_rate": 4.7455382152861146e-05, "loss": 0.6452, "mean_token_accuracy": 0.7946843087673188, "num_tokens": 66445768.0, "step": 6410 }, { "entropy": 0.6072659730911255, "epoch": 0.05136, "grad_norm": 3.2097439765930176, "learning_rate": 4.745138055222089e-05, "loss": 0.6073, "mean_token_accuracy": 0.8187746107578278, "num_tokens": 66534106.0, "step": 6420 }, { "entropy": 0.7150554120540619, "epoch": 0.05144, "grad_norm": 2.4053187370300293, "learning_rate": 4.7447378951580634e-05, "loss": 0.7204, "mean_token_accuracy": 0.7939355492591857, "num_tokens": 66629018.0, "step": 6430 }, { "entropy": 0.7017264842987061, "epoch": 0.05152, "grad_norm": 3.4384074211120605, "learning_rate": 4.744337735094038e-05, "loss": 0.6902, "mean_token_accuracy": 0.7855807483196259, "num_tokens": 66778748.0, "step": 6440 }, { "entropy": 0.7060347259044647, "epoch": 0.0516, "grad_norm": 5.53053092956543, "learning_rate": 4.743937575030012e-05, "loss": 0.7143, "mean_token_accuracy": 0.8050483107566834, "num_tokens": 66820020.0, "step": 6450 }, { "entropy": 0.6675714552402496, "epoch": 0.05168, "grad_norm": 3.0114333629608154, "learning_rate": 4.7435374149659865e-05, "loss": 0.6641, "mean_token_accuracy": 0.7881778240203857, "num_tokens": 66983860.0, "step": 6460 }, { "entropy": 0.7074561983346939, "epoch": 0.05176, "grad_norm": 4.5198869705200195, "learning_rate": 4.743137254901961e-05, "loss": 0.7008, "mean_token_accuracy": 0.8014104664325714, "num_tokens": 67070366.0, "step": 6470 }, { "entropy": 0.7137313961982727, "epoch": 0.05184, "grad_norm": 2.6882143020629883, "learning_rate": 4.742737094837935e-05, "loss": 0.7349, "mean_token_accuracy": 0.7936661243438721, "num_tokens": 67164048.0, "step": 6480 }, { "entropy": 0.7331077575683593, "epoch": 0.05192, "grad_norm": 2.164250135421753, "learning_rate": 4.7423369347739096e-05, "loss": 0.7218, "mean_token_accuracy": 0.781807827949524, "num_tokens": 67310879.0, "step": 6490 }, { "entropy": 0.7407772839069366, "epoch": 0.052, "grad_norm": 6.30588436126709, "learning_rate": 4.741936774709884e-05, "loss": 0.7299, "mean_token_accuracy": 0.8086494326591491, "num_tokens": 67351440.0, "step": 6500 }, { "entropy": 0.6910949766635894, "epoch": 0.05208, "grad_norm": 2.5135433673858643, "learning_rate": 4.7415366146458584e-05, "loss": 0.7002, "mean_token_accuracy": 0.7829140186309814, "num_tokens": 67515280.0, "step": 6510 }, { "entropy": 0.7147131145000458, "epoch": 0.05216, "grad_norm": 3.065585136413574, "learning_rate": 4.7411364545818334e-05, "loss": 0.7079, "mean_token_accuracy": 0.7991869151592255, "num_tokens": 67608268.0, "step": 6520 }, { "entropy": 0.677420300245285, "epoch": 0.05224, "grad_norm": 2.2236311435699463, "learning_rate": 4.740736294517807e-05, "loss": 0.693, "mean_token_accuracy": 0.7997870147228241, "num_tokens": 67703251.0, "step": 6530 }, { "entropy": 0.7491819679737091, "epoch": 0.05232, "grad_norm": 2.653900623321533, "learning_rate": 4.7403361344537815e-05, "loss": 0.7371, "mean_token_accuracy": 0.7755984425544739, "num_tokens": 67847785.0, "step": 6540 }, { "entropy": 0.6670914471149445, "epoch": 0.0524, "grad_norm": 5.243509292602539, "learning_rate": 4.739935974389756e-05, "loss": 0.6562, "mean_token_accuracy": 0.8198974370956421, "num_tokens": 67891309.0, "step": 6550 }, { "entropy": 0.6862355291843414, "epoch": 0.05248, "grad_norm": 1.5464026927947998, "learning_rate": 4.739535814325731e-05, "loss": 0.6912, "mean_token_accuracy": 0.7840315222740173, "num_tokens": 68055149.0, "step": 6560 }, { "entropy": 0.6343516439199448, "epoch": 0.05256, "grad_norm": 3.802086591720581, "learning_rate": 4.7391356542617046e-05, "loss": 0.6252, "mean_token_accuracy": 0.8149972140789032, "num_tokens": 68142993.0, "step": 6570 }, { "entropy": 0.7302168548107147, "epoch": 0.05264, "grad_norm": 1.5152839422225952, "learning_rate": 4.738735494197679e-05, "loss": 0.7603, "mean_token_accuracy": 0.7818396866321564, "num_tokens": 68236429.0, "step": 6580 }, { "entropy": 0.7105825126171113, "epoch": 0.05272, "grad_norm": 2.6128265857696533, "learning_rate": 4.738335334133654e-05, "loss": 0.6907, "mean_token_accuracy": 0.7906462550163269, "num_tokens": 68373475.0, "step": 6590 }, { "entropy": 0.6564808577299118, "epoch": 0.0528, "grad_norm": 5.120362758636475, "learning_rate": 4.7379351740696284e-05, "loss": 0.661, "mean_token_accuracy": 0.816786140203476, "num_tokens": 68411793.0, "step": 6600 }, { "entropy": 0.6806240677833557, "epoch": 0.05288, "grad_norm": 2.0603582859039307, "learning_rate": 4.737535014005602e-05, "loss": 0.689, "mean_token_accuracy": 0.7873507022857666, "num_tokens": 68573554.0, "step": 6610 }, { "entropy": 0.7705601632595063, "epoch": 0.05296, "grad_norm": 3.487175226211548, "learning_rate": 4.7371348539415764e-05, "loss": 0.7436, "mean_token_accuracy": 0.7988744676113129, "num_tokens": 68643613.0, "step": 6620 }, { "entropy": 0.705102664232254, "epoch": 0.05304, "grad_norm": 2.1386096477508545, "learning_rate": 4.7367346938775515e-05, "loss": 0.724, "mean_token_accuracy": 0.7937519967555999, "num_tokens": 68737508.0, "step": 6630 }, { "entropy": 0.7636875450611115, "epoch": 0.05312, "grad_norm": 2.7493951320648193, "learning_rate": 4.736334533813526e-05, "loss": 0.7565, "mean_token_accuracy": 0.7754820048809051, "num_tokens": 68873847.0, "step": 6640 }, { "entropy": 0.6834007501602173, "epoch": 0.0532, "grad_norm": 5.536835193634033, "learning_rate": 4.7359343737494996e-05, "loss": 0.6772, "mean_token_accuracy": 0.815422385931015, "num_tokens": 68915086.0, "step": 6650 }, { "entropy": 0.7655964970588685, "epoch": 0.05328, "grad_norm": 2.628267288208008, "learning_rate": 4.7355342136854746e-05, "loss": 0.7665, "mean_token_accuracy": 0.7678682863712311, "num_tokens": 69078879.0, "step": 6660 }, { "entropy": 0.6861570239067077, "epoch": 0.05336, "grad_norm": 3.536752700805664, "learning_rate": 4.735134053621449e-05, "loss": 0.6852, "mean_token_accuracy": 0.8045047461986542, "num_tokens": 69168523.0, "step": 6670 }, { "entropy": 0.6875538945198059, "epoch": 0.05344, "grad_norm": 2.455850839614868, "learning_rate": 4.7347338935574233e-05, "loss": 0.6973, "mean_token_accuracy": 0.7942494451999664, "num_tokens": 69262227.0, "step": 6680 }, { "entropy": 0.7027499675750732, "epoch": 0.05352, "grad_norm": 2.6500816345214844, "learning_rate": 4.734333733493397e-05, "loss": 0.6936, "mean_token_accuracy": 0.78953617811203, "num_tokens": 69410514.0, "step": 6690 }, { "entropy": 0.6849575102329254, "epoch": 0.0536, "grad_norm": 4.999210834503174, "learning_rate": 4.733933573429372e-05, "loss": 0.6724, "mean_token_accuracy": 0.8200733542442322, "num_tokens": 69451567.0, "step": 6700 }, { "entropy": 0.7109967350959778, "epoch": 0.05368, "grad_norm": 2.2253482341766357, "learning_rate": 4.7335334133653465e-05, "loss": 0.7162, "mean_token_accuracy": 0.7775403082370758, "num_tokens": 69615407.0, "step": 6710 }, { "entropy": 0.691727739572525, "epoch": 0.05376, "grad_norm": 4.114674091339111, "learning_rate": 4.733133253301321e-05, "loss": 0.6748, "mean_token_accuracy": 0.8058552801609039, "num_tokens": 69692573.0, "step": 6720 }, { "entropy": 0.7218332111835479, "epoch": 0.05384, "grad_norm": 1.9742825031280518, "learning_rate": 4.732733093237295e-05, "loss": 0.7347, "mean_token_accuracy": 0.79154252409935, "num_tokens": 69784215.0, "step": 6730 }, { "entropy": 0.7463369071483612, "epoch": 0.05392, "grad_norm": 3.192966938018799, "learning_rate": 4.7323329331732696e-05, "loss": 0.7352, "mean_token_accuracy": 0.7793561339378356, "num_tokens": 69921128.0, "step": 6740 }, { "entropy": 0.6706878036260605, "epoch": 0.054, "grad_norm": 5.4618706703186035, "learning_rate": 4.731932773109244e-05, "loss": 0.6744, "mean_token_accuracy": 0.8188230276107789, "num_tokens": 69955086.0, "step": 6750 }, { "entropy": 0.6320463329553604, "epoch": 0.05408, "grad_norm": 1.4604214429855347, "learning_rate": 4.731532613045218e-05, "loss": 0.638, "mean_token_accuracy": 0.7951209068298339, "num_tokens": 70118926.0, "step": 6760 }, { "entropy": 0.7122033715248108, "epoch": 0.05416, "grad_norm": 3.082796573638916, "learning_rate": 4.731132452981193e-05, "loss": 0.7017, "mean_token_accuracy": 0.7994914054870605, "num_tokens": 70200936.0, "step": 6770 }, { "entropy": 0.7100591957569122, "epoch": 0.05424, "grad_norm": 1.8679871559143066, "learning_rate": 4.730732292917167e-05, "loss": 0.7315, "mean_token_accuracy": 0.7914937794208526, "num_tokens": 70295719.0, "step": 6780 }, { "entropy": 0.7404496014118195, "epoch": 0.05432, "grad_norm": 2.3123159408569336, "learning_rate": 4.7303321328531414e-05, "loss": 0.7233, "mean_token_accuracy": 0.7821149706840516, "num_tokens": 70437601.0, "step": 6790 }, { "entropy": 0.6534596145153045, "epoch": 0.0544, "grad_norm": 4.42868709564209, "learning_rate": 4.7299319727891165e-05, "loss": 0.6549, "mean_token_accuracy": 0.8217379689216614, "num_tokens": 70481267.0, "step": 6800 }, { "entropy": 0.6947467446327209, "epoch": 0.05448, "grad_norm": 2.345196485519409, "learning_rate": 4.72953181272509e-05, "loss": 0.6996, "mean_token_accuracy": 0.7858964025974273, "num_tokens": 70643583.0, "step": 6810 }, { "entropy": 0.7444811522960663, "epoch": 0.05456, "grad_norm": 3.559323787689209, "learning_rate": 4.7291316526610645e-05, "loss": 0.7296, "mean_token_accuracy": 0.7954493939876557, "num_tokens": 70715994.0, "step": 6820 }, { "entropy": 0.7688389718532562, "epoch": 0.05464, "grad_norm": 2.1378355026245117, "learning_rate": 4.728731492597039e-05, "loss": 0.7889, "mean_token_accuracy": 0.783012306690216, "num_tokens": 70808415.0, "step": 6830 }, { "entropy": 0.7223767459392547, "epoch": 0.05472, "grad_norm": 2.897648811340332, "learning_rate": 4.728331332533014e-05, "loss": 0.7149, "mean_token_accuracy": 0.7823453664779663, "num_tokens": 70953363.0, "step": 6840 }, { "entropy": 0.7409628927707672, "epoch": 0.0548, "grad_norm": 5.332749366760254, "learning_rate": 4.7279311724689877e-05, "loss": 0.7239, "mean_token_accuracy": 0.8056524813175201, "num_tokens": 70994142.0, "step": 6850 }, { "entropy": 0.7262901961803436, "epoch": 0.05488, "grad_norm": 1.7715216875076294, "learning_rate": 4.727531012404962e-05, "loss": 0.7371, "mean_token_accuracy": 0.773919153213501, "num_tokens": 71157982.0, "step": 6860 }, { "entropy": 0.74664586186409, "epoch": 0.05496, "grad_norm": 3.5742712020874023, "learning_rate": 4.7271308523409364e-05, "loss": 0.7503, "mean_token_accuracy": 0.7951683163642883, "num_tokens": 71238887.0, "step": 6870 }, { "entropy": 0.6813008427619934, "epoch": 0.05504, "grad_norm": 2.342071533203125, "learning_rate": 4.7267306922769114e-05, "loss": 0.6794, "mean_token_accuracy": 0.8028183817863465, "num_tokens": 71331836.0, "step": 6880 }, { "entropy": 0.7458385467529297, "epoch": 0.05512, "grad_norm": 2.968980550765991, "learning_rate": 4.726330532212885e-05, "loss": 0.7503, "mean_token_accuracy": 0.7731945097446442, "num_tokens": 71458519.0, "step": 6890 }, { "entropy": 0.7297135353088379, "epoch": 0.0552, "grad_norm": 4.923983097076416, "learning_rate": 4.7259303721488595e-05, "loss": 0.7176, "mean_token_accuracy": 0.8079638361930848, "num_tokens": 71494025.0, "step": 6900 }, { "entropy": 0.6750974118709564, "epoch": 0.05528, "grad_norm": 1.5189340114593506, "learning_rate": 4.7255302120848346e-05, "loss": 0.677, "mean_token_accuracy": 0.7851001441478729, "num_tokens": 71657865.0, "step": 6910 }, { "entropy": 0.7276409149169922, "epoch": 0.05536, "grad_norm": 3.706709146499634, "learning_rate": 4.725130052020809e-05, "loss": 0.7219, "mean_token_accuracy": 0.7944812834262848, "num_tokens": 71749553.0, "step": 6920 }, { "entropy": 0.7056154191493988, "epoch": 0.05544, "grad_norm": 1.8850542306900024, "learning_rate": 4.7247298919567826e-05, "loss": 0.7169, "mean_token_accuracy": 0.7986141264438629, "num_tokens": 71844026.0, "step": 6930 }, { "entropy": 0.7611242055892944, "epoch": 0.05552, "grad_norm": 2.5730130672454834, "learning_rate": 4.724329731892757e-05, "loss": 0.7581, "mean_token_accuracy": 0.7747282743453979, "num_tokens": 71976722.0, "step": 6940 }, { "entropy": 0.6645841062068939, "epoch": 0.0556, "grad_norm": 7.687697887420654, "learning_rate": 4.723929571828732e-05, "loss": 0.633, "mean_token_accuracy": 0.8318017840385437, "num_tokens": 72015587.0, "step": 6950 }, { "entropy": 0.6859334409236908, "epoch": 0.05568, "grad_norm": 1.6738035678863525, "learning_rate": 4.7235294117647064e-05, "loss": 0.693, "mean_token_accuracy": 0.7843296468257904, "num_tokens": 72179407.0, "step": 6960 }, { "entropy": 0.6597012102603912, "epoch": 0.05576, "grad_norm": 4.052907466888428, "learning_rate": 4.72312925170068e-05, "loss": 0.6586, "mean_token_accuracy": 0.8084193229675293, "num_tokens": 72257450.0, "step": 6970 }, { "entropy": 0.7319607377052307, "epoch": 0.05584, "grad_norm": 2.3652939796447754, "learning_rate": 4.722729091636655e-05, "loss": 0.7418, "mean_token_accuracy": 0.7925260066986084, "num_tokens": 72349759.0, "step": 6980 }, { "entropy": 0.7089536488056183, "epoch": 0.05592, "grad_norm": 2.8572652339935303, "learning_rate": 4.7223289315726295e-05, "loss": 0.7103, "mean_token_accuracy": 0.7838482439517975, "num_tokens": 72485548.0, "step": 6990 }, { "entropy": 0.6930984675884246, "epoch": 0.056, "grad_norm": 5.822727203369141, "learning_rate": 4.721928771508604e-05, "loss": 0.6703, "mean_token_accuracy": 0.8124133944511414, "num_tokens": 72526418.0, "step": 7000 }, { "entropy": 0.6630230724811554, "epoch": 0.05608, "grad_norm": 2.850888967514038, "learning_rate": 4.7215286114445776e-05, "loss": 0.6748, "mean_token_accuracy": 0.7898334860801697, "num_tokens": 72689242.0, "step": 7010 }, { "entropy": 0.7379312574863434, "epoch": 0.05616, "grad_norm": 4.086813449859619, "learning_rate": 4.7211284513805526e-05, "loss": 0.7302, "mean_token_accuracy": 0.796303927898407, "num_tokens": 72759812.0, "step": 7020 }, { "entropy": 0.6672995924949646, "epoch": 0.05624, "grad_norm": 2.0084614753723145, "learning_rate": 4.720728291316527e-05, "loss": 0.6776, "mean_token_accuracy": 0.8031298577785492, "num_tokens": 72852208.0, "step": 7030 }, { "entropy": 0.69437495470047, "epoch": 0.05632, "grad_norm": 2.3999440670013428, "learning_rate": 4.7203281312525014e-05, "loss": 0.6957, "mean_token_accuracy": 0.7870599687099457, "num_tokens": 72999997.0, "step": 7040 }, { "entropy": 0.7145476549863815, "epoch": 0.0564, "grad_norm": 5.225218772888184, "learning_rate": 4.719927971188476e-05, "loss": 0.7179, "mean_token_accuracy": 0.8085507571697235, "num_tokens": 73047297.0, "step": 7050 }, { "entropy": 0.770794689655304, "epoch": 0.05648, "grad_norm": 2.433511257171631, "learning_rate": 4.71952781112445e-05, "loss": 0.7747, "mean_token_accuracy": 0.763645589351654, "num_tokens": 73210882.0, "step": 7060 }, { "entropy": 0.6636249721050262, "epoch": 0.05656, "grad_norm": 3.132800579071045, "learning_rate": 4.7191276510604245e-05, "loss": 0.6636, "mean_token_accuracy": 0.8116121649742126, "num_tokens": 73282413.0, "step": 7070 }, { "entropy": 0.7236921548843384, "epoch": 0.05664, "grad_norm": 1.4362279176712036, "learning_rate": 4.718727490996399e-05, "loss": 0.7315, "mean_token_accuracy": 0.795242303609848, "num_tokens": 73375972.0, "step": 7080 }, { "entropy": 0.7243578016757966, "epoch": 0.05672, "grad_norm": 3.011075496673584, "learning_rate": 4.718327330932373e-05, "loss": 0.7096, "mean_token_accuracy": 0.7875242948532104, "num_tokens": 73503261.0, "step": 7090 }, { "entropy": 0.7431185066699981, "epoch": 0.0568, "grad_norm": 5.3128886222839355, "learning_rate": 4.7179271708683476e-05, "loss": 0.7506, "mean_token_accuracy": 0.7968698501586914, "num_tokens": 73538252.0, "step": 7100 }, { "entropy": 0.7292089760303497, "epoch": 0.05688, "grad_norm": 2.1809890270233154, "learning_rate": 4.717527010804322e-05, "loss": 0.731, "mean_token_accuracy": 0.7732792496681213, "num_tokens": 73699404.0, "step": 7110 }, { "entropy": 0.6779338866472244, "epoch": 0.05696, "grad_norm": 3.732699394226074, "learning_rate": 4.7171268507402964e-05, "loss": 0.6631, "mean_token_accuracy": 0.8155210316181183, "num_tokens": 73770222.0, "step": 7120 }, { "entropy": 0.73962322473526, "epoch": 0.05704, "grad_norm": 1.6129934787750244, "learning_rate": 4.716726690676271e-05, "loss": 0.751, "mean_token_accuracy": 0.7891646504402161, "num_tokens": 73862767.0, "step": 7130 }, { "entropy": 0.738612312078476, "epoch": 0.05712, "grad_norm": 2.6029202938079834, "learning_rate": 4.716326530612245e-05, "loss": 0.7359, "mean_token_accuracy": 0.778672844171524, "num_tokens": 74009184.0, "step": 7140 }, { "entropy": 0.6881103932857513, "epoch": 0.0572, "grad_norm": 4.955204010009766, "learning_rate": 4.7159263705482195e-05, "loss": 0.6725, "mean_token_accuracy": 0.8140874922275543, "num_tokens": 74053846.0, "step": 7150 }, { "entropy": 0.7149232029914856, "epoch": 0.05728, "grad_norm": 1.820578694343567, "learning_rate": 4.715526210484194e-05, "loss": 0.7154, "mean_token_accuracy": 0.7823878288269043, "num_tokens": 74216733.0, "step": 7160 }, { "entropy": 0.7081674963235856, "epoch": 0.05736, "grad_norm": 5.596731185913086, "learning_rate": 4.715126050420168e-05, "loss": 0.7031, "mean_token_accuracy": 0.7995010197162629, "num_tokens": 74282933.0, "step": 7170 }, { "entropy": 0.789714801311493, "epoch": 0.05744, "grad_norm": 1.8283299207687378, "learning_rate": 4.7147258903561426e-05, "loss": 0.7862, "mean_token_accuracy": 0.7794993102550507, "num_tokens": 74374844.0, "step": 7180 }, { "entropy": 0.7192356586456299, "epoch": 0.05752, "grad_norm": 2.974348306655884, "learning_rate": 4.7143257302921176e-05, "loss": 0.7164, "mean_token_accuracy": 0.7809772670269013, "num_tokens": 74513290.0, "step": 7190 }, { "entropy": 0.7299598515033722, "epoch": 0.0576, "grad_norm": 4.88840913772583, "learning_rate": 4.713925570228091e-05, "loss": 0.7274, "mean_token_accuracy": 0.8113088965415954, "num_tokens": 74552460.0, "step": 7200 }, { "entropy": 0.6730819582939148, "epoch": 0.05768, "grad_norm": 2.191927433013916, "learning_rate": 4.713525410164066e-05, "loss": 0.6795, "mean_token_accuracy": 0.7887640416622161, "num_tokens": 74716300.0, "step": 7210 }, { "entropy": 0.705349600315094, "epoch": 0.05776, "grad_norm": 3.5624070167541504, "learning_rate": 4.71312525010004e-05, "loss": 0.6954, "mean_token_accuracy": 0.7978479623794555, "num_tokens": 74797436.0, "step": 7220 }, { "entropy": 0.7157325267791748, "epoch": 0.05784, "grad_norm": 2.8355109691619873, "learning_rate": 4.712725090036015e-05, "loss": 0.7197, "mean_token_accuracy": 0.7963757216930389, "num_tokens": 74890741.0, "step": 7230 }, { "entropy": 0.6946555376052856, "epoch": 0.05792, "grad_norm": 3.0150904655456543, "learning_rate": 4.712324929971989e-05, "loss": 0.6843, "mean_token_accuracy": 0.7896579027175903, "num_tokens": 75032734.0, "step": 7240 }, { "entropy": 0.6976794302463531, "epoch": 0.058, "grad_norm": 4.649782180786133, "learning_rate": 4.711924769907963e-05, "loss": 0.7016, "mean_token_accuracy": 0.8120878458023071, "num_tokens": 75071737.0, "step": 7250 }, { "entropy": 0.7307097494602204, "epoch": 0.05808, "grad_norm": 1.9753915071487427, "learning_rate": 4.711524609843938e-05, "loss": 0.736, "mean_token_accuracy": 0.775147944688797, "num_tokens": 75235550.0, "step": 7260 }, { "entropy": 0.6972940146923066, "epoch": 0.05816, "grad_norm": 3.9285900592803955, "learning_rate": 4.7111244497799126e-05, "loss": 0.6929, "mean_token_accuracy": 0.8020839333534241, "num_tokens": 75319029.0, "step": 7270 }, { "entropy": 0.6725509941577912, "epoch": 0.05824, "grad_norm": 1.4538743495941162, "learning_rate": 4.710724289715886e-05, "loss": 0.6788, "mean_token_accuracy": 0.8048302710056305, "num_tokens": 75413335.0, "step": 7280 }, { "entropy": 0.708511084318161, "epoch": 0.05832, "grad_norm": 2.6318345069885254, "learning_rate": 4.710324129651861e-05, "loss": 0.7099, "mean_token_accuracy": 0.7828430891036987, "num_tokens": 75551402.0, "step": 7290 }, { "entropy": 0.6804218292236328, "epoch": 0.0584, "grad_norm": 5.283367156982422, "learning_rate": 4.709923969587836e-05, "loss": 0.6678, "mean_token_accuracy": 0.8187371671199799, "num_tokens": 75589735.0, "step": 7300 }, { "entropy": 0.6435972273349762, "epoch": 0.05848, "grad_norm": 2.0061492919921875, "learning_rate": 4.70952380952381e-05, "loss": 0.6483, "mean_token_accuracy": 0.7911211550235748, "num_tokens": 75753575.0, "step": 7310 }, { "entropy": 0.6756671249866486, "epoch": 0.05856, "grad_norm": 3.778748035430908, "learning_rate": 4.709123649459784e-05, "loss": 0.6786, "mean_token_accuracy": 0.8029399633407592, "num_tokens": 75847131.0, "step": 7320 }, { "entropy": 0.7369563460350037, "epoch": 0.05864, "grad_norm": 2.4259562492370605, "learning_rate": 4.708723489395759e-05, "loss": 0.7283, "mean_token_accuracy": 0.7965247988700866, "num_tokens": 75940873.0, "step": 7330 }, { "entropy": 0.758866423368454, "epoch": 0.05872, "grad_norm": 2.784310817718506, "learning_rate": 4.708323329331733e-05, "loss": 0.7541, "mean_token_accuracy": 0.7803191184997559, "num_tokens": 76070302.0, "step": 7340 }, { "entropy": 0.6889219760894776, "epoch": 0.0588, "grad_norm": 5.643022060394287, "learning_rate": 4.7079231692677076e-05, "loss": 0.673, "mean_token_accuracy": 0.8182763278484344, "num_tokens": 76108506.0, "step": 7350 }, { "entropy": 0.6643274366855622, "epoch": 0.05888, "grad_norm": 1.8602973222732544, "learning_rate": 4.707523009203681e-05, "loss": 0.6674, "mean_token_accuracy": 0.7936248242855072, "num_tokens": 76272346.0, "step": 7360 }, { "entropy": 0.6720586240291595, "epoch": 0.05896, "grad_norm": 3.1414425373077393, "learning_rate": 4.707122849139656e-05, "loss": 0.677, "mean_token_accuracy": 0.8030494213104248, "num_tokens": 76361229.0, "step": 7370 }, { "entropy": 0.7280028104782105, "epoch": 0.05904, "grad_norm": 1.770622730255127, "learning_rate": 4.706722689075631e-05, "loss": 0.7306, "mean_token_accuracy": 0.7941327095031738, "num_tokens": 76455925.0, "step": 7380 }, { "entropy": 0.728749406337738, "epoch": 0.05912, "grad_norm": 2.4263343811035156, "learning_rate": 4.706322529011605e-05, "loss": 0.7264, "mean_token_accuracy": 0.7789476752281189, "num_tokens": 76595279.0, "step": 7390 }, { "entropy": 0.7301312029361725, "epoch": 0.0592, "grad_norm": 4.350391864776611, "learning_rate": 4.705922368947579e-05, "loss": 0.7277, "mean_token_accuracy": 0.8113160967826843, "num_tokens": 76633313.0, "step": 7400 }, { "entropy": 0.6473390519618988, "epoch": 0.05928, "grad_norm": 1.5497503280639648, "learning_rate": 4.705522208883554e-05, "loss": 0.6495, "mean_token_accuracy": 0.7930447101593018, "num_tokens": 76797153.0, "step": 7410 }, { "entropy": 0.6247776746749878, "epoch": 0.05936, "grad_norm": 3.1302831172943115, "learning_rate": 4.705122048819528e-05, "loss": 0.6181, "mean_token_accuracy": 0.8171108543872834, "num_tokens": 76883040.0, "step": 7420 }, { "entropy": 0.697649210691452, "epoch": 0.05944, "grad_norm": 1.6160616874694824, "learning_rate": 4.7047218887555025e-05, "loss": 0.6969, "mean_token_accuracy": 0.798204380273819, "num_tokens": 76978926.0, "step": 7430 }, { "entropy": 0.7075642704963684, "epoch": 0.05952, "grad_norm": 2.1057350635528564, "learning_rate": 4.704321728691477e-05, "loss": 0.7075, "mean_token_accuracy": 0.7869625926017761, "num_tokens": 77119106.0, "step": 7440 }, { "entropy": 0.6483104467391968, "epoch": 0.0596, "grad_norm": 6.512531280517578, "learning_rate": 4.703921568627451e-05, "loss": 0.6429, "mean_token_accuracy": 0.8222284913063049, "num_tokens": 77163816.0, "step": 7450 }, { "entropy": 0.6497842311859131, "epoch": 0.05968, "grad_norm": 1.5152440071105957, "learning_rate": 4.703521408563426e-05, "loss": 0.6485, "mean_token_accuracy": 0.7955056250095367, "num_tokens": 77327656.0, "step": 7460 }, { "entropy": 0.75073461830616, "epoch": 0.05976, "grad_norm": 3.8825531005859375, "learning_rate": 4.7031212484994e-05, "loss": 0.7436, "mean_token_accuracy": 0.7890023708343505, "num_tokens": 77423736.0, "step": 7470 }, { "entropy": 0.7150587856769561, "epoch": 0.05984, "grad_norm": 2.724106550216675, "learning_rate": 4.7027210884353744e-05, "loss": 0.7179, "mean_token_accuracy": 0.7984933435916901, "num_tokens": 77516213.0, "step": 7480 }, { "entropy": 0.7329165518283844, "epoch": 0.05992, "grad_norm": 2.602834463119507, "learning_rate": 4.702320928371349e-05, "loss": 0.7349, "mean_token_accuracy": 0.7799891531467438, "num_tokens": 77643999.0, "step": 7490 }, { "entropy": 0.6713918924331665, "epoch": 0.06, "grad_norm": 7.073276042938232, "learning_rate": 4.701920768307323e-05, "loss": 0.6523, "mean_token_accuracy": 0.8268399059772491, "num_tokens": 77677996.0, "step": 7500 }, { "entropy": 0.6546876668930054, "epoch": 0.06008, "grad_norm": 2.2843079566955566, "learning_rate": 4.7015206082432975e-05, "loss": 0.6641, "mean_token_accuracy": 0.792085987329483, "num_tokens": 77841836.0, "step": 7510 }, { "entropy": 0.6518349170684814, "epoch": 0.06016, "grad_norm": 3.5219545364379883, "learning_rate": 4.701120448179272e-05, "loss": 0.639, "mean_token_accuracy": 0.8095561385154724, "num_tokens": 77933089.0, "step": 7520 }, { "entropy": 0.7229756593704224, "epoch": 0.06024, "grad_norm": 1.6752681732177734, "learning_rate": 4.700720288115246e-05, "loss": 0.7345, "mean_token_accuracy": 0.792249184846878, "num_tokens": 78027673.0, "step": 7530 }, { "entropy": 0.686326003074646, "epoch": 0.06032, "grad_norm": 3.3705410957336426, "learning_rate": 4.7003201280512206e-05, "loss": 0.6782, "mean_token_accuracy": 0.794960904121399, "num_tokens": 78156181.0, "step": 7540 }, { "entropy": 0.6883456170558929, "epoch": 0.0604, "grad_norm": 5.426213264465332, "learning_rate": 4.699919967987195e-05, "loss": 0.6841, "mean_token_accuracy": 0.8193764388561249, "num_tokens": 78193253.0, "step": 7550 }, { "entropy": 0.67265784740448, "epoch": 0.06048, "grad_norm": 2.1375837326049805, "learning_rate": 4.6995198079231694e-05, "loss": 0.6732, "mean_token_accuracy": 0.787756472826004, "num_tokens": 78357093.0, "step": 7560 }, { "entropy": 0.8445215702056885, "epoch": 0.06056, "grad_norm": 3.4111289978027344, "learning_rate": 4.699119647859144e-05, "loss": 0.8398, "mean_token_accuracy": 0.7707171618938446, "num_tokens": 78437697.0, "step": 7570 }, { "entropy": 0.7450326681137085, "epoch": 0.06064, "grad_norm": 1.7725129127502441, "learning_rate": 4.698719487795119e-05, "loss": 0.7374, "mean_token_accuracy": 0.7848253309726715, "num_tokens": 78531794.0, "step": 7580 }, { "entropy": 0.7900131404399872, "epoch": 0.06072, "grad_norm": 3.8003861904144287, "learning_rate": 4.6983193277310925e-05, "loss": 0.7821, "mean_token_accuracy": 0.7658324062824249, "num_tokens": 78678837.0, "step": 7590 }, { "entropy": 0.6996699094772338, "epoch": 0.0608, "grad_norm": 5.0588884353637695, "learning_rate": 4.697919167667067e-05, "loss": 0.7147, "mean_token_accuracy": 0.8098361492156982, "num_tokens": 78720310.0, "step": 7600 }, { "entropy": 0.70042484998703, "epoch": 0.06088, "grad_norm": 2.1531076431274414, "learning_rate": 4.697519007603041e-05, "loss": 0.7027, "mean_token_accuracy": 0.782321697473526, "num_tokens": 78884150.0, "step": 7610 }, { "entropy": 0.6515151798725128, "epoch": 0.06096, "grad_norm": 4.141386985778809, "learning_rate": 4.697118847539016e-05, "loss": 0.6416, "mean_token_accuracy": 0.8117716610431671, "num_tokens": 78962630.0, "step": 7620 }, { "entropy": 0.7025827169418335, "epoch": 0.06104, "grad_norm": 2.1903836727142334, "learning_rate": 4.69671868747499e-05, "loss": 0.7159, "mean_token_accuracy": 0.7989046394824981, "num_tokens": 79056391.0, "step": 7630 }, { "entropy": 0.7471938073635102, "epoch": 0.06112, "grad_norm": 2.524407148361206, "learning_rate": 4.6963185274109643e-05, "loss": 0.7393, "mean_token_accuracy": 0.7846004068851471, "num_tokens": 79182309.0, "step": 7640 }, { "entropy": 0.7744067639112473, "epoch": 0.0612, "grad_norm": 6.421394348144531, "learning_rate": 4.6959183673469394e-05, "loss": 0.7718, "mean_token_accuracy": 0.7974925935268402, "num_tokens": 79215848.0, "step": 7650 }, { "entropy": 0.6954597413539887, "epoch": 0.06128, "grad_norm": 2.5830256938934326, "learning_rate": 4.695518207282914e-05, "loss": 0.702, "mean_token_accuracy": 0.7832960426807404, "num_tokens": 79379659.0, "step": 7660 }, { "entropy": 0.7369183361530304, "epoch": 0.06136, "grad_norm": 4.59665584564209, "learning_rate": 4.6951180472188875e-05, "loss": 0.7273, "mean_token_accuracy": 0.7947853565216064, "num_tokens": 79458750.0, "step": 7670 }, { "entropy": 0.8039923489093781, "epoch": 0.06144, "grad_norm": 1.8848551511764526, "learning_rate": 4.694717887154862e-05, "loss": 0.7856, "mean_token_accuracy": 0.786553579568863, "num_tokens": 79552008.0, "step": 7680 }, { "entropy": 0.6875883638858795, "epoch": 0.06152, "grad_norm": 3.693082571029663, "learning_rate": 4.694317727090837e-05, "loss": 0.6856, "mean_token_accuracy": 0.7923605859279632, "num_tokens": 79685873.0, "step": 7690 }, { "entropy": 0.6615382254123687, "epoch": 0.0616, "grad_norm": 5.631678104400635, "learning_rate": 4.693917567026811e-05, "loss": 0.6589, "mean_token_accuracy": 0.821614670753479, "num_tokens": 79727312.0, "step": 7700 }, { "entropy": 0.6532787144184112, "epoch": 0.06168, "grad_norm": 1.776137351989746, "learning_rate": 4.693517406962785e-05, "loss": 0.6597, "mean_token_accuracy": 0.7931301951408386, "num_tokens": 79891152.0, "step": 7710 }, { "entropy": 0.6741016566753387, "epoch": 0.06176, "grad_norm": 3.1276047229766846, "learning_rate": 4.69311724689876e-05, "loss": 0.6644, "mean_token_accuracy": 0.8075404763221741, "num_tokens": 79982624.0, "step": 7720 }, { "entropy": 0.6364237636327743, "epoch": 0.06184, "grad_norm": 1.7089046239852905, "learning_rate": 4.6927170868347344e-05, "loss": 0.6427, "mean_token_accuracy": 0.8085153341293335, "num_tokens": 80078755.0, "step": 7730 }, { "entropy": 0.6687073588371277, "epoch": 0.06192, "grad_norm": 2.5626556873321533, "learning_rate": 4.692316926770709e-05, "loss": 0.659, "mean_token_accuracy": 0.7945926725864411, "num_tokens": 80220880.0, "step": 7740 }, { "entropy": 0.6380296170711517, "epoch": 0.062, "grad_norm": 4.79558801651001, "learning_rate": 4.6919167667066824e-05, "loss": 0.6338, "mean_token_accuracy": 0.8215503871440888, "num_tokens": 80265175.0, "step": 7750 }, { "entropy": 0.7253217935562134, "epoch": 0.06208, "grad_norm": 1.5946776866912842, "learning_rate": 4.6915166066426575e-05, "loss": 0.7321, "mean_token_accuracy": 0.7782120168209076, "num_tokens": 80429015.0, "step": 7760 }, { "entropy": 0.6827381014823913, "epoch": 0.06216, "grad_norm": 3.1007802486419678, "learning_rate": 4.691116446578632e-05, "loss": 0.6722, "mean_token_accuracy": 0.8068941116333008, "num_tokens": 80510843.0, "step": 7770 }, { "entropy": 0.7316217839717865, "epoch": 0.06224, "grad_norm": 2.2609007358551025, "learning_rate": 4.690716286514606e-05, "loss": 0.7354, "mean_token_accuracy": 0.7913499653339386, "num_tokens": 80604793.0, "step": 7780 }, { "entropy": 0.6761456608772278, "epoch": 0.06232, "grad_norm": 2.5835485458374023, "learning_rate": 4.6903161264505806e-05, "loss": 0.6822, "mean_token_accuracy": 0.7939943671226501, "num_tokens": 80736050.0, "step": 7790 }, { "entropy": 0.6798199236392974, "epoch": 0.0624, "grad_norm": 5.1378655433654785, "learning_rate": 4.689915966386555e-05, "loss": 0.6742, "mean_token_accuracy": 0.8150401532649993, "num_tokens": 80771580.0, "step": 7800 }, { "entropy": 0.7102455377578736, "epoch": 0.06248, "grad_norm": 2.372703790664673, "learning_rate": 4.689515806322529e-05, "loss": 0.7143, "mean_token_accuracy": 0.7774609208106995, "num_tokens": 80935420.0, "step": 7810 }, { "entropy": 0.6989517033100128, "epoch": 0.06256, "grad_norm": 4.058769702911377, "learning_rate": 4.689115646258504e-05, "loss": 0.6978, "mean_token_accuracy": 0.7998451232910156, "num_tokens": 81024691.0, "step": 7820 }, { "entropy": 0.7772353649139404, "epoch": 0.06264, "grad_norm": 2.8059911727905273, "learning_rate": 4.688715486194478e-05, "loss": 0.7739, "mean_token_accuracy": 0.7867143630981446, "num_tokens": 81116909.0, "step": 7830 }, { "entropy": 0.7079249858856201, "epoch": 0.06272, "grad_norm": 2.3203208446502686, "learning_rate": 4.6883153261304525e-05, "loss": 0.7, "mean_token_accuracy": 0.787809532880783, "num_tokens": 81255351.0, "step": 7840 }, { "entropy": 0.6533970355987548, "epoch": 0.0628, "grad_norm": 9.96830940246582, "learning_rate": 4.687915166066427e-05, "loss": 0.6417, "mean_token_accuracy": 0.8203457355499267, "num_tokens": 81295797.0, "step": 7850 }, { "entropy": 0.6620848715305329, "epoch": 0.06288, "grad_norm": 2.227058172225952, "learning_rate": 4.687515006002401e-05, "loss": 0.6698, "mean_token_accuracy": 0.7868344008922576, "num_tokens": 81459637.0, "step": 7860 }, { "entropy": 0.7138966858386994, "epoch": 0.06296, "grad_norm": 4.173727512359619, "learning_rate": 4.6871148459383756e-05, "loss": 0.7079, "mean_token_accuracy": 0.792868173122406, "num_tokens": 81555994.0, "step": 7870 }, { "entropy": 0.6846612334251404, "epoch": 0.06304, "grad_norm": 2.0815553665161133, "learning_rate": 4.68671468587435e-05, "loss": 0.6935, "mean_token_accuracy": 0.7970943093299866, "num_tokens": 81650863.0, "step": 7880 }, { "entropy": 0.716995632648468, "epoch": 0.06312, "grad_norm": 2.8506929874420166, "learning_rate": 4.686314525810324e-05, "loss": 0.7124, "mean_token_accuracy": 0.788903945684433, "num_tokens": 81774977.0, "step": 7890 }, { "entropy": 0.779921081662178, "epoch": 0.0632, "grad_norm": 5.540877819061279, "learning_rate": 4.685914365746299e-05, "loss": 0.7703, "mean_token_accuracy": 0.8024200022220611, "num_tokens": 81807299.0, "step": 7900 }, { "entropy": 0.6983536839485168, "epoch": 0.06328, "grad_norm": 1.8284704685211182, "learning_rate": 4.685514205682273e-05, "loss": 0.6978, "mean_token_accuracy": 0.7809110939502716, "num_tokens": 81971139.0, "step": 7910 }, { "entropy": 0.6935306847095489, "epoch": 0.06336, "grad_norm": 2.964839458465576, "learning_rate": 4.6851140456182474e-05, "loss": 0.7013, "mean_token_accuracy": 0.7972304105758667, "num_tokens": 82059740.0, "step": 7920 }, { "entropy": 0.7394335448741913, "epoch": 0.06344, "grad_norm": 1.995283603668213, "learning_rate": 4.684713885554222e-05, "loss": 0.736, "mean_token_accuracy": 0.7925179302692413, "num_tokens": 82154756.0, "step": 7930 }, { "entropy": 0.7595732152462006, "epoch": 0.06352, "grad_norm": 2.6988954544067383, "learning_rate": 4.684313725490196e-05, "loss": 0.7528, "mean_token_accuracy": 0.7732567191123962, "num_tokens": 82290518.0, "step": 7940 }, { "entropy": 0.6343204259872437, "epoch": 0.0636, "grad_norm": 5.1161346435546875, "learning_rate": 4.6839135654261705e-05, "loss": 0.6388, "mean_token_accuracy": 0.8233921766281128, "num_tokens": 82329639.0, "step": 7950 }, { "entropy": 0.6901828527450562, "epoch": 0.06368, "grad_norm": 1.8358476161956787, "learning_rate": 4.683513405362145e-05, "loss": 0.6924, "mean_token_accuracy": 0.7840254127979278, "num_tokens": 82493479.0, "step": 7960 }, { "entropy": 0.6494813919067383, "epoch": 0.06376, "grad_norm": 4.595601558685303, "learning_rate": 4.68311324529812e-05, "loss": 0.6501, "mean_token_accuracy": 0.8114818871021271, "num_tokens": 82573042.0, "step": 7970 }, { "entropy": 0.7182214140892029, "epoch": 0.06384, "grad_norm": 1.8922170400619507, "learning_rate": 4.6827130852340937e-05, "loss": 0.7285, "mean_token_accuracy": 0.7939072012901306, "num_tokens": 82667042.0, "step": 7980 }, { "entropy": 0.7708416998386383, "epoch": 0.06392, "grad_norm": 3.173095464706421, "learning_rate": 4.682312925170068e-05, "loss": 0.7676, "mean_token_accuracy": 0.7734446227550507, "num_tokens": 82804039.0, "step": 7990 }, { "entropy": 0.7794161021709443, "epoch": 0.064, "grad_norm": 4.3830952644348145, "learning_rate": 4.6819127651060424e-05, "loss": 0.7651, "mean_token_accuracy": 0.7964083194732666, "num_tokens": 82843883.0, "step": 8000 }, { "entropy": 0.67960444688797, "epoch": 0.06408, "grad_norm": 2.5344767570495605, "learning_rate": 4.6815126050420174e-05, "loss": 0.6838, "mean_token_accuracy": 0.7826819777488708, "num_tokens": 83007723.0, "step": 8010 }, { "entropy": 0.7033899694681167, "epoch": 0.06416, "grad_norm": 3.119405746459961, "learning_rate": 4.681112444977991e-05, "loss": 0.6856, "mean_token_accuracy": 0.8013014972209931, "num_tokens": 83105112.0, "step": 8020 }, { "entropy": 0.7603117406368256, "epoch": 0.06424, "grad_norm": 1.7536990642547607, "learning_rate": 4.6807122849139655e-05, "loss": 0.7795, "mean_token_accuracy": 0.7829180300235749, "num_tokens": 83199934.0, "step": 8030 }, { "entropy": 0.7223023176193237, "epoch": 0.06432, "grad_norm": 3.3675453662872314, "learning_rate": 4.6803121248499406e-05, "loss": 0.7091, "mean_token_accuracy": 0.7830998599529266, "num_tokens": 83332767.0, "step": 8040 }, { "entropy": 0.6562320083379746, "epoch": 0.0644, "grad_norm": 4.741036415100098, "learning_rate": 4.679911964785915e-05, "loss": 0.6666, "mean_token_accuracy": 0.8124914228916168, "num_tokens": 83368130.0, "step": 8050 }, { "entropy": 0.6825137257575988, "epoch": 0.06448, "grad_norm": 1.6294084787368774, "learning_rate": 4.6795118047218886e-05, "loss": 0.6778, "mean_token_accuracy": 0.7873778820037842, "num_tokens": 83531970.0, "step": 8060 }, { "entropy": 0.6827750027179718, "epoch": 0.06456, "grad_norm": 3.957319974899292, "learning_rate": 4.679111644657863e-05, "loss": 0.682, "mean_token_accuracy": 0.8012048304080963, "num_tokens": 83628080.0, "step": 8070 }, { "entropy": 0.6820100784301758, "epoch": 0.06464, "grad_norm": 2.2165257930755615, "learning_rate": 4.678711484593838e-05, "loss": 0.7081, "mean_token_accuracy": 0.7943060994148254, "num_tokens": 83722299.0, "step": 8080 }, { "entropy": 0.7300933778285981, "epoch": 0.06472, "grad_norm": 2.641667366027832, "learning_rate": 4.6783113245298124e-05, "loss": 0.7077, "mean_token_accuracy": 0.7872061252593994, "num_tokens": 83847788.0, "step": 8090 }, { "entropy": 0.7307800590991974, "epoch": 0.0648, "grad_norm": 6.34265661239624, "learning_rate": 4.677911164465786e-05, "loss": 0.7382, "mean_token_accuracy": 0.8063368320465087, "num_tokens": 83883510.0, "step": 8100 }, { "entropy": 0.6450398921966553, "epoch": 0.06488, "grad_norm": 2.446472644805908, "learning_rate": 4.677511004401761e-05, "loss": 0.6464, "mean_token_accuracy": 0.795941287279129, "num_tokens": 84045783.0, "step": 8110 }, { "entropy": 0.715217924118042, "epoch": 0.06496, "grad_norm": 5.913429260253906, "learning_rate": 4.6771108443377355e-05, "loss": 0.6992, "mean_token_accuracy": 0.8029979228973388, "num_tokens": 84121991.0, "step": 8120 }, { "entropy": 0.6968787610530853, "epoch": 0.06504, "grad_norm": 2.2516186237335205, "learning_rate": 4.67671068427371e-05, "loss": 0.7014, "mean_token_accuracy": 0.8019091784954071, "num_tokens": 84215373.0, "step": 8130 }, { "entropy": 0.7117706894874573, "epoch": 0.06512, "grad_norm": 2.931081533432007, "learning_rate": 4.6763105242096836e-05, "loss": 0.7088, "mean_token_accuracy": 0.7857870519161224, "num_tokens": 84345460.0, "step": 8140 }, { "entropy": 0.7021040916442871, "epoch": 0.0652, "grad_norm": 6.570584297180176, "learning_rate": 4.6759103641456586e-05, "loss": 0.6979, "mean_token_accuracy": 0.8180476784706116, "num_tokens": 84384145.0, "step": 8150 }, { "entropy": 0.6478897511959076, "epoch": 0.06528, "grad_norm": 2.1292531490325928, "learning_rate": 4.675510204081633e-05, "loss": 0.6499, "mean_token_accuracy": 0.7930508136749268, "num_tokens": 84547985.0, "step": 8160 }, { "entropy": 0.754848039150238, "epoch": 0.06536, "grad_norm": 3.4026424884796143, "learning_rate": 4.6751100440176074e-05, "loss": 0.7506, "mean_token_accuracy": 0.786369240283966, "num_tokens": 84641241.0, "step": 8170 }, { "entropy": 0.7112803041934967, "epoch": 0.06544, "grad_norm": 2.09165358543396, "learning_rate": 4.674709883953582e-05, "loss": 0.7222, "mean_token_accuracy": 0.7933887422084809, "num_tokens": 84735555.0, "step": 8180 }, { "entropy": 0.7429555594921112, "epoch": 0.06552, "grad_norm": 3.1007239818573, "learning_rate": 4.674309723889556e-05, "loss": 0.7354, "mean_token_accuracy": 0.7755721688270569, "num_tokens": 84876084.0, "step": 8190 }, { "entropy": 0.7757616460323333, "epoch": 0.0656, "grad_norm": 4.669337749481201, "learning_rate": 4.6739095638255305e-05, "loss": 0.7626, "mean_token_accuracy": 0.7995081007480621, "num_tokens": 84910801.0, "step": 8200 }, { "entropy": 0.6639736592769623, "epoch": 0.06568, "grad_norm": 1.7296762466430664, "learning_rate": 4.673509403761505e-05, "loss": 0.6659, "mean_token_accuracy": 0.7895517885684967, "num_tokens": 85074641.0, "step": 8210 }, { "entropy": 0.7020543813705444, "epoch": 0.06576, "grad_norm": 3.5259153842926025, "learning_rate": 4.673109243697479e-05, "loss": 0.7018, "mean_token_accuracy": 0.798566472530365, "num_tokens": 85170831.0, "step": 8220 }, { "entropy": 0.7300807297229767, "epoch": 0.06584, "grad_norm": 2.1064655780792236, "learning_rate": 4.6727090836334536e-05, "loss": 0.7394, "mean_token_accuracy": 0.79283806681633, "num_tokens": 85265641.0, "step": 8230 }, { "entropy": 0.7867015898227692, "epoch": 0.06592, "grad_norm": 3.4742276668548584, "learning_rate": 4.672308923569428e-05, "loss": 0.7762, "mean_token_accuracy": 0.7671640574932098, "num_tokens": 85407818.0, "step": 8240 }, { "entropy": 0.7450100779533386, "epoch": 0.066, "grad_norm": 6.311077117919922, "learning_rate": 4.6719087635054024e-05, "loss": 0.7445, "mean_token_accuracy": 0.8026830315589905, "num_tokens": 85448930.0, "step": 8250 }, { "entropy": 0.6654911935329437, "epoch": 0.06608, "grad_norm": 1.8397736549377441, "learning_rate": 4.671508603441377e-05, "loss": 0.6681, "mean_token_accuracy": 0.790445613861084, "num_tokens": 85611325.0, "step": 8260 }, { "entropy": 0.6941079556941986, "epoch": 0.06616, "grad_norm": 3.1907997131347656, "learning_rate": 4.671108443377351e-05, "loss": 0.6869, "mean_token_accuracy": 0.802847969532013, "num_tokens": 85693651.0, "step": 8270 }, { "entropy": 0.7097001492977142, "epoch": 0.06624, "grad_norm": 1.768875241279602, "learning_rate": 4.6707082833133255e-05, "loss": 0.7093, "mean_token_accuracy": 0.798986941576004, "num_tokens": 85788720.0, "step": 8280 }, { "entropy": 0.7299125611782074, "epoch": 0.06632, "grad_norm": 3.0256712436676025, "learning_rate": 4.6703081232493e-05, "loss": 0.7331, "mean_token_accuracy": 0.778785640001297, "num_tokens": 85927465.0, "step": 8290 }, { "entropy": 0.6742906630039215, "epoch": 0.0664, "grad_norm": 5.027505874633789, "learning_rate": 4.669907963185274e-05, "loss": 0.6685, "mean_token_accuracy": 0.8188278138637543, "num_tokens": 85965322.0, "step": 8300 }, { "entropy": 0.6925160467624665, "epoch": 0.06648, "grad_norm": 2.817662477493286, "learning_rate": 4.6695078031212486e-05, "loss": 0.6947, "mean_token_accuracy": 0.7847642958164215, "num_tokens": 86129162.0, "step": 8310 }, { "entropy": 0.7098952651023864, "epoch": 0.06656, "grad_norm": 3.6613826751708984, "learning_rate": 4.6691076430572236e-05, "loss": 0.7166, "mean_token_accuracy": 0.7900349020957946, "num_tokens": 86232063.0, "step": 8320 }, { "entropy": 0.7259746134281159, "epoch": 0.06664, "grad_norm": 1.7832838296890259, "learning_rate": 4.668707482993197e-05, "loss": 0.7185, "mean_token_accuracy": 0.792791199684143, "num_tokens": 86325682.0, "step": 8330 }, { "entropy": 0.6645788013935089, "epoch": 0.06672, "grad_norm": 1.9183413982391357, "learning_rate": 4.668307322929172e-05, "loss": 0.6607, "mean_token_accuracy": 0.792960649728775, "num_tokens": 86476371.0, "step": 8340 }, { "entropy": 0.6854094415903091, "epoch": 0.0668, "grad_norm": 5.124397277832031, "learning_rate": 4.667907162865146e-05, "loss": 0.6817, "mean_token_accuracy": 0.8166633903980255, "num_tokens": 86523587.0, "step": 8350 }, { "entropy": 0.6770333349704742, "epoch": 0.06688, "grad_norm": 2.826293468475342, "learning_rate": 4.667507002801121e-05, "loss": 0.6821, "mean_token_accuracy": 0.786119955778122, "num_tokens": 86686371.0, "step": 8360 }, { "entropy": 0.6905177950859069, "epoch": 0.06696, "grad_norm": 4.750245571136475, "learning_rate": 4.667106842737095e-05, "loss": 0.6812, "mean_token_accuracy": 0.8076915383338928, "num_tokens": 86760952.0, "step": 8370 }, { "entropy": 0.6964037895202637, "epoch": 0.06704, "grad_norm": 1.9080126285552979, "learning_rate": 4.666706682673069e-05, "loss": 0.7089, "mean_token_accuracy": 0.7953470408916473, "num_tokens": 86854133.0, "step": 8380 }, { "entropy": 0.7465195894241333, "epoch": 0.06712, "grad_norm": 3.559831380844116, "learning_rate": 4.666306522609044e-05, "loss": 0.7304, "mean_token_accuracy": 0.7842239320278168, "num_tokens": 86992098.0, "step": 8390 }, { "entropy": 0.7048337161540985, "epoch": 0.0672, "grad_norm": 4.3837103843688965, "learning_rate": 4.6659063625450186e-05, "loss": 0.7096, "mean_token_accuracy": 0.807261997461319, "num_tokens": 87031050.0, "step": 8400 }, { "entropy": 0.7340362906455994, "epoch": 0.06728, "grad_norm": 2.865579605102539, "learning_rate": 4.665506202480992e-05, "loss": 0.7298, "mean_token_accuracy": 0.7770212590694427, "num_tokens": 87194890.0, "step": 8410 }, { "entropy": 0.7270693987607956, "epoch": 0.06736, "grad_norm": 3.678105115890503, "learning_rate": 4.665106042416967e-05, "loss": 0.7308, "mean_token_accuracy": 0.7878587305545807, "num_tokens": 87292021.0, "step": 8420 }, { "entropy": 0.7666853487491607, "epoch": 0.06744, "grad_norm": 2.374932050704956, "learning_rate": 4.664705882352942e-05, "loss": 0.765, "mean_token_accuracy": 0.7837035179138183, "num_tokens": 87385647.0, "step": 8430 }, { "entropy": 0.7296348989009858, "epoch": 0.06752, "grad_norm": 3.984010696411133, "learning_rate": 4.664305722288916e-05, "loss": 0.7172, "mean_token_accuracy": 0.7811186730861663, "num_tokens": 87528805.0, "step": 8440 }, { "entropy": 0.6689713925123215, "epoch": 0.0676, "grad_norm": 5.902311325073242, "learning_rate": 4.66390556222489e-05, "loss": 0.679, "mean_token_accuracy": 0.8153033673763275, "num_tokens": 87569203.0, "step": 8450 }, { "entropy": 0.7459976255893708, "epoch": 0.06768, "grad_norm": 1.638167142868042, "learning_rate": 4.663505402160865e-05, "loss": 0.7457, "mean_token_accuracy": 0.7747557401657105, "num_tokens": 87733043.0, "step": 8460 }, { "entropy": 0.6732170283794403, "epoch": 0.06776, "grad_norm": 2.9607856273651123, "learning_rate": 4.663105242096839e-05, "loss": 0.6719, "mean_token_accuracy": 0.7999240398406983, "num_tokens": 87842045.0, "step": 8470 }, { "entropy": 0.7624109208583831, "epoch": 0.06784, "grad_norm": 2.361588478088379, "learning_rate": 4.6627050820328136e-05, "loss": 0.7631, "mean_token_accuracy": 0.7859266877174378, "num_tokens": 87939546.0, "step": 8480 }, { "entropy": 0.7142423272132874, "epoch": 0.06792, "grad_norm": 3.9273548126220703, "learning_rate": 4.662304921968787e-05, "loss": 0.7057, "mean_token_accuracy": 0.7884642302989959, "num_tokens": 88068941.0, "step": 8490 }, { "entropy": 0.7480208277702332, "epoch": 0.068, "grad_norm": 6.473973751068115, "learning_rate": 4.661904761904762e-05, "loss": 0.7604, "mean_token_accuracy": 0.8069252908229828, "num_tokens": 88102885.0, "step": 8500 }, { "entropy": 0.6898007690906525, "epoch": 0.06808, "grad_norm": 2.0262510776519775, "learning_rate": 4.661504601840737e-05, "loss": 0.6919, "mean_token_accuracy": 0.780946671962738, "num_tokens": 88266519.0, "step": 8510 }, { "entropy": 0.6479163646697998, "epoch": 0.06816, "grad_norm": 4.337863922119141, "learning_rate": 4.661104441776711e-05, "loss": 0.6421, "mean_token_accuracy": 0.8131748259067535, "num_tokens": 88354000.0, "step": 8520 }, { "entropy": 0.7399667203426361, "epoch": 0.06824, "grad_norm": 1.7906694412231445, "learning_rate": 4.660704281712685e-05, "loss": 0.7359, "mean_token_accuracy": 0.7909751534461975, "num_tokens": 88447733.0, "step": 8530 }, { "entropy": 0.7130682170391083, "epoch": 0.06832, "grad_norm": 2.7123894691467285, "learning_rate": 4.66030412164866e-05, "loss": 0.712, "mean_token_accuracy": 0.7864878714084625, "num_tokens": 88583871.0, "step": 8540 }, { "entropy": 0.6580867141485214, "epoch": 0.0684, "grad_norm": 5.407219409942627, "learning_rate": 4.659903961584634e-05, "loss": 0.6466, "mean_token_accuracy": 0.825834184885025, "num_tokens": 88625213.0, "step": 8550 }, { "entropy": 0.724755984544754, "epoch": 0.06848, "grad_norm": 1.9444553852081299, "learning_rate": 4.6595038015206085e-05, "loss": 0.7308, "mean_token_accuracy": 0.7748778760433197, "num_tokens": 88789053.0, "step": 8560 }, { "entropy": 0.6691206693649292, "epoch": 0.06856, "grad_norm": 4.5953874588012695, "learning_rate": 4.659103641456583e-05, "loss": 0.6649, "mean_token_accuracy": 0.8102380752563476, "num_tokens": 88883669.0, "step": 8570 }, { "entropy": 0.7379570364952087, "epoch": 0.06864, "grad_norm": 2.723522186279297, "learning_rate": 4.658703481392557e-05, "loss": 0.7278, "mean_token_accuracy": 0.7927578568458558, "num_tokens": 88979231.0, "step": 8580 }, { "entropy": 0.7281608700752258, "epoch": 0.06872, "grad_norm": 2.477212905883789, "learning_rate": 4.6583033213285317e-05, "loss": 0.7313, "mean_token_accuracy": 0.7771301805973053, "num_tokens": 89121031.0, "step": 8590 }, { "entropy": 0.6557321965694427, "epoch": 0.0688, "grad_norm": 5.758674144744873, "learning_rate": 4.657903161264506e-05, "loss": 0.6657, "mean_token_accuracy": 0.8219867706298828, "num_tokens": 89163065.0, "step": 8600 }, { "entropy": 0.6513987421989441, "epoch": 0.06888, "grad_norm": 2.358950138092041, "learning_rate": 4.6575030012004804e-05, "loss": 0.6516, "mean_token_accuracy": 0.7905440986156463, "num_tokens": 89325451.0, "step": 8610 }, { "entropy": 0.5951485365629197, "epoch": 0.06896, "grad_norm": 3.525047779083252, "learning_rate": 4.657102841136455e-05, "loss": 0.5958, "mean_token_accuracy": 0.8257729768753052, "num_tokens": 89394159.0, "step": 8620 }, { "entropy": 0.6708347797393799, "epoch": 0.06904, "grad_norm": 1.922377347946167, "learning_rate": 4.656702681072429e-05, "loss": 0.6776, "mean_token_accuracy": 0.801200944185257, "num_tokens": 89487063.0, "step": 8630 }, { "entropy": 0.7200921803712845, "epoch": 0.06912, "grad_norm": 2.907475471496582, "learning_rate": 4.6563025210084035e-05, "loss": 0.7083, "mean_token_accuracy": 0.7870403707027436, "num_tokens": 89629234.0, "step": 8640 }, { "entropy": 0.6712493151426315, "epoch": 0.0692, "grad_norm": 6.811583995819092, "learning_rate": 4.655902360944378e-05, "loss": 0.6738, "mean_token_accuracy": 0.8266464531421661, "num_tokens": 89666244.0, "step": 8650 }, { "entropy": 0.6441969513893128, "epoch": 0.06928, "grad_norm": 2.3508694171905518, "learning_rate": 4.655502200880352e-05, "loss": 0.6473, "mean_token_accuracy": 0.7940637946128846, "num_tokens": 89829946.0, "step": 8660 }, { "entropy": 0.7846232354640961, "epoch": 0.06936, "grad_norm": 3.9107422828674316, "learning_rate": 4.6551020408163266e-05, "loss": 0.7852, "mean_token_accuracy": 0.7826015293598175, "num_tokens": 89906476.0, "step": 8670 }, { "entropy": 0.7395306646823883, "epoch": 0.06944, "grad_norm": 2.2549591064453125, "learning_rate": 4.654701880752301e-05, "loss": 0.734, "mean_token_accuracy": 0.7887684881687165, "num_tokens": 90001550.0, "step": 8680 }, { "entropy": 0.7460441410541534, "epoch": 0.06952, "grad_norm": 2.553133249282837, "learning_rate": 4.6543017206882754e-05, "loss": 0.7296, "mean_token_accuracy": 0.7842717170715332, "num_tokens": 90127461.0, "step": 8690 }, { "entropy": 0.7258938431739808, "epoch": 0.0696, "grad_norm": 5.752545356750488, "learning_rate": 4.65390156062425e-05, "loss": 0.7225, "mean_token_accuracy": 0.8037122964859009, "num_tokens": 90162559.0, "step": 8700 }, { "entropy": 0.6883860468864441, "epoch": 0.06968, "grad_norm": 1.551190972328186, "learning_rate": 4.653501400560225e-05, "loss": 0.6923, "mean_token_accuracy": 0.7835918426513672, "num_tokens": 90326399.0, "step": 8710 }, { "entropy": 0.7106227695941925, "epoch": 0.06976, "grad_norm": 4.3134050369262695, "learning_rate": 4.6531012404961985e-05, "loss": 0.7022, "mean_token_accuracy": 0.7993316352367401, "num_tokens": 90413499.0, "step": 8720 }, { "entropy": 0.63023601770401, "epoch": 0.06984, "grad_norm": 1.810201644897461, "learning_rate": 4.652701080432173e-05, "loss": 0.6363, "mean_token_accuracy": 0.809520560503006, "num_tokens": 90507800.0, "step": 8730 }, { "entropy": 0.7628681540489197, "epoch": 0.06992, "grad_norm": 2.6300230026245117, "learning_rate": 4.652300920368147e-05, "loss": 0.7578, "mean_token_accuracy": 0.7755355834960938, "num_tokens": 90652365.0, "step": 8740 }, { "entropy": 0.6985868066549301, "epoch": 0.07, "grad_norm": 5.003227710723877, "learning_rate": 4.651900760304122e-05, "loss": 0.7004, "mean_token_accuracy": 0.8105223655700684, "num_tokens": 90697524.0, "step": 8750 }, { "entropy": 0.6909521996974946, "epoch": 0.07008, "grad_norm": 1.8125967979431152, "learning_rate": 4.651500600240096e-05, "loss": 0.6874, "mean_token_accuracy": 0.7859916985034943, "num_tokens": 90861364.0, "step": 8760 }, { "entropy": 0.6683136582374573, "epoch": 0.07016, "grad_norm": 3.269179105758667, "learning_rate": 4.6511004401760703e-05, "loss": 0.6623, "mean_token_accuracy": 0.8018004298210144, "num_tokens": 90962737.0, "step": 8770 }, { "entropy": 0.7237710177898407, "epoch": 0.07024, "grad_norm": 1.8880091905593872, "learning_rate": 4.6507002801120454e-05, "loss": 0.7449, "mean_token_accuracy": 0.7913466215133667, "num_tokens": 91056366.0, "step": 8780 }, { "entropy": 0.7442766010761261, "epoch": 0.07032, "grad_norm": 3.8060386180877686, "learning_rate": 4.65030012004802e-05, "loss": 0.727, "mean_token_accuracy": 0.7801315426826477, "num_tokens": 91189848.0, "step": 8790 }, { "entropy": 0.6381028324365616, "epoch": 0.0704, "grad_norm": 4.816007614135742, "learning_rate": 4.6498999599839935e-05, "loss": 0.6431, "mean_token_accuracy": 0.822293895483017, "num_tokens": 91227294.0, "step": 8800 }, { "entropy": 0.64343621134758, "epoch": 0.07048, "grad_norm": 2.3791327476501465, "learning_rate": 4.649499799919968e-05, "loss": 0.6501, "mean_token_accuracy": 0.794259887933731, "num_tokens": 91391134.0, "step": 8810 }, { "entropy": 0.7567229330539703, "epoch": 0.07056, "grad_norm": 3.5808491706848145, "learning_rate": 4.649099639855943e-05, "loss": 0.7539, "mean_token_accuracy": 0.7881084620952606, "num_tokens": 91480069.0, "step": 8820 }, { "entropy": 0.7266998201608658, "epoch": 0.07064, "grad_norm": 3.364776372909546, "learning_rate": 4.648699479791917e-05, "loss": 0.7252, "mean_token_accuracy": 0.7946669578552246, "num_tokens": 91574418.0, "step": 8830 }, { "entropy": 0.698409903049469, "epoch": 0.07072, "grad_norm": 2.2694621086120605, "learning_rate": 4.648299319727891e-05, "loss": 0.6933, "mean_token_accuracy": 0.786885803937912, "num_tokens": 91712395.0, "step": 8840 }, { "entropy": 0.7652063727378845, "epoch": 0.0708, "grad_norm": 4.792698860168457, "learning_rate": 4.647899159663866e-05, "loss": 0.7569, "mean_token_accuracy": 0.8028083980083466, "num_tokens": 91748820.0, "step": 8850 }, { "entropy": 0.6621902048587799, "epoch": 0.07088, "grad_norm": 1.5367298126220703, "learning_rate": 4.6474989995998404e-05, "loss": 0.6624, "mean_token_accuracy": 0.792659991979599, "num_tokens": 91912660.0, "step": 8860 }, { "entropy": 0.7420909702777863, "epoch": 0.07096, "grad_norm": 4.165455341339111, "learning_rate": 4.647098839535815e-05, "loss": 0.744, "mean_token_accuracy": 0.7939082026481629, "num_tokens": 92003434.0, "step": 8870 }, { "entropy": 0.798017019033432, "epoch": 0.07104, "grad_norm": 3.7451627254486084, "learning_rate": 4.6466986794717884e-05, "loss": 0.8014, "mean_token_accuracy": 0.7739807188510894, "num_tokens": 92099743.0, "step": 8880 }, { "entropy": 0.7195653915405273, "epoch": 0.07112, "grad_norm": 2.9914660453796387, "learning_rate": 4.6462985194077635e-05, "loss": 0.6996, "mean_token_accuracy": 0.7879240274429321, "num_tokens": 92239031.0, "step": 8890 }, { "entropy": 0.7132206380367279, "epoch": 0.0712, "grad_norm": 4.5119147300720215, "learning_rate": 4.645898359343738e-05, "loss": 0.7343, "mean_token_accuracy": 0.8050411939620972, "num_tokens": 92275316.0, "step": 8900 }, { "entropy": 0.7185623407363891, "epoch": 0.07128, "grad_norm": 2.186506748199463, "learning_rate": 4.645498199279712e-05, "loss": 0.7119, "mean_token_accuracy": 0.7801050424575806, "num_tokens": 92439156.0, "step": 8910 }, { "entropy": 0.7198186159133911, "epoch": 0.07136, "grad_norm": 3.578193426132202, "learning_rate": 4.6450980392156866e-05, "loss": 0.7163, "mean_token_accuracy": 0.7963639199733734, "num_tokens": 92519629.0, "step": 8920 }, { "entropy": 0.720028418302536, "epoch": 0.07144, "grad_norm": 1.7074960470199585, "learning_rate": 4.644697879151661e-05, "loss": 0.7303, "mean_token_accuracy": 0.793987387418747, "num_tokens": 92613442.0, "step": 8930 }, { "entropy": 0.7283935070037841, "epoch": 0.07152, "grad_norm": 2.633697986602783, "learning_rate": 4.644297719087635e-05, "loss": 0.7144, "mean_token_accuracy": 0.783449923992157, "num_tokens": 92752800.0, "step": 8940 }, { "entropy": 0.68905288875103, "epoch": 0.0716, "grad_norm": 5.864593505859375, "learning_rate": 4.64389755902361e-05, "loss": 0.6894, "mean_token_accuracy": 0.81552072763443, "num_tokens": 92788543.0, "step": 8950 }, { "entropy": 0.7258300602436065, "epoch": 0.07168, "grad_norm": 1.660001277923584, "learning_rate": 4.643497398959584e-05, "loss": 0.7276, "mean_token_accuracy": 0.7762396275997162, "num_tokens": 92952383.0, "step": 8960 }, { "entropy": 0.7124823093414306, "epoch": 0.07176, "grad_norm": 3.4718198776245117, "learning_rate": 4.6430972388955584e-05, "loss": 0.7102, "mean_token_accuracy": 0.7991422772407532, "num_tokens": 93040738.0, "step": 8970 }, { "entropy": 0.7454619705677032, "epoch": 0.07184, "grad_norm": 1.9094996452331543, "learning_rate": 4.642697078831533e-05, "loss": 0.7346, "mean_token_accuracy": 0.7891648411750793, "num_tokens": 93132914.0, "step": 8980 }, { "entropy": 0.751919949054718, "epoch": 0.07192, "grad_norm": 2.5978305339813232, "learning_rate": 4.642296918767507e-05, "loss": 0.7528, "mean_token_accuracy": 0.7737573862075806, "num_tokens": 93265536.0, "step": 8990 }, { "entropy": 0.6569688379764557, "epoch": 0.072, "grad_norm": 5.474598407745361, "learning_rate": 4.6418967587034816e-05, "loss": 0.6373, "mean_token_accuracy": 0.8243049860000611, "num_tokens": 93300876.0, "step": 9000 }, { "entropy": 0.6572144567966461, "epoch": 0.07208, "grad_norm": 1.8999278545379639, "learning_rate": 4.641496598639456e-05, "loss": 0.667, "mean_token_accuracy": 0.7935628175735474, "num_tokens": 93463067.0, "step": 9010 }, { "entropy": 0.7326251327991485, "epoch": 0.07216, "grad_norm": 5.746217250823975, "learning_rate": 4.64109643857543e-05, "loss": 0.7254, "mean_token_accuracy": 0.7984652280807495, "num_tokens": 93536441.0, "step": 9020 }, { "entropy": 0.7433971285820007, "epoch": 0.07224, "grad_norm": 2.1052215099334717, "learning_rate": 4.640696278511405e-05, "loss": 0.748, "mean_token_accuracy": 0.7919844806194305, "num_tokens": 93629544.0, "step": 9030 }, { "entropy": 0.6894863963127136, "epoch": 0.07232, "grad_norm": 2.92257022857666, "learning_rate": 4.640296118447379e-05, "loss": 0.6837, "mean_token_accuracy": 0.7902769565582275, "num_tokens": 93767161.0, "step": 9040 }, { "entropy": 0.7209660470485687, "epoch": 0.0724, "grad_norm": 5.1700968742370605, "learning_rate": 4.6398959583833534e-05, "loss": 0.7245, "mean_token_accuracy": 0.8077195703983306, "num_tokens": 93805524.0, "step": 9050 }, { "entropy": 0.6900577247142792, "epoch": 0.07248, "grad_norm": 2.1634137630462646, "learning_rate": 4.639495798319328e-05, "loss": 0.6911, "mean_token_accuracy": 0.7834391891956329, "num_tokens": 93969364.0, "step": 9060 }, { "entropy": 0.6696581542491913, "epoch": 0.07256, "grad_norm": 3.177516222000122, "learning_rate": 4.639095638255302e-05, "loss": 0.6719, "mean_token_accuracy": 0.8054692685604096, "num_tokens": 94054396.0, "step": 9070 }, { "entropy": 0.7083090960979461, "epoch": 0.07264, "grad_norm": 1.9252960681915283, "learning_rate": 4.6386954781912765e-05, "loss": 0.7068, "mean_token_accuracy": 0.8000052511692047, "num_tokens": 94149415.0, "step": 9080 }, { "entropy": 0.6690527498722076, "epoch": 0.07272, "grad_norm": 3.847144842147827, "learning_rate": 4.638295318127251e-05, "loss": 0.6613, "mean_token_accuracy": 0.7980054974555969, "num_tokens": 94286768.0, "step": 9090 }, { "entropy": 0.744106239080429, "epoch": 0.0728, "grad_norm": 6.895677089691162, "learning_rate": 4.637895158063226e-05, "loss": 0.7463, "mean_token_accuracy": 0.8059569478034974, "num_tokens": 94319525.0, "step": 9100 }, { "entropy": 0.6425414502620697, "epoch": 0.07288, "grad_norm": 2.516948938369751, "learning_rate": 4.6374949979991996e-05, "loss": 0.6445, "mean_token_accuracy": 0.7935484170913696, "num_tokens": 94483194.0, "step": 9110 }, { "entropy": 0.684095299243927, "epoch": 0.07296, "grad_norm": 4.124522686004639, "learning_rate": 4.637094837935174e-05, "loss": 0.6726, "mean_token_accuracy": 0.8078323721885681, "num_tokens": 94566220.0, "step": 9120 }, { "entropy": 0.7204775989055634, "epoch": 0.07304, "grad_norm": 1.5018067359924316, "learning_rate": 4.6366946778711484e-05, "loss": 0.7331, "mean_token_accuracy": 0.7958387076854706, "num_tokens": 94661142.0, "step": 9130 }, { "entropy": 0.6970223665237427, "epoch": 0.07312, "grad_norm": 2.389420509338379, "learning_rate": 4.6362945178071234e-05, "loss": 0.6838, "mean_token_accuracy": 0.7897837221622467, "num_tokens": 94798549.0, "step": 9140 }, { "entropy": 0.6714691162109375, "epoch": 0.0732, "grad_norm": 6.5991058349609375, "learning_rate": 4.635894357743097e-05, "loss": 0.686, "mean_token_accuracy": 0.815291041135788, "num_tokens": 94832748.0, "step": 9150 }, { "entropy": 0.6791327476501465, "epoch": 0.07328, "grad_norm": 2.047451972961426, "learning_rate": 4.6354941976790715e-05, "loss": 0.6799, "mean_token_accuracy": 0.7870153665542603, "num_tokens": 94996501.0, "step": 9160 }, { "entropy": 0.752552992105484, "epoch": 0.07336, "grad_norm": 3.739166736602783, "learning_rate": 4.6350940376150466e-05, "loss": 0.749, "mean_token_accuracy": 0.7882482469081878, "num_tokens": 95085796.0, "step": 9170 }, { "entropy": 0.7250829398632049, "epoch": 0.07344, "grad_norm": 2.0301687717437744, "learning_rate": 4.634693877551021e-05, "loss": 0.7247, "mean_token_accuracy": 0.7901110410690307, "num_tokens": 95181355.0, "step": 9180 }, { "entropy": 0.7323714196681976, "epoch": 0.07352, "grad_norm": 2.497694492340088, "learning_rate": 4.6342937174869946e-05, "loss": 0.7184, "mean_token_accuracy": 0.7800497591495514, "num_tokens": 95330700.0, "step": 9190 }, { "entropy": 0.6083283632993698, "epoch": 0.0736, "grad_norm": 5.745437145233154, "learning_rate": 4.633893557422969e-05, "loss": 0.6268, "mean_token_accuracy": 0.8230025410652161, "num_tokens": 95380869.0, "step": 9200 }, { "entropy": 0.6286138594150543, "epoch": 0.07368, "grad_norm": 2.191335678100586, "learning_rate": 4.633493397358944e-05, "loss": 0.6287, "mean_token_accuracy": 0.798393988609314, "num_tokens": 95544709.0, "step": 9210 }, { "entropy": 0.645444369316101, "epoch": 0.07376, "grad_norm": 4.100092887878418, "learning_rate": 4.6330932372949184e-05, "loss": 0.6264, "mean_token_accuracy": 0.8186423897743225, "num_tokens": 95627112.0, "step": 9220 }, { "entropy": 0.6887154281139374, "epoch": 0.07384, "grad_norm": 1.9781568050384521, "learning_rate": 4.632693077230892e-05, "loss": 0.7223, "mean_token_accuracy": 0.7937048316001892, "num_tokens": 95722060.0, "step": 9230 }, { "entropy": 0.7193709135055542, "epoch": 0.07392, "grad_norm": 2.347665309906006, "learning_rate": 4.632292917166867e-05, "loss": 0.7045, "mean_token_accuracy": 0.7852933466434479, "num_tokens": 95864937.0, "step": 9240 }, { "entropy": 0.756318798661232, "epoch": 0.074, "grad_norm": 5.206295013427734, "learning_rate": 4.6318927571028415e-05, "loss": 0.7402, "mean_token_accuracy": 0.8053412437438965, "num_tokens": 95903063.0, "step": 9250 }, { "entropy": 0.638420307636261, "epoch": 0.07408, "grad_norm": 1.5426478385925293, "learning_rate": 4.631492597038816e-05, "loss": 0.6465, "mean_token_accuracy": 0.7947545230388642, "num_tokens": 96066903.0, "step": 9260 }, { "entropy": 0.83851637840271, "epoch": 0.07416, "grad_norm": 3.332343816757202, "learning_rate": 4.6310924369747896e-05, "loss": 0.8274, "mean_token_accuracy": 0.7707083761692047, "num_tokens": 96149533.0, "step": 9270 }, { "entropy": 0.7394918084144593, "epoch": 0.07424, "grad_norm": 1.637984275817871, "learning_rate": 4.6306922769107646e-05, "loss": 0.7543, "mean_token_accuracy": 0.7873151183128357, "num_tokens": 96242644.0, "step": 9280 }, { "entropy": 0.7625670552253723, "epoch": 0.07432, "grad_norm": 2.8445935249328613, "learning_rate": 4.630292116846739e-05, "loss": 0.7556, "mean_token_accuracy": 0.7733691036701202, "num_tokens": 96375879.0, "step": 9290 }, { "entropy": 0.7586758375167847, "epoch": 0.0744, "grad_norm": 5.521483898162842, "learning_rate": 4.6298919567827134e-05, "loss": 0.7513, "mean_token_accuracy": 0.7976182341575623, "num_tokens": 96412557.0, "step": 9300 }, { "entropy": 0.7045974135398865, "epoch": 0.07448, "grad_norm": 2.0372469425201416, "learning_rate": 4.629491796718688e-05, "loss": 0.7097, "mean_token_accuracy": 0.7830107510089874, "num_tokens": 96574094.0, "step": 9310 }, { "entropy": 0.7064437389373779, "epoch": 0.07456, "grad_norm": 4.660027980804443, "learning_rate": 4.629091636654662e-05, "loss": 0.6967, "mean_token_accuracy": 0.8035681843757629, "num_tokens": 96644884.0, "step": 9320 }, { "entropy": 0.7236435651779175, "epoch": 0.07464, "grad_norm": 2.2348129749298096, "learning_rate": 4.6286914765906365e-05, "loss": 0.7234, "mean_token_accuracy": 0.7942578136920929, "num_tokens": 96738061.0, "step": 9330 }, { "entropy": 0.7118273377418518, "epoch": 0.07472, "grad_norm": 3.1476736068725586, "learning_rate": 4.628291316526611e-05, "loss": 0.72, "mean_token_accuracy": 0.7841964185237884, "num_tokens": 96877990.0, "step": 9340 }, { "entropy": 0.7326103270053863, "epoch": 0.0748, "grad_norm": 9.356379508972168, "learning_rate": 4.627891156462585e-05, "loss": 0.7135, "mean_token_accuracy": 0.8131058275699615, "num_tokens": 96914062.0, "step": 9350 }, { "entropy": 0.6879689991474152, "epoch": 0.07488, "grad_norm": 2.3290817737579346, "learning_rate": 4.6274909963985596e-05, "loss": 0.6919, "mean_token_accuracy": 0.782444304227829, "num_tokens": 97076744.0, "step": 9360 }, { "entropy": 0.6886475026607514, "epoch": 0.07496, "grad_norm": 4.053310871124268, "learning_rate": 4.627090836334534e-05, "loss": 0.6907, "mean_token_accuracy": 0.8031238675117492, "num_tokens": 97147497.0, "step": 9370 }, { "entropy": 0.7523693144321442, "epoch": 0.07504, "grad_norm": 1.9067676067352295, "learning_rate": 4.6266906762705084e-05, "loss": 0.7446, "mean_token_accuracy": 0.784957492351532, "num_tokens": 97241910.0, "step": 9380 }, { "entropy": 0.7136088728904724, "epoch": 0.07512, "grad_norm": 1.9757646322250366, "learning_rate": 4.626290516206483e-05, "loss": 0.7096, "mean_token_accuracy": 0.7817149102687836, "num_tokens": 97387220.0, "step": 9390 }, { "entropy": 0.6642134368419648, "epoch": 0.0752, "grad_norm": 5.103102207183838, "learning_rate": 4.625890356142457e-05, "loss": 0.6635, "mean_token_accuracy": 0.8181697189807892, "num_tokens": 97427966.0, "step": 9400 }, { "entropy": 0.6937807679176331, "epoch": 0.07528, "grad_norm": 1.7128000259399414, "learning_rate": 4.6254901960784315e-05, "loss": 0.6956, "mean_token_accuracy": 0.7825048923492431, "num_tokens": 97591806.0, "step": 9410 }, { "entropy": 0.6988496333360672, "epoch": 0.07536, "grad_norm": 4.036126136779785, "learning_rate": 4.625090036014406e-05, "loss": 0.7034, "mean_token_accuracy": 0.79894158244133, "num_tokens": 97680361.0, "step": 9420 }, { "entropy": 0.7219177186489105, "epoch": 0.07544, "grad_norm": 1.9431402683258057, "learning_rate": 4.62468987595038e-05, "loss": 0.7195, "mean_token_accuracy": 0.7947080969810486, "num_tokens": 97773963.0, "step": 9430 }, { "entropy": 0.7366402268409729, "epoch": 0.07552, "grad_norm": 3.3234386444091797, "learning_rate": 4.6242897158863546e-05, "loss": 0.7296, "mean_token_accuracy": 0.778108662366867, "num_tokens": 97916695.0, "step": 9440 }, { "entropy": 0.630741274356842, "epoch": 0.0756, "grad_norm": 5.161890029907227, "learning_rate": 4.6238895558223296e-05, "loss": 0.6482, "mean_token_accuracy": 0.8252800345420838, "num_tokens": 97952835.0, "step": 9450 }, { "entropy": 0.7032941102981567, "epoch": 0.07568, "grad_norm": 1.5649219751358032, "learning_rate": 4.623489395758303e-05, "loss": 0.6958, "mean_token_accuracy": 0.7831277489662171, "num_tokens": 98116675.0, "step": 9460 }, { "entropy": 0.6756620883941651, "epoch": 0.07576, "grad_norm": 3.399211883544922, "learning_rate": 4.623089235694278e-05, "loss": 0.6743, "mean_token_accuracy": 0.7987866997718811, "num_tokens": 98201159.0, "step": 9470 }, { "entropy": 0.7512216091156005, "epoch": 0.07584, "grad_norm": 2.7042577266693115, "learning_rate": 4.622689075630252e-05, "loss": 0.7537, "mean_token_accuracy": 0.7932135701179505, "num_tokens": 98293165.0, "step": 9480 }, { "entropy": 0.7645970046520233, "epoch": 0.07592, "grad_norm": 3.8871688842773438, "learning_rate": 4.622288915566227e-05, "loss": 0.7499, "mean_token_accuracy": 0.7806173264980316, "num_tokens": 98420717.0, "step": 9490 }, { "entropy": 0.6464944124221802, "epoch": 0.076, "grad_norm": 7.347275257110596, "learning_rate": 4.621888755502201e-05, "loss": 0.6502, "mean_token_accuracy": 0.8246745288372039, "num_tokens": 98459907.0, "step": 9500 }, { "entropy": 0.6770206749439239, "epoch": 0.07608, "grad_norm": 1.9257506132125854, "learning_rate": 4.621488595438175e-05, "loss": 0.6761, "mean_token_accuracy": 0.7873484194278717, "num_tokens": 98623719.0, "step": 9510 }, { "entropy": 0.7163217008113861, "epoch": 0.07616, "grad_norm": 3.707106828689575, "learning_rate": 4.62108843537415e-05, "loss": 0.7045, "mean_token_accuracy": 0.7973921775817872, "num_tokens": 98714254.0, "step": 9520 }, { "entropy": 0.70814288854599, "epoch": 0.07624, "grad_norm": 1.7619149684906006, "learning_rate": 4.6206882753101246e-05, "loss": 0.7177, "mean_token_accuracy": 0.7904400110244751, "num_tokens": 98808552.0, "step": 9530 }, { "entropy": 0.762034434080124, "epoch": 0.07632, "grad_norm": 3.032008409500122, "learning_rate": 4.620288115246098e-05, "loss": 0.7459, "mean_token_accuracy": 0.777660471200943, "num_tokens": 98943364.0, "step": 9540 }, { "entropy": 0.7846374154090882, "epoch": 0.0764, "grad_norm": 6.914165496826172, "learning_rate": 4.619887955182073e-05, "loss": 0.7864, "mean_token_accuracy": 0.7933871209621429, "num_tokens": 98980161.0, "step": 9550 }, { "entropy": 0.6520713210105896, "epoch": 0.07648, "grad_norm": 1.8531984090805054, "learning_rate": 4.619487795118048e-05, "loss": 0.6538, "mean_token_accuracy": 0.7921874821186066, "num_tokens": 99143939.0, "step": 9560 }, { "entropy": 0.7759733915328979, "epoch": 0.07656, "grad_norm": 4.085437297821045, "learning_rate": 4.619087635054022e-05, "loss": 0.7763, "mean_token_accuracy": 0.7822205126285553, "num_tokens": 99223721.0, "step": 9570 }, { "entropy": 0.6663265287876129, "epoch": 0.07664, "grad_norm": 2.3753581047058105, "learning_rate": 4.618687474989996e-05, "loss": 0.6651, "mean_token_accuracy": 0.8039907991886139, "num_tokens": 99318084.0, "step": 9580 }, { "entropy": 0.7492842137813568, "epoch": 0.07672, "grad_norm": 2.6339268684387207, "learning_rate": 4.61828731492597e-05, "loss": 0.7389, "mean_token_accuracy": 0.7830690681934357, "num_tokens": 99449629.0, "step": 9590 }, { "entropy": 0.7326230049133301, "epoch": 0.0768, "grad_norm": 5.846170425415039, "learning_rate": 4.617887154861945e-05, "loss": 0.7299, "mean_token_accuracy": 0.8047248542308807, "num_tokens": 99485854.0, "step": 9600 }, { "entropy": 0.6764203161001205, "epoch": 0.07688, "grad_norm": 1.9251611232757568, "learning_rate": 4.6174869947979196e-05, "loss": 0.676, "mean_token_accuracy": 0.7895459413528443, "num_tokens": 99648681.0, "step": 9610 }, { "entropy": 0.6605997115373612, "epoch": 0.07696, "grad_norm": 4.284296035766602, "learning_rate": 4.617086834733893e-05, "loss": 0.6501, "mean_token_accuracy": 0.8144646227359772, "num_tokens": 99722057.0, "step": 9620 }, { "entropy": 0.7078466713428497, "epoch": 0.07704, "grad_norm": 2.445702314376831, "learning_rate": 4.616686674669868e-05, "loss": 0.7285, "mean_token_accuracy": 0.797551280260086, "num_tokens": 99814567.0, "step": 9630 }, { "entropy": 0.7457213461399078, "epoch": 0.07712, "grad_norm": 2.5211164951324463, "learning_rate": 4.616286514605843e-05, "loss": 0.7407, "mean_token_accuracy": 0.7745398998260498, "num_tokens": 99966461.0, "step": 9640 }, { "entropy": 0.6885676980018616, "epoch": 0.0772, "grad_norm": 6.518370151519775, "learning_rate": 4.615886354541817e-05, "loss": 0.6913, "mean_token_accuracy": 0.8078333497047424, "num_tokens": 100013435.0, "step": 9650 }, { "entropy": 0.7131873667240143, "epoch": 0.07728, "grad_norm": 1.877037525177002, "learning_rate": 4.615486194477791e-05, "loss": 0.7055, "mean_token_accuracy": 0.7820469081401825, "num_tokens": 100177275.0, "step": 9660 }, { "entropy": 0.6009572058916092, "epoch": 0.07736, "grad_norm": 3.582157611846924, "learning_rate": 4.615086034413766e-05, "loss": 0.5976, "mean_token_accuracy": 0.8265214264392853, "num_tokens": 100257103.0, "step": 9670 }, { "entropy": 0.76646329164505, "epoch": 0.07744, "grad_norm": 1.854604959487915, "learning_rate": 4.61468587434974e-05, "loss": 0.7644, "mean_token_accuracy": 0.7836681425571441, "num_tokens": 100351895.0, "step": 9680 }, { "entropy": 0.710955661535263, "epoch": 0.07752, "grad_norm": 4.487539768218994, "learning_rate": 4.6142857142857145e-05, "loss": 0.7034, "mean_token_accuracy": 0.7879692196846009, "num_tokens": 100478274.0, "step": 9690 }, { "entropy": 0.6412195444107056, "epoch": 0.0776, "grad_norm": 5.209956645965576, "learning_rate": 4.613885554221689e-05, "loss": 0.6485, "mean_token_accuracy": 0.826359361410141, "num_tokens": 100515406.0, "step": 9700 }, { "entropy": 0.7090812087059021, "epoch": 0.07768, "grad_norm": 1.8579249382019043, "learning_rate": 4.613485394157663e-05, "loss": 0.7062, "mean_token_accuracy": 0.7791524291038513, "num_tokens": 100679246.0, "step": 9710 }, { "entropy": 0.7409132421016693, "epoch": 0.07776, "grad_norm": 3.4689183235168457, "learning_rate": 4.6130852340936377e-05, "loss": 0.7433, "mean_token_accuracy": 0.7908667385578155, "num_tokens": 100765914.0, "step": 9720 }, { "entropy": 0.7865496695041656, "epoch": 0.07784, "grad_norm": 2.317222833633423, "learning_rate": 4.612685074029612e-05, "loss": 0.7832, "mean_token_accuracy": 0.7816374838352204, "num_tokens": 100859234.0, "step": 9730 }, { "entropy": 0.7170082449913024, "epoch": 0.07792, "grad_norm": 3.051175355911255, "learning_rate": 4.6122849139655864e-05, "loss": 0.7171, "mean_token_accuracy": 0.7804364204406739, "num_tokens": 101005125.0, "step": 9740 }, { "entropy": 0.7838072061538697, "epoch": 0.078, "grad_norm": 4.849700450897217, "learning_rate": 4.611884753901561e-05, "loss": 0.787, "mean_token_accuracy": 0.7934091031551361, "num_tokens": 101043686.0, "step": 9750 }, { "entropy": 0.6430404365062714, "epoch": 0.07808, "grad_norm": 2.090224027633667, "learning_rate": 4.611484593837535e-05, "loss": 0.6413, "mean_token_accuracy": 0.7966353297233582, "num_tokens": 101207526.0, "step": 9760 }, { "entropy": 0.6684817343950271, "epoch": 0.07816, "grad_norm": 4.714359760284424, "learning_rate": 4.6110844337735095e-05, "loss": 0.6678, "mean_token_accuracy": 0.8098715782165528, "num_tokens": 101292228.0, "step": 9770 }, { "entropy": 0.6921248733997345, "epoch": 0.07824, "grad_norm": 1.637365698814392, "learning_rate": 4.610684273709484e-05, "loss": 0.7061, "mean_token_accuracy": 0.7982942044734955, "num_tokens": 101385571.0, "step": 9780 }, { "entropy": 0.7252784550189972, "epoch": 0.07832, "grad_norm": 2.611612558364868, "learning_rate": 4.610284113645458e-05, "loss": 0.7218, "mean_token_accuracy": 0.7857154905796051, "num_tokens": 101520096.0, "step": 9790 }, { "entropy": 0.6787501811981201, "epoch": 0.0784, "grad_norm": 5.861029148101807, "learning_rate": 4.6098839535814326e-05, "loss": 0.6652, "mean_token_accuracy": 0.8173645257949829, "num_tokens": 101555823.0, "step": 9800 }, { "entropy": 0.6985023260116577, "epoch": 0.07848, "grad_norm": 1.853561282157898, "learning_rate": 4.609483793517407e-05, "loss": 0.7032, "mean_token_accuracy": 0.7789125144481659, "num_tokens": 101719639.0, "step": 9810 }, { "entropy": 0.7591472357511521, "epoch": 0.07856, "grad_norm": 3.8510336875915527, "learning_rate": 4.6090836334533814e-05, "loss": 0.7491, "mean_token_accuracy": 0.7881970167160034, "num_tokens": 101804615.0, "step": 9820 }, { "entropy": 0.6677791953086853, "epoch": 0.07864, "grad_norm": 2.118070602416992, "learning_rate": 4.608683473389356e-05, "loss": 0.6796, "mean_token_accuracy": 0.8031384170055389, "num_tokens": 101900495.0, "step": 9830 }, { "entropy": 0.7207074344158173, "epoch": 0.07872, "grad_norm": 2.572760581970215, "learning_rate": 4.608283313325331e-05, "loss": 0.7148, "mean_token_accuracy": 0.7812585413455964, "num_tokens": 102041418.0, "step": 9840 }, { "entropy": 0.6808772802352905, "epoch": 0.0788, "grad_norm": 5.10487699508667, "learning_rate": 4.6078831532613045e-05, "loss": 0.6755, "mean_token_accuracy": 0.8142207980155944, "num_tokens": 102083890.0, "step": 9850 }, { "entropy": 0.6848615765571594, "epoch": 0.07888, "grad_norm": 2.265260696411133, "learning_rate": 4.607482993197279e-05, "loss": 0.684, "mean_token_accuracy": 0.7851795375347137, "num_tokens": 102247730.0, "step": 9860 }, { "entropy": 0.6513399630784988, "epoch": 0.07896, "grad_norm": 3.5025405883789062, "learning_rate": 4.607082833133253e-05, "loss": 0.6521, "mean_token_accuracy": 0.8123735010623931, "num_tokens": 102324310.0, "step": 9870 }, { "entropy": 0.6971183121204376, "epoch": 0.07904, "grad_norm": 2.314992904663086, "learning_rate": 4.606682673069228e-05, "loss": 0.7024, "mean_token_accuracy": 0.8028412818908691, "num_tokens": 102417016.0, "step": 9880 }, { "entropy": 0.7008150815963745, "epoch": 0.07912, "grad_norm": 2.3880701065063477, "learning_rate": 4.606282513005202e-05, "loss": 0.6942, "mean_token_accuracy": 0.783137583732605, "num_tokens": 102563184.0, "step": 9890 }, { "entropy": 0.7316648989915848, "epoch": 0.0792, "grad_norm": 5.6370720863342285, "learning_rate": 4.605882352941176e-05, "loss": 0.7163, "mean_token_accuracy": 0.815084058046341, "num_tokens": 102601078.0, "step": 9900 }, { "entropy": 0.694990199804306, "epoch": 0.07928, "grad_norm": 1.9208887815475464, "learning_rate": 4.6054821928771514e-05, "loss": 0.7022, "mean_token_accuracy": 0.7861260414123535, "num_tokens": 102764918.0, "step": 9910 }, { "entropy": 0.6731528639793396, "epoch": 0.07936, "grad_norm": 3.8424344062805176, "learning_rate": 4.605082032813126e-05, "loss": 0.667, "mean_token_accuracy": 0.8094605445861817, "num_tokens": 102852175.0, "step": 9920 }, { "entropy": 0.6984025239944458, "epoch": 0.07944, "grad_norm": 1.529535174369812, "learning_rate": 4.6046818727490995e-05, "loss": 0.7132, "mean_token_accuracy": 0.8009831607341766, "num_tokens": 102945822.0, "step": 9930 }, { "entropy": 0.7294781148433686, "epoch": 0.07952, "grad_norm": 2.398836374282837, "learning_rate": 4.604281712685074e-05, "loss": 0.7072, "mean_token_accuracy": 0.7842584252357483, "num_tokens": 103086749.0, "step": 9940 }, { "entropy": 0.5989655286073685, "epoch": 0.0796, "grad_norm": 8.518362045288086, "learning_rate": 4.603881552621049e-05, "loss": 0.6162, "mean_token_accuracy": 0.8305233776569366, "num_tokens": 103127709.0, "step": 9950 }, { "entropy": 0.6958981871604919, "epoch": 0.07968, "grad_norm": 2.7393627166748047, "learning_rate": 4.603481392557023e-05, "loss": 0.7009, "mean_token_accuracy": 0.7821507155895233, "num_tokens": 103291549.0, "step": 9960 }, { "entropy": 0.7741911947727204, "epoch": 0.07976, "grad_norm": 3.244809150695801, "learning_rate": 4.603081232492997e-05, "loss": 0.7625, "mean_token_accuracy": 0.7855052590370178, "num_tokens": 103383819.0, "step": 9970 }, { "entropy": 0.6996681988239288, "epoch": 0.07984, "grad_norm": 1.7609846591949463, "learning_rate": 4.602681072428972e-05, "loss": 0.706, "mean_token_accuracy": 0.7970048904418945, "num_tokens": 103476930.0, "step": 9980 }, { "entropy": 0.6735809803009033, "epoch": 0.07992, "grad_norm": 2.4317378997802734, "learning_rate": 4.6022809123649464e-05, "loss": 0.6724, "mean_token_accuracy": 0.7929690957069397, "num_tokens": 103624932.0, "step": 9990 }, { "entropy": 0.745320999622345, "epoch": 0.08, "grad_norm": 7.106207847595215, "learning_rate": 4.601880752300921e-05, "loss": 0.7394, "mean_token_accuracy": 0.801680839061737, "num_tokens": 103668696.0, "step": 10000 }, { "entropy": 0.7022673487663269, "epoch": 0.08008, "grad_norm": 1.8271833658218384, "learning_rate": 4.6014805922368944e-05, "loss": 0.7018, "mean_token_accuracy": 0.7830726087093354, "num_tokens": 103830674.0, "step": 10010 }, { "entropy": 0.7266414046287537, "epoch": 0.08016, "grad_norm": 3.834561347961426, "learning_rate": 4.6010804321728695e-05, "loss": 0.7273, "mean_token_accuracy": 0.7943147361278534, "num_tokens": 103908814.0, "step": 10020 }, { "entropy": 0.7068132281303405, "epoch": 0.08024, "grad_norm": 1.5965089797973633, "learning_rate": 4.600680272108844e-05, "loss": 0.713, "mean_token_accuracy": 0.7973109483718872, "num_tokens": 104002638.0, "step": 10030 }, { "entropy": 0.7257350802421569, "epoch": 0.08032, "grad_norm": 2.7529826164245605, "learning_rate": 4.600280112044818e-05, "loss": 0.7175, "mean_token_accuracy": 0.7785174429416657, "num_tokens": 104146090.0, "step": 10040 }, { "entropy": 0.6921116650104523, "epoch": 0.0804, "grad_norm": 5.981126308441162, "learning_rate": 4.5998799519807926e-05, "loss": 0.6834, "mean_token_accuracy": 0.8186652779579162, "num_tokens": 104184152.0, "step": 10050 }, { "entropy": 0.6723793625831604, "epoch": 0.08048, "grad_norm": 1.8514339923858643, "learning_rate": 4.599479791916767e-05, "loss": 0.6741, "mean_token_accuracy": 0.7890489876270295, "num_tokens": 104347964.0, "step": 10060 }, { "entropy": 0.658343517780304, "epoch": 0.08056, "grad_norm": 2.917912244796753, "learning_rate": 4.599079631852741e-05, "loss": 0.6573, "mean_token_accuracy": 0.8079505980014801, "num_tokens": 104429907.0, "step": 10070 }, { "entropy": 0.677286434173584, "epoch": 0.08064, "grad_norm": 1.7486573457717896, "learning_rate": 4.598679471788716e-05, "loss": 0.6787, "mean_token_accuracy": 0.8042525053024292, "num_tokens": 104523818.0, "step": 10080 }, { "entropy": 0.7530473887920379, "epoch": 0.08072, "grad_norm": 2.104301929473877, "learning_rate": 4.59827931172469e-05, "loss": 0.7498, "mean_token_accuracy": 0.7806221604347229, "num_tokens": 104650411.0, "step": 10090 }, { "entropy": 0.7710002273321152, "epoch": 0.0808, "grad_norm": 5.960721969604492, "learning_rate": 4.5978791516606644e-05, "loss": 0.7747, "mean_token_accuracy": 0.7967604100704193, "num_tokens": 104684792.0, "step": 10100 }, { "entropy": 0.7194268763065338, "epoch": 0.08088, "grad_norm": 2.7071869373321533, "learning_rate": 4.597478991596639e-05, "loss": 0.7161, "mean_token_accuracy": 0.7759892582893372, "num_tokens": 104848632.0, "step": 10110 }, { "entropy": 0.714184832572937, "epoch": 0.08096, "grad_norm": 2.8163232803344727, "learning_rate": 4.597078831532614e-05, "loss": 0.7143, "mean_token_accuracy": 0.793945825099945, "num_tokens": 104949118.0, "step": 10120 }, { "entropy": 0.6874721705913543, "epoch": 0.08104, "grad_norm": 2.1688809394836426, "learning_rate": 4.5966786714685876e-05, "loss": 0.6863, "mean_token_accuracy": 0.799208652973175, "num_tokens": 105043169.0, "step": 10130 }, { "entropy": 0.7554157316684723, "epoch": 0.08112, "grad_norm": 2.563802719116211, "learning_rate": 4.596278511404562e-05, "loss": 0.7486, "mean_token_accuracy": 0.7784672558307648, "num_tokens": 105188547.0, "step": 10140 }, { "entropy": 0.6584446161985398, "epoch": 0.0812, "grad_norm": 6.580122947692871, "learning_rate": 4.595878351340536e-05, "loss": 0.6533, "mean_token_accuracy": 0.8274668753147125, "num_tokens": 105233079.0, "step": 10150 }, { "entropy": 0.6833945155143738, "epoch": 0.08128, "grad_norm": 2.580298662185669, "learning_rate": 4.5954781912765113e-05, "loss": 0.6882, "mean_token_accuracy": 0.7876587748527527, "num_tokens": 105396919.0, "step": 10160 }, { "entropy": 0.6741097748279572, "epoch": 0.08136, "grad_norm": 4.195068836212158, "learning_rate": 4.595078031212485e-05, "loss": 0.661, "mean_token_accuracy": 0.8094602465629578, "num_tokens": 105484071.0, "step": 10170 }, { "entropy": 0.6917600274085999, "epoch": 0.08144, "grad_norm": 1.9949463605880737, "learning_rate": 4.5946778711484594e-05, "loss": 0.711, "mean_token_accuracy": 0.7980820536613464, "num_tokens": 105577639.0, "step": 10180 }, { "entropy": 0.7551709532737731, "epoch": 0.08152, "grad_norm": 2.4536430835723877, "learning_rate": 4.594277711084434e-05, "loss": 0.7497, "mean_token_accuracy": 0.7747670888900757, "num_tokens": 105723239.0, "step": 10190 }, { "entropy": 0.7140199661254882, "epoch": 0.0816, "grad_norm": 4.447483539581299, "learning_rate": 4.593877551020409e-05, "loss": 0.7046, "mean_token_accuracy": 0.8089093387126922, "num_tokens": 105763238.0, "step": 10200 }, { "entropy": 0.6680365443229676, "epoch": 0.08168, "grad_norm": 2.3726680278778076, "learning_rate": 4.5934773909563825e-05, "loss": 0.6659, "mean_token_accuracy": 0.7875122189521789, "num_tokens": 105927078.0, "step": 10210 }, { "entropy": 0.6786614179611206, "epoch": 0.08176, "grad_norm": 3.609652042388916, "learning_rate": 4.593077230892357e-05, "loss": 0.6741, "mean_token_accuracy": 0.8053334891796112, "num_tokens": 106012131.0, "step": 10220 }, { "entropy": 0.7156066596508026, "epoch": 0.08184, "grad_norm": 1.682012915611267, "learning_rate": 4.592677070828332e-05, "loss": 0.7417, "mean_token_accuracy": 0.7903103530406952, "num_tokens": 106107249.0, "step": 10230 }, { "entropy": 0.7643202662467956, "epoch": 0.08192, "grad_norm": 3.225839376449585, "learning_rate": 4.592276910764306e-05, "loss": 0.7411, "mean_token_accuracy": 0.7765810489654541, "num_tokens": 106250330.0, "step": 10240 }, { "entropy": 0.6490605443716049, "epoch": 0.082, "grad_norm": 6.873386383056641, "learning_rate": 4.59187675070028e-05, "loss": 0.6505, "mean_token_accuracy": 0.8235881268978119, "num_tokens": 106294792.0, "step": 10250 }, { "entropy": 0.6757401347160339, "epoch": 0.08208, "grad_norm": 1.6158409118652344, "learning_rate": 4.5914765906362544e-05, "loss": 0.68, "mean_token_accuracy": 0.7902051866054535, "num_tokens": 106458632.0, "step": 10260 }, { "entropy": 0.6643137276172638, "epoch": 0.08216, "grad_norm": 4.187957286834717, "learning_rate": 4.5910764305722294e-05, "loss": 0.6584, "mean_token_accuracy": 0.812504893541336, "num_tokens": 106532424.0, "step": 10270 }, { "entropy": 0.7140329360961915, "epoch": 0.08224, "grad_norm": 1.5125900506973267, "learning_rate": 4.590676270508204e-05, "loss": 0.7215, "mean_token_accuracy": 0.7884990572929382, "num_tokens": 106624312.0, "step": 10280 }, { "entropy": 0.7363921225070953, "epoch": 0.08232, "grad_norm": 2.4664008617401123, "learning_rate": 4.5902761104441775e-05, "loss": 0.7355, "mean_token_accuracy": 0.7777271687984466, "num_tokens": 106751304.0, "step": 10290 }, { "entropy": 0.7243607699871063, "epoch": 0.0824, "grad_norm": 5.382936000823975, "learning_rate": 4.5898759503801525e-05, "loss": 0.7213, "mean_token_accuracy": 0.8107318043708801, "num_tokens": 106785365.0, "step": 10300 }, { "entropy": 0.7168324649333954, "epoch": 0.08248, "grad_norm": 1.7370671033859253, "learning_rate": 4.589475790316127e-05, "loss": 0.7246, "mean_token_accuracy": 0.7786179900169372, "num_tokens": 106948949.0, "step": 10310 }, { "entropy": 0.6786497175693512, "epoch": 0.08256, "grad_norm": 3.934882164001465, "learning_rate": 4.589075630252101e-05, "loss": 0.6641, "mean_token_accuracy": 0.8117300570011139, "num_tokens": 107019722.0, "step": 10320 }, { "entropy": 0.710988563299179, "epoch": 0.08264, "grad_norm": 1.7960917949676514, "learning_rate": 4.588675470188075e-05, "loss": 0.7206, "mean_token_accuracy": 0.79371777176857, "num_tokens": 107112919.0, "step": 10330 }, { "entropy": 0.7140957534313201, "epoch": 0.08272, "grad_norm": 2.8507111072540283, "learning_rate": 4.58827531012405e-05, "loss": 0.7164, "mean_token_accuracy": 0.7805655360221863, "num_tokens": 107261389.0, "step": 10340 }, { "entropy": 0.7271079272031784, "epoch": 0.0828, "grad_norm": 5.390942096710205, "learning_rate": 4.5878751500600244e-05, "loss": 0.7137, "mean_token_accuracy": 0.8076916515827179, "num_tokens": 107299261.0, "step": 10350 }, { "entropy": 0.6923181235790252, "epoch": 0.08288, "grad_norm": 1.5063472986221313, "learning_rate": 4.587474989995999e-05, "loss": 0.6925, "mean_token_accuracy": 0.7829323470592499, "num_tokens": 107463101.0, "step": 10360 }, { "entropy": 0.6796900629997253, "epoch": 0.08296, "grad_norm": 3.971818447113037, "learning_rate": 4.587074829931973e-05, "loss": 0.6838, "mean_token_accuracy": 0.8009882092475891, "num_tokens": 107550536.0, "step": 10370 }, { "entropy": 0.6859371155500412, "epoch": 0.08304, "grad_norm": 1.8012064695358276, "learning_rate": 4.5866746698679475e-05, "loss": 0.6977, "mean_token_accuracy": 0.7978113055229187, "num_tokens": 107645779.0, "step": 10380 }, { "entropy": 0.7312745153903961, "epoch": 0.08312, "grad_norm": 2.810706853866577, "learning_rate": 4.586274509803922e-05, "loss": 0.7232, "mean_token_accuracy": 0.7815077245235443, "num_tokens": 107793469.0, "step": 10390 }, { "entropy": 0.7370361924171448, "epoch": 0.0832, "grad_norm": 6.125798225402832, "learning_rate": 4.585874349739896e-05, "loss": 0.7335, "mean_token_accuracy": 0.8065164506435394, "num_tokens": 107836628.0, "step": 10400 }, { "entropy": 0.6965038061141968, "epoch": 0.08328, "grad_norm": 2.4048197269439697, "learning_rate": 4.5854741896758706e-05, "loss": 0.691, "mean_token_accuracy": 0.7835002541542053, "num_tokens": 108000468.0, "step": 10410 }, { "entropy": 0.7072629332542419, "epoch": 0.08336, "grad_norm": 2.8728559017181396, "learning_rate": 4.585074029611845e-05, "loss": 0.6974, "mean_token_accuracy": 0.7960815072059632, "num_tokens": 108090116.0, "step": 10420 }, { "entropy": 0.6756595015525818, "epoch": 0.08344, "grad_norm": 1.951561450958252, "learning_rate": 4.5846738695478194e-05, "loss": 0.7068, "mean_token_accuracy": 0.7965631902217865, "num_tokens": 108185647.0, "step": 10430 }, { "entropy": 0.7618992686271667, "epoch": 0.08352, "grad_norm": 2.8672091960906982, "learning_rate": 4.584273709483794e-05, "loss": 0.7381, "mean_token_accuracy": 0.7772447288036346, "num_tokens": 108330447.0, "step": 10440 }, { "entropy": 0.6609963238239288, "epoch": 0.0836, "grad_norm": 5.618129730224609, "learning_rate": 4.583873549419768e-05, "loss": 0.6696, "mean_token_accuracy": 0.8172559559345245, "num_tokens": 108368169.0, "step": 10450 }, { "entropy": 0.6650412321090698, "epoch": 0.08368, "grad_norm": 2.0833630561828613, "learning_rate": 4.5834733893557425e-05, "loss": 0.6668, "mean_token_accuracy": 0.7881900310516358, "num_tokens": 108532009.0, "step": 10460 }, { "entropy": 0.6695364534854888, "epoch": 0.08376, "grad_norm": 3.8197109699249268, "learning_rate": 4.583073229291717e-05, "loss": 0.6605, "mean_token_accuracy": 0.8080940067768096, "num_tokens": 108621291.0, "step": 10470 }, { "entropy": 0.7288743436336518, "epoch": 0.08384, "grad_norm": 2.8633625507354736, "learning_rate": 4.582673069227691e-05, "loss": 0.7504, "mean_token_accuracy": 0.7847785890102387, "num_tokens": 108715131.0, "step": 10480 }, { "entropy": 0.7730857551097869, "epoch": 0.08392, "grad_norm": 3.280045986175537, "learning_rate": 4.5822729091636656e-05, "loss": 0.7449, "mean_token_accuracy": 0.7805470705032349, "num_tokens": 108844134.0, "step": 10490 }, { "entropy": 0.7492175996303558, "epoch": 0.084, "grad_norm": 6.1494526863098145, "learning_rate": 4.58187274909964e-05, "loss": 0.7528, "mean_token_accuracy": 0.7972560942173004, "num_tokens": 108877754.0, "step": 10500 }, { "entropy": 0.6253725171089173, "epoch": 0.08408, "grad_norm": 2.10258412361145, "learning_rate": 4.581472589035615e-05, "loss": 0.6299, "mean_token_accuracy": 0.8013739645481109, "num_tokens": 109041594.0, "step": 10510 }, { "entropy": 0.7079167425632477, "epoch": 0.08416, "grad_norm": 2.934964656829834, "learning_rate": 4.581072428971589e-05, "loss": 0.6967, "mean_token_accuracy": 0.7957953095436097, "num_tokens": 109136611.0, "step": 10520 }, { "entropy": 0.7396998226642608, "epoch": 0.08424, "grad_norm": 1.9658966064453125, "learning_rate": 4.580672268907563e-05, "loss": 0.741, "mean_token_accuracy": 0.7905198812484742, "num_tokens": 109230995.0, "step": 10530 }, { "entropy": 0.7092145144939422, "epoch": 0.08432, "grad_norm": 3.54482102394104, "learning_rate": 4.5802721088435375e-05, "loss": 0.7147, "mean_token_accuracy": 0.7855138301849365, "num_tokens": 109365555.0, "step": 10540 }, { "entropy": 0.7125154733657837, "epoch": 0.0844, "grad_norm": 5.567950248718262, "learning_rate": 4.5798719487795125e-05, "loss": 0.7281, "mean_token_accuracy": 0.8054930806159973, "num_tokens": 109405491.0, "step": 10550 }, { "entropy": 0.6990639805793762, "epoch": 0.08448, "grad_norm": 2.2845232486724854, "learning_rate": 4.579471788715486e-05, "loss": 0.6856, "mean_token_accuracy": 0.7857169151306153, "num_tokens": 109569331.0, "step": 10560 }, { "entropy": 0.7440461993217469, "epoch": 0.08456, "grad_norm": 4.936113357543945, "learning_rate": 4.5790716286514606e-05, "loss": 0.7526, "mean_token_accuracy": 0.7881588935852051, "num_tokens": 109657556.0, "step": 10570 }, { "entropy": 0.7026799380779266, "epoch": 0.08464, "grad_norm": 1.9371514320373535, "learning_rate": 4.5786714685874356e-05, "loss": 0.7068, "mean_token_accuracy": 0.798276299238205, "num_tokens": 109749972.0, "step": 10580 }, { "entropy": 0.7305671274662018, "epoch": 0.08472, "grad_norm": 3.132417678833008, "learning_rate": 4.57827130852341e-05, "loss": 0.7157, "mean_token_accuracy": 0.7801746666431427, "num_tokens": 109899948.0, "step": 10590 }, { "entropy": 0.7474254488945007, "epoch": 0.0848, "grad_norm": 6.380549907684326, "learning_rate": 4.577871148459384e-05, "loss": 0.7568, "mean_token_accuracy": 0.7946566581726074, "num_tokens": 109940437.0, "step": 10600 }, { "entropy": 0.6717114865779876, "epoch": 0.08488, "grad_norm": 1.9978364706039429, "learning_rate": 4.577470988395358e-05, "loss": 0.6744, "mean_token_accuracy": 0.7857352316379547, "num_tokens": 110104277.0, "step": 10610 }, { "entropy": 0.7322629630565644, "epoch": 0.08496, "grad_norm": 3.2126245498657227, "learning_rate": 4.577070828331333e-05, "loss": 0.7177, "mean_token_accuracy": 0.7961398839950562, "num_tokens": 110191053.0, "step": 10620 }, { "entropy": 0.6886064291000367, "epoch": 0.08504, "grad_norm": 2.084638833999634, "learning_rate": 4.5766706682673075e-05, "loss": 0.7095, "mean_token_accuracy": 0.7948743402957916, "num_tokens": 110285427.0, "step": 10630 }, { "entropy": 0.7399132311344147, "epoch": 0.08512, "grad_norm": 2.393977165222168, "learning_rate": 4.576270508203281e-05, "loss": 0.7249, "mean_token_accuracy": 0.7780716955661774, "num_tokens": 110421325.0, "step": 10640 }, { "entropy": 0.7154446810483932, "epoch": 0.0852, "grad_norm": 4.685512065887451, "learning_rate": 4.575870348139256e-05, "loss": 0.719, "mean_token_accuracy": 0.8065235495567322, "num_tokens": 110462283.0, "step": 10650 }, { "entropy": 0.6740082740783692, "epoch": 0.08528, "grad_norm": 1.9084044694900513, "learning_rate": 4.5754701880752306e-05, "loss": 0.6787, "mean_token_accuracy": 0.7849170327186584, "num_tokens": 110626080.0, "step": 10660 }, { "entropy": 0.6796879172325134, "epoch": 0.08536, "grad_norm": 2.9920742511749268, "learning_rate": 4.575070028011205e-05, "loss": 0.6665, "mean_token_accuracy": 0.8016762256622314, "num_tokens": 110722529.0, "step": 10670 }, { "entropy": 0.652401077747345, "epoch": 0.08544, "grad_norm": 2.0401456356048584, "learning_rate": 4.5746698679471787e-05, "loss": 0.6477, "mean_token_accuracy": 0.8116915702819825, "num_tokens": 110817908.0, "step": 10680 }, { "entropy": 0.727421635389328, "epoch": 0.08552, "grad_norm": 2.955482006072998, "learning_rate": 4.574269707883154e-05, "loss": 0.7344, "mean_token_accuracy": 0.7789873301982879, "num_tokens": 110951583.0, "step": 10690 }, { "entropy": 0.7191871166229248, "epoch": 0.0856, "grad_norm": 5.782371520996094, "learning_rate": 4.573869547819128e-05, "loss": 0.7035, "mean_token_accuracy": 0.814867126941681, "num_tokens": 110988795.0, "step": 10700 }, { "entropy": 0.692177700996399, "epoch": 0.08568, "grad_norm": 1.826768159866333, "learning_rate": 4.5734693877551025e-05, "loss": 0.6929, "mean_token_accuracy": 0.7824525535106659, "num_tokens": 111152070.0, "step": 10710 }, { "entropy": 0.6706438511610031, "epoch": 0.08576, "grad_norm": 3.2727510929107666, "learning_rate": 4.573069227691076e-05, "loss": 0.6724, "mean_token_accuracy": 0.8039351344108582, "num_tokens": 111233229.0, "step": 10720 }, { "entropy": 0.6884538292884826, "epoch": 0.08584, "grad_norm": 2.192687511444092, "learning_rate": 4.572669067627051e-05, "loss": 0.6871, "mean_token_accuracy": 0.8005156338214874, "num_tokens": 111326612.0, "step": 10730 }, { "entropy": 0.7209647476673127, "epoch": 0.08592, "grad_norm": 2.6607611179351807, "learning_rate": 4.5722689075630256e-05, "loss": 0.7151, "mean_token_accuracy": 0.7856633543968201, "num_tokens": 111451636.0, "step": 10740 }, { "entropy": 0.7647327274084091, "epoch": 0.086, "grad_norm": 5.21378755569458, "learning_rate": 4.571868747499e-05, "loss": 0.7726, "mean_token_accuracy": 0.8007673740386962, "num_tokens": 111488673.0, "step": 10750 }, { "entropy": 0.7007787942886352, "epoch": 0.08608, "grad_norm": 1.969044804573059, "learning_rate": 4.571468587434974e-05, "loss": 0.6994, "mean_token_accuracy": 0.7809704840183258, "num_tokens": 111652453.0, "step": 10760 }, { "entropy": 0.6899629890918731, "epoch": 0.08616, "grad_norm": 3.37319016456604, "learning_rate": 4.571068427370949e-05, "loss": 0.6884, "mean_token_accuracy": 0.8101143777370453, "num_tokens": 111730047.0, "step": 10770 }, { "entropy": 0.7180256366729736, "epoch": 0.08624, "grad_norm": 1.8003202676773071, "learning_rate": 4.570668267306923e-05, "loss": 0.7031, "mean_token_accuracy": 0.8014287054538727, "num_tokens": 111822765.0, "step": 10780 }, { "entropy": 0.6815264046192169, "epoch": 0.08632, "grad_norm": 2.2614076137542725, "learning_rate": 4.5702681072428974e-05, "loss": 0.6804, "mean_token_accuracy": 0.7904721796512604, "num_tokens": 111968684.0, "step": 10790 }, { "entropy": 0.6703069031238555, "epoch": 0.0864, "grad_norm": 4.296823978424072, "learning_rate": 4.569867947178872e-05, "loss": 0.6623, "mean_token_accuracy": 0.8125728964805603, "num_tokens": 112013230.0, "step": 10800 }, { "entropy": 0.6835310101509094, "epoch": 0.08648, "grad_norm": 2.0710597038269043, "learning_rate": 4.569467787114846e-05, "loss": 0.6781, "mean_token_accuracy": 0.7868099749088288, "num_tokens": 112177070.0, "step": 10810 }, { "entropy": 0.7110952883958817, "epoch": 0.08656, "grad_norm": 4.305867671966553, "learning_rate": 4.5690676270508205e-05, "loss": 0.7046, "mean_token_accuracy": 0.8011234998703003, "num_tokens": 112262107.0, "step": 10820 }, { "entropy": 0.7141198337078094, "epoch": 0.08664, "grad_norm": 1.8510377407073975, "learning_rate": 4.568667466986795e-05, "loss": 0.7267, "mean_token_accuracy": 0.7980907082557678, "num_tokens": 112356629.0, "step": 10830 }, { "entropy": 0.7774844229221344, "epoch": 0.08672, "grad_norm": 3.090606451034546, "learning_rate": 4.568267306922769e-05, "loss": 0.7628, "mean_token_accuracy": 0.7751616835594177, "num_tokens": 112490556.0, "step": 10840 }, { "entropy": 0.7761751115322113, "epoch": 0.0868, "grad_norm": 5.194055080413818, "learning_rate": 4.5678671468587436e-05, "loss": 0.7771, "mean_token_accuracy": 0.7928663909435272, "num_tokens": 112529015.0, "step": 10850 }, { "entropy": 0.6478338479995728, "epoch": 0.08688, "grad_norm": 2.069981813430786, "learning_rate": 4.567466986794718e-05, "loss": 0.6452, "mean_token_accuracy": 0.7939301431179047, "num_tokens": 112692855.0, "step": 10860 }, { "entropy": 0.6681849479675293, "epoch": 0.08696, "grad_norm": 2.880218029022217, "learning_rate": 4.5670668267306924e-05, "loss": 0.659, "mean_token_accuracy": 0.8096631824970245, "num_tokens": 112777884.0, "step": 10870 }, { "entropy": 0.6892183482646942, "epoch": 0.08704, "grad_norm": 2.2460172176361084, "learning_rate": 4.566666666666667e-05, "loss": 0.7049, "mean_token_accuracy": 0.7994200825691223, "num_tokens": 112872021.0, "step": 10880 }, { "entropy": 0.7220003485679627, "epoch": 0.08712, "grad_norm": 2.585293769836426, "learning_rate": 4.566266506602641e-05, "loss": 0.704, "mean_token_accuracy": 0.7872121334075928, "num_tokens": 113006232.0, "step": 10890 }, { "entropy": 0.6942842036485672, "epoch": 0.0872, "grad_norm": 4.5332231521606445, "learning_rate": 4.565866346538616e-05, "loss": 0.7031, "mean_token_accuracy": 0.8101967155933381, "num_tokens": 113044084.0, "step": 10900 }, { "entropy": 0.6978986263275146, "epoch": 0.08728, "grad_norm": 2.6635794639587402, "learning_rate": 4.56546618647459e-05, "loss": 0.7091, "mean_token_accuracy": 0.7831888198852539, "num_tokens": 113207924.0, "step": 10910 }, { "entropy": 0.6585489511489868, "epoch": 0.08736, "grad_norm": 4.28367805480957, "learning_rate": 4.565066026410564e-05, "loss": 0.6514, "mean_token_accuracy": 0.8094501435756684, "num_tokens": 113295865.0, "step": 10920 }, { "entropy": 0.7294296801090241, "epoch": 0.08744, "grad_norm": 1.9928902387619019, "learning_rate": 4.5646658663465386e-05, "loss": 0.7348, "mean_token_accuracy": 0.7932300388813018, "num_tokens": 113389128.0, "step": 10930 }, { "entropy": 0.7157633066177368, "epoch": 0.08752, "grad_norm": 3.171703577041626, "learning_rate": 4.564265706282514e-05, "loss": 0.7092, "mean_token_accuracy": 0.7883484959602356, "num_tokens": 113518010.0, "step": 10940 }, { "entropy": 0.7377277851104737, "epoch": 0.0876, "grad_norm": 5.407132625579834, "learning_rate": 4.5638655462184874e-05, "loss": 0.7324, "mean_token_accuracy": 0.8068386256694794, "num_tokens": 113552294.0, "step": 10950 }, { "entropy": 0.6367930680513382, "epoch": 0.08768, "grad_norm": 1.7643296718597412, "learning_rate": 4.563465386154462e-05, "loss": 0.6377, "mean_token_accuracy": 0.7973194360733032, "num_tokens": 113716014.0, "step": 10960 }, { "entropy": 0.7543172955513, "epoch": 0.08776, "grad_norm": 4.0708746910095215, "learning_rate": 4.563065226090437e-05, "loss": 0.7417, "mean_token_accuracy": 0.7912382125854492, "num_tokens": 113794501.0, "step": 10970 }, { "entropy": 0.7385423839092254, "epoch": 0.08784, "grad_norm": 1.7371230125427246, "learning_rate": 4.562665066026411e-05, "loss": 0.7485, "mean_token_accuracy": 0.7913556694984436, "num_tokens": 113887072.0, "step": 10980 }, { "entropy": 0.7724506855010986, "epoch": 0.08792, "grad_norm": 2.2341597080230713, "learning_rate": 4.562264905962385e-05, "loss": 0.771, "mean_token_accuracy": 0.7744264900684357, "num_tokens": 114020855.0, "step": 10990 }, { "entropy": 0.7185215890407562, "epoch": 0.088, "grad_norm": 4.788400173187256, "learning_rate": 4.561864745898359e-05, "loss": 0.6989, "mean_token_accuracy": 0.8057198822498322, "num_tokens": 114056847.0, "step": 11000 }, { "entropy": 0.7058768332004547, "epoch": 0.08808, "grad_norm": 1.6515090465545654, "learning_rate": 4.561464585834334e-05, "loss": 0.7167, "mean_token_accuracy": 0.7789631187915802, "num_tokens": 114220687.0, "step": 11010 }, { "entropy": 0.6674617230892181, "epoch": 0.08816, "grad_norm": 2.8841664791107178, "learning_rate": 4.5610644257703086e-05, "loss": 0.6555, "mean_token_accuracy": 0.8080081224441529, "num_tokens": 114317113.0, "step": 11020 }, { "entropy": 0.7546421170234681, "epoch": 0.08824, "grad_norm": 1.5138583183288574, "learning_rate": 4.560664265706282e-05, "loss": 0.7633, "mean_token_accuracy": 0.786224502325058, "num_tokens": 114411192.0, "step": 11030 }, { "entropy": 0.7544454395771026, "epoch": 0.08832, "grad_norm": 2.0874760150909424, "learning_rate": 4.5602641056422574e-05, "loss": 0.7542, "mean_token_accuracy": 0.7775928020477295, "num_tokens": 114558430.0, "step": 11040 }, { "entropy": 0.6906359672546387, "epoch": 0.0884, "grad_norm": 5.074774265289307, "learning_rate": 4.559863945578232e-05, "loss": 0.6902, "mean_token_accuracy": 0.8140148162841797, "num_tokens": 114604077.0, "step": 11050 }, { "entropy": 0.7295815587043762, "epoch": 0.08848, "grad_norm": 2.4158763885498047, "learning_rate": 4.559463785514206e-05, "loss": 0.7227, "mean_token_accuracy": 0.7792820274829865, "num_tokens": 114766749.0, "step": 11060 }, { "entropy": 0.6853029429912567, "epoch": 0.08856, "grad_norm": 4.082833766937256, "learning_rate": 4.55906362545018e-05, "loss": 0.6786, "mean_token_accuracy": 0.8078934550285339, "num_tokens": 114843999.0, "step": 11070 }, { "entropy": 0.7000383853912353, "epoch": 0.08864, "grad_norm": 1.8047175407409668, "learning_rate": 4.558663465386155e-05, "loss": 0.6956, "mean_token_accuracy": 0.796825236082077, "num_tokens": 114936939.0, "step": 11080 }, { "entropy": 0.7106852948665618, "epoch": 0.08872, "grad_norm": 2.54579496383667, "learning_rate": 4.558263305322129e-05, "loss": 0.6993, "mean_token_accuracy": 0.7830225050449371, "num_tokens": 115078805.0, "step": 11090 }, { "entropy": 0.6673472702503205, "epoch": 0.0888, "grad_norm": 5.913567543029785, "learning_rate": 4.5578631452581036e-05, "loss": 0.6651, "mean_token_accuracy": 0.823146516084671, "num_tokens": 115119102.0, "step": 11100 }, { "entropy": 0.7101780414581299, "epoch": 0.08888, "grad_norm": 2.4292726516723633, "learning_rate": 4.557462985194078e-05, "loss": 0.7149, "mean_token_accuracy": 0.7792012751102447, "num_tokens": 115282942.0, "step": 11110 }, { "entropy": 0.6908928036689759, "epoch": 0.08896, "grad_norm": 3.8801352977752686, "learning_rate": 4.5570628251300524e-05, "loss": 0.6876, "mean_token_accuracy": 0.7994730412960053, "num_tokens": 115368583.0, "step": 11120 }, { "entropy": 0.7078794121742249, "epoch": 0.08904, "grad_norm": 1.6726933717727661, "learning_rate": 4.556662665066027e-05, "loss": 0.7013, "mean_token_accuracy": 0.7999282479286194, "num_tokens": 115462238.0, "step": 11130 }, { "entropy": 0.7232125401496887, "epoch": 0.08912, "grad_norm": 3.3800532817840576, "learning_rate": 4.556262505002001e-05, "loss": 0.7181, "mean_token_accuracy": 0.7847164869308472, "num_tokens": 115591307.0, "step": 11140 }, { "entropy": 0.7455088078975678, "epoch": 0.0892, "grad_norm": 4.4276580810546875, "learning_rate": 4.5558623449379755e-05, "loss": 0.7502, "mean_token_accuracy": 0.808485746383667, "num_tokens": 115624474.0, "step": 11150 }, { "entropy": 0.6267837792634964, "epoch": 0.08928, "grad_norm": 2.4517834186553955, "learning_rate": 4.55546218487395e-05, "loss": 0.6279, "mean_token_accuracy": 0.8008060574531555, "num_tokens": 115788314.0, "step": 11160 }, { "entropy": 0.6968980193138122, "epoch": 0.08936, "grad_norm": 3.145721673965454, "learning_rate": 4.555062024809924e-05, "loss": 0.7014, "mean_token_accuracy": 0.8028078854084015, "num_tokens": 115872860.0, "step": 11170 }, { "entropy": 0.7080144464969635, "epoch": 0.08944, "grad_norm": 2.05342435836792, "learning_rate": 4.5546618647458986e-05, "loss": 0.6953, "mean_token_accuracy": 0.7995558202266693, "num_tokens": 115966196.0, "step": 11180 }, { "entropy": 0.7357763051986694, "epoch": 0.08952, "grad_norm": 2.932687997817993, "learning_rate": 4.554261704681873e-05, "loss": 0.7289, "mean_token_accuracy": 0.7811911046504975, "num_tokens": 116107436.0, "step": 11190 }, { "entropy": 0.8012751340866089, "epoch": 0.0896, "grad_norm": 5.521324634552002, "learning_rate": 4.553861544617847e-05, "loss": 0.8128, "mean_token_accuracy": 0.7915736556053161, "num_tokens": 116145792.0, "step": 11200 }, { "entropy": 0.6524652600288391, "epoch": 0.08968, "grad_norm": 1.7071555852890015, "learning_rate": 4.553461384553822e-05, "loss": 0.6588, "mean_token_accuracy": 0.7919027864933014, "num_tokens": 116309632.0, "step": 11210 }, { "entropy": 0.6450334548950195, "epoch": 0.08976, "grad_norm": 4.464695930480957, "learning_rate": 4.553061224489796e-05, "loss": 0.6324, "mean_token_accuracy": 0.8167493462562561, "num_tokens": 116390720.0, "step": 11220 }, { "entropy": 0.7343746781349182, "epoch": 0.08984, "grad_norm": 1.6246559619903564, "learning_rate": 4.5526610644257704e-05, "loss": 0.7509, "mean_token_accuracy": 0.7884405493736267, "num_tokens": 116485169.0, "step": 11230 }, { "entropy": 0.7166607975959778, "epoch": 0.08992, "grad_norm": 2.5084023475646973, "learning_rate": 4.552260904361745e-05, "loss": 0.7173, "mean_token_accuracy": 0.7858851075172424, "num_tokens": 116620318.0, "step": 11240 }, { "entropy": 0.7263800621032714, "epoch": 0.09, "grad_norm": 5.524136543273926, "learning_rate": 4.55186074429772e-05, "loss": 0.7146, "mean_token_accuracy": 0.8127117335796357, "num_tokens": 116657741.0, "step": 11250 }, { "entropy": 0.6756105065345764, "epoch": 0.09008, "grad_norm": 1.5610120296478271, "learning_rate": 4.5514605842336936e-05, "loss": 0.6834, "mean_token_accuracy": 0.7844528555870056, "num_tokens": 116821581.0, "step": 11260 }, { "entropy": 0.7246650636196137, "epoch": 0.09016, "grad_norm": 4.111111164093018, "learning_rate": 4.551060424169668e-05, "loss": 0.7189, "mean_token_accuracy": 0.7942392468452454, "num_tokens": 116912030.0, "step": 11270 }, { "entropy": 0.7613316953182221, "epoch": 0.09024, "grad_norm": 2.4591572284698486, "learning_rate": 4.550660264105642e-05, "loss": 0.7602, "mean_token_accuracy": 0.7861729979515075, "num_tokens": 117005051.0, "step": 11280 }, { "entropy": 0.7281406641006469, "epoch": 0.09032, "grad_norm": 2.9141106605529785, "learning_rate": 4.5502601040416173e-05, "loss": 0.7293, "mean_token_accuracy": 0.7786973536014556, "num_tokens": 117148222.0, "step": 11290 }, { "entropy": 0.7043954253196716, "epoch": 0.0904, "grad_norm": 6.539708614349365, "learning_rate": 4.549859943977591e-05, "loss": 0.699, "mean_token_accuracy": 0.8149674475193024, "num_tokens": 117189130.0, "step": 11300 }, { "entropy": 0.6594221234321594, "epoch": 0.09048, "grad_norm": 2.359562873840332, "learning_rate": 4.5494597839135654e-05, "loss": 0.661, "mean_token_accuracy": 0.7930996656417847, "num_tokens": 117352970.0, "step": 11310 }, { "entropy": 0.684240710735321, "epoch": 0.09056, "grad_norm": 3.2716095447540283, "learning_rate": 4.54905962384954e-05, "loss": 0.6756, "mean_token_accuracy": 0.803388899564743, "num_tokens": 117446327.0, "step": 11320 }, { "entropy": 0.7384359180927277, "epoch": 0.09064, "grad_norm": 2.020277976989746, "learning_rate": 4.548659463785515e-05, "loss": 0.7417, "mean_token_accuracy": 0.792670738697052, "num_tokens": 117538811.0, "step": 11330 }, { "entropy": 0.750164920091629, "epoch": 0.09072, "grad_norm": 2.8064119815826416, "learning_rate": 4.5482593037214885e-05, "loss": 0.749, "mean_token_accuracy": 0.7769715249538421, "num_tokens": 117680181.0, "step": 11340 }, { "entropy": 0.7466907739639282, "epoch": 0.0908, "grad_norm": 5.137176990509033, "learning_rate": 4.547859143657463e-05, "loss": 0.7476, "mean_token_accuracy": 0.8054216504096985, "num_tokens": 117721367.0, "step": 11350 }, { "entropy": 0.6989965200424194, "epoch": 0.09088, "grad_norm": 1.6106302738189697, "learning_rate": 4.547458983593438e-05, "loss": 0.7017, "mean_token_accuracy": 0.7828211843967438, "num_tokens": 117885201.0, "step": 11360 }, { "entropy": 0.7507554948329925, "epoch": 0.09096, "grad_norm": 4.3976335525512695, "learning_rate": 4.547058823529412e-05, "loss": 0.7445, "mean_token_accuracy": 0.7935219049453736, "num_tokens": 117965264.0, "step": 11370 }, { "entropy": 0.7796150743961334, "epoch": 0.09104, "grad_norm": 6.5083513259887695, "learning_rate": 4.546658663465386e-05, "loss": 0.7473, "mean_token_accuracy": 0.7925527572631836, "num_tokens": 118056866.0, "step": 11380 }, { "entropy": 0.6932830333709716, "epoch": 0.09112, "grad_norm": 2.237370729446411, "learning_rate": 4.5462585034013604e-05, "loss": 0.6954, "mean_token_accuracy": 0.7863296806812287, "num_tokens": 118200828.0, "step": 11390 }, { "entropy": 0.7338398516178131, "epoch": 0.0912, "grad_norm": 5.742410182952881, "learning_rate": 4.5458583433373354e-05, "loss": 0.732, "mean_token_accuracy": 0.8039285719394684, "num_tokens": 118240509.0, "step": 11400 }, { "entropy": 0.7377332746982574, "epoch": 0.09128, "grad_norm": 1.4946393966674805, "learning_rate": 4.54545818327331e-05, "loss": 0.7331, "mean_token_accuracy": 0.7758861184120178, "num_tokens": 118403973.0, "step": 11410 }, { "entropy": 0.6655917555093765, "epoch": 0.09136, "grad_norm": 3.3215253353118896, "learning_rate": 4.5450580232092835e-05, "loss": 0.6569, "mean_token_accuracy": 0.8075742244720459, "num_tokens": 118483375.0, "step": 11420 }, { "entropy": 0.7457988321781158, "epoch": 0.09144, "grad_norm": 1.8310024738311768, "learning_rate": 4.5446578631452585e-05, "loss": 0.7682, "mean_token_accuracy": 0.7815296828746796, "num_tokens": 118577673.0, "step": 11430 }, { "entropy": 0.7526318371295929, "epoch": 0.09152, "grad_norm": 2.9687836170196533, "learning_rate": 4.544257703081233e-05, "loss": 0.7439, "mean_token_accuracy": 0.7815688848495483, "num_tokens": 118713887.0, "step": 11440 }, { "entropy": 0.7784903049468994, "epoch": 0.0916, "grad_norm": 5.082960605621338, "learning_rate": 4.543857543017207e-05, "loss": 0.7763, "mean_token_accuracy": 0.7976452052593231, "num_tokens": 118749316.0, "step": 11450 }, { "entropy": 0.7329873085021973, "epoch": 0.09168, "grad_norm": 1.9855667352676392, "learning_rate": 4.543457382953181e-05, "loss": 0.7327, "mean_token_accuracy": 0.7783143222332001, "num_tokens": 118912893.0, "step": 11460 }, { "entropy": 0.6765867650508881, "epoch": 0.09176, "grad_norm": 4.676974773406982, "learning_rate": 4.543057222889156e-05, "loss": 0.6745, "mean_token_accuracy": 0.810256177186966, "num_tokens": 118989505.0, "step": 11470 }, { "entropy": 0.778648066520691, "epoch": 0.09184, "grad_norm": 1.8706754446029663, "learning_rate": 4.5426570628251304e-05, "loss": 0.7863, "mean_token_accuracy": 0.7797855257987976, "num_tokens": 119083958.0, "step": 11480 }, { "entropy": 0.6807202637195587, "epoch": 0.09192, "grad_norm": 2.6512105464935303, "learning_rate": 4.542256902761105e-05, "loss": 0.6713, "mean_token_accuracy": 0.7937234759330749, "num_tokens": 119228678.0, "step": 11490 }, { "entropy": 0.7495281875133515, "epoch": 0.092, "grad_norm": 5.316513538360596, "learning_rate": 4.541856742697079e-05, "loss": 0.7581, "mean_token_accuracy": 0.7976669251918793, "num_tokens": 119274315.0, "step": 11500 }, { "entropy": 0.7498072564601899, "epoch": 0.09208, "grad_norm": 2.296982526779175, "learning_rate": 4.5414565826330535e-05, "loss": 0.7471, "mean_token_accuracy": 0.7702613592147827, "num_tokens": 119438155.0, "step": 11510 }, { "entropy": 0.7047181487083435, "epoch": 0.09216, "grad_norm": 4.191275596618652, "learning_rate": 4.541056422569028e-05, "loss": 0.6966, "mean_token_accuracy": 0.8018317520618439, "num_tokens": 119520512.0, "step": 11520 }, { "entropy": 0.7308139324188232, "epoch": 0.09224, "grad_norm": 2.780207633972168, "learning_rate": 4.540656262505002e-05, "loss": 0.7426, "mean_token_accuracy": 0.790832394361496, "num_tokens": 119615636.0, "step": 11530 }, { "entropy": 0.6927210688591003, "epoch": 0.09232, "grad_norm": 3.878960132598877, "learning_rate": 4.5402561024409766e-05, "loss": 0.6723, "mean_token_accuracy": 0.7936409294605256, "num_tokens": 119753477.0, "step": 11540 }, { "entropy": 0.6572849869728088, "epoch": 0.0924, "grad_norm": 5.1566901206970215, "learning_rate": 4.539855942376951e-05, "loss": 0.6638, "mean_token_accuracy": 0.8199584782123566, "num_tokens": 119789517.0, "step": 11550 }, { "entropy": 0.6765379011631012, "epoch": 0.09248, "grad_norm": 2.1721785068511963, "learning_rate": 4.5394557823129254e-05, "loss": 0.6752, "mean_token_accuracy": 0.7894907236099243, "num_tokens": 119953357.0, "step": 11560 }, { "entropy": 0.698912912607193, "epoch": 0.09256, "grad_norm": 3.9066295623779297, "learning_rate": 4.5390556222489e-05, "loss": 0.6935, "mean_token_accuracy": 0.8009001553058624, "num_tokens": 120049436.0, "step": 11570 }, { "entropy": 0.7116667211055756, "epoch": 0.09264, "grad_norm": 2.05997371673584, "learning_rate": 4.538655462184874e-05, "loss": 0.7131, "mean_token_accuracy": 0.7922036230564118, "num_tokens": 120143873.0, "step": 11580 }, { "entropy": 0.7425211668014526, "epoch": 0.09272, "grad_norm": 2.0800669193267822, "learning_rate": 4.5382553021208485e-05, "loss": 0.733, "mean_token_accuracy": 0.7801049470901489, "num_tokens": 120282668.0, "step": 11590 }, { "entropy": 0.7042569994926453, "epoch": 0.0928, "grad_norm": 4.986062526702881, "learning_rate": 4.537855142056823e-05, "loss": 0.7079, "mean_token_accuracy": 0.8095136404037475, "num_tokens": 120322661.0, "step": 11600 }, { "entropy": 0.7150700569152832, "epoch": 0.09288, "grad_norm": 2.4539496898651123, "learning_rate": 4.537454981992797e-05, "loss": 0.7185, "mean_token_accuracy": 0.7777234971523285, "num_tokens": 120486501.0, "step": 11610 }, { "entropy": 0.670923912525177, "epoch": 0.09296, "grad_norm": 3.726262092590332, "learning_rate": 4.5370548219287716e-05, "loss": 0.668, "mean_token_accuracy": 0.8070607423782349, "num_tokens": 120577310.0, "step": 11620 }, { "entropy": 0.7437511563301087, "epoch": 0.09304, "grad_norm": 2.3524558544158936, "learning_rate": 4.536654661864746e-05, "loss": 0.7569, "mean_token_accuracy": 0.7905082702636719, "num_tokens": 120672934.0, "step": 11630 }, { "entropy": 0.771677041053772, "epoch": 0.09312, "grad_norm": 2.668811798095703, "learning_rate": 4.536254501800721e-05, "loss": 0.7658, "mean_token_accuracy": 0.7764466106891632, "num_tokens": 120796279.0, "step": 11640 }, { "entropy": 0.6460221916437149, "epoch": 0.0932, "grad_norm": 4.6262712478637695, "learning_rate": 4.535854341736695e-05, "loss": 0.6491, "mean_token_accuracy": 0.8224087059497833, "num_tokens": 120833716.0, "step": 11650 }, { "entropy": 0.6573792487382889, "epoch": 0.09328, "grad_norm": 1.6513190269470215, "learning_rate": 4.535454181672669e-05, "loss": 0.6497, "mean_token_accuracy": 0.7917867660522461, "num_tokens": 120997556.0, "step": 11660 }, { "entropy": 0.7439851760864258, "epoch": 0.09336, "grad_norm": 4.868485450744629, "learning_rate": 4.5350540216086435e-05, "loss": 0.744, "mean_token_accuracy": 0.7940348386764526, "num_tokens": 121079768.0, "step": 11670 }, { "entropy": 0.7503554821014404, "epoch": 0.09344, "grad_norm": 1.9002398252487183, "learning_rate": 4.5346538615446185e-05, "loss": 0.7371, "mean_token_accuracy": 0.7908073663711548, "num_tokens": 121172507.0, "step": 11680 }, { "entropy": 0.7354966223239898, "epoch": 0.09352, "grad_norm": 2.3286027908325195, "learning_rate": 4.534253701480592e-05, "loss": 0.735, "mean_token_accuracy": 0.7757629513740539, "num_tokens": 121313093.0, "step": 11690 }, { "entropy": 0.6664229035377502, "epoch": 0.0936, "grad_norm": 6.143339157104492, "learning_rate": 4.5338535414165666e-05, "loss": 0.6699, "mean_token_accuracy": 0.8147681534290314, "num_tokens": 121355823.0, "step": 11700 }, { "entropy": 0.6705574691295624, "epoch": 0.09368, "grad_norm": 1.958692193031311, "learning_rate": 4.5334533813525416e-05, "loss": 0.6743, "mean_token_accuracy": 0.788837331533432, "num_tokens": 121519663.0, "step": 11710 }, { "entropy": 0.7848384201526641, "epoch": 0.09376, "grad_norm": 5.292637825012207, "learning_rate": 4.533053221288516e-05, "loss": 0.7791, "mean_token_accuracy": 0.7831360697746277, "num_tokens": 121614460.0, "step": 11720 }, { "entropy": 0.7268023788928986, "epoch": 0.09384, "grad_norm": 1.552992820739746, "learning_rate": 4.53265306122449e-05, "loss": 0.7247, "mean_token_accuracy": 0.793522310256958, "num_tokens": 121709017.0, "step": 11730 }, { "entropy": 0.764194542169571, "epoch": 0.09392, "grad_norm": 2.269325017929077, "learning_rate": 4.532252901160464e-05, "loss": 0.7617, "mean_token_accuracy": 0.7762141287326813, "num_tokens": 121847675.0, "step": 11740 }, { "entropy": 0.6921693980693817, "epoch": 0.094, "grad_norm": 4.841527462005615, "learning_rate": 4.531852741096439e-05, "loss": 0.6988, "mean_token_accuracy": 0.8077024400234223, "num_tokens": 121884317.0, "step": 11750 }, { "entropy": 0.6622406125068665, "epoch": 0.09408, "grad_norm": 1.473081111907959, "learning_rate": 4.5314525810324135e-05, "loss": 0.6624, "mean_token_accuracy": 0.7909623980522156, "num_tokens": 122048157.0, "step": 11760 }, { "entropy": 0.6366491287946701, "epoch": 0.09416, "grad_norm": 3.1041007041931152, "learning_rate": 4.531052420968387e-05, "loss": 0.6341, "mean_token_accuracy": 0.818498820066452, "num_tokens": 122135811.0, "step": 11770 }, { "entropy": 0.7541069209575653, "epoch": 0.09424, "grad_norm": 2.673875570297241, "learning_rate": 4.530652260904362e-05, "loss": 0.7526, "mean_token_accuracy": 0.7842864990234375, "num_tokens": 122230966.0, "step": 11780 }, { "entropy": 0.7395395994186401, "epoch": 0.09432, "grad_norm": 2.834071159362793, "learning_rate": 4.5302521008403366e-05, "loss": 0.7373, "mean_token_accuracy": 0.7782234311103821, "num_tokens": 122369165.0, "step": 11790 }, { "entropy": 0.6694148451089859, "epoch": 0.0944, "grad_norm": 4.715333461761475, "learning_rate": 4.529851940776311e-05, "loss": 0.6699, "mean_token_accuracy": 0.8213675081729889, "num_tokens": 122408289.0, "step": 11800 }, { "entropy": 0.6794102132320404, "epoch": 0.09448, "grad_norm": 2.8259966373443604, "learning_rate": 4.5294517807122847e-05, "loss": 0.6818, "mean_token_accuracy": 0.7868771314620971, "num_tokens": 122572129.0, "step": 11810 }, { "entropy": 0.662128010392189, "epoch": 0.09456, "grad_norm": 4.242774963378906, "learning_rate": 4.52905162064826e-05, "loss": 0.645, "mean_token_accuracy": 0.8109639167785645, "num_tokens": 122666440.0, "step": 11820 }, { "entropy": 0.7152541816234589, "epoch": 0.09464, "grad_norm": 1.5840771198272705, "learning_rate": 4.528651460584234e-05, "loss": 0.7381, "mean_token_accuracy": 0.7937787532806396, "num_tokens": 122758806.0, "step": 11830 }, { "entropy": 0.7350454151630401, "epoch": 0.09472, "grad_norm": 3.666215419769287, "learning_rate": 4.5282513005202084e-05, "loss": 0.7232, "mean_token_accuracy": 0.7807837069034577, "num_tokens": 122890049.0, "step": 11840 }, { "entropy": 0.682841169834137, "epoch": 0.0948, "grad_norm": 6.051751613616943, "learning_rate": 4.527851140456182e-05, "loss": 0.6847, "mean_token_accuracy": 0.8112189412117005, "num_tokens": 122928100.0, "step": 11850 }, { "entropy": 0.5901450276374817, "epoch": 0.09488, "grad_norm": 1.7702609300613403, "learning_rate": 4.527450980392157e-05, "loss": 0.5898, "mean_token_accuracy": 0.8090254008769989, "num_tokens": 123091940.0, "step": 11860 }, { "entropy": 0.6569095432758332, "epoch": 0.09496, "grad_norm": 3.6301372051239014, "learning_rate": 4.5270508203281316e-05, "loss": 0.6464, "mean_token_accuracy": 0.8102640151977539, "num_tokens": 123172194.0, "step": 11870 }, { "entropy": 0.7582570135593414, "epoch": 0.09504, "grad_norm": 2.1586296558380127, "learning_rate": 4.526650660264106e-05, "loss": 0.7816, "mean_token_accuracy": 0.7841079950332641, "num_tokens": 123265405.0, "step": 11880 }, { "entropy": 0.7165725469589234, "epoch": 0.09512, "grad_norm": 2.717729330062866, "learning_rate": 4.52625050020008e-05, "loss": 0.7104, "mean_token_accuracy": 0.7808316767215728, "num_tokens": 123402167.0, "step": 11890 }, { "entropy": 0.7270463228225708, "epoch": 0.0952, "grad_norm": 5.6166253089904785, "learning_rate": 4.525850340136055e-05, "loss": 0.7131, "mean_token_accuracy": 0.8043950855731964, "num_tokens": 123442252.0, "step": 11900 }, { "entropy": 0.648908656835556, "epoch": 0.09528, "grad_norm": 1.8792893886566162, "learning_rate": 4.525450180072029e-05, "loss": 0.6446, "mean_token_accuracy": 0.7962068736553192, "num_tokens": 123604414.0, "step": 11910 }, { "entropy": 0.7061361014842987, "epoch": 0.09536, "grad_norm": 4.08720064163208, "learning_rate": 4.5250500200080034e-05, "loss": 0.6962, "mean_token_accuracy": 0.8051347732543945, "num_tokens": 123665986.0, "step": 11920 }, { "entropy": 0.7377768874168396, "epoch": 0.09544, "grad_norm": 1.7042291164398193, "learning_rate": 4.524649859943978e-05, "loss": 0.7408, "mean_token_accuracy": 0.7977876245975495, "num_tokens": 123757121.0, "step": 11930 }, { "entropy": 0.665493780374527, "epoch": 0.09552, "grad_norm": 2.180631160736084, "learning_rate": 4.524249699879952e-05, "loss": 0.6639, "mean_token_accuracy": 0.7950894415378571, "num_tokens": 123899320.0, "step": 11940 }, { "entropy": 0.7884832322597504, "epoch": 0.0956, "grad_norm": 5.6675567626953125, "learning_rate": 4.5238495398159265e-05, "loss": 0.7709, "mean_token_accuracy": 0.8016603708267211, "num_tokens": 123934574.0, "step": 11950 }, { "entropy": 0.6668165445327758, "epoch": 0.09568, "grad_norm": 2.1948273181915283, "learning_rate": 4.523449379751901e-05, "loss": 0.6742, "mean_token_accuracy": 0.7885625422000885, "num_tokens": 124098414.0, "step": 11960 }, { "entropy": 0.6478820443153381, "epoch": 0.09576, "grad_norm": 3.138484477996826, "learning_rate": 4.523049219687875e-05, "loss": 0.646, "mean_token_accuracy": 0.8181260228157043, "num_tokens": 124175372.0, "step": 11970 }, { "entropy": 0.7445385932922364, "epoch": 0.09584, "grad_norm": 2.228437900543213, "learning_rate": 4.5226490596238496e-05, "loss": 0.7513, "mean_token_accuracy": 0.7879865050315857, "num_tokens": 124268228.0, "step": 11980 }, { "entropy": 0.6592768371105194, "epoch": 0.09592, "grad_norm": 3.0448076725006104, "learning_rate": 4.522248899559824e-05, "loss": 0.6545, "mean_token_accuracy": 0.7956724047660828, "num_tokens": 124408481.0, "step": 11990 }, { "entropy": 0.6367421060800552, "epoch": 0.096, "grad_norm": 5.515920639038086, "learning_rate": 4.5218487394957984e-05, "loss": 0.6369, "mean_token_accuracy": 0.8270095348358154, "num_tokens": 124447528.0, "step": 12000 }, { "entropy": 0.644342839717865, "epoch": 0.09608, "grad_norm": 1.513159155845642, "learning_rate": 4.521448579431773e-05, "loss": 0.6453, "mean_token_accuracy": 0.7943576037883758, "num_tokens": 124611368.0, "step": 12010 }, { "entropy": 0.6586921900510788, "epoch": 0.09616, "grad_norm": 4.512242317199707, "learning_rate": 4.521048419367747e-05, "loss": 0.6561, "mean_token_accuracy": 0.810050904750824, "num_tokens": 124710874.0, "step": 12020 }, { "entropy": 0.6820783019065857, "epoch": 0.09624, "grad_norm": 1.8356820344924927, "learning_rate": 4.520648259303722e-05, "loss": 0.7024, "mean_token_accuracy": 0.7987062931060791, "num_tokens": 124804857.0, "step": 12030 }, { "entropy": 0.7254329204559327, "epoch": 0.09632, "grad_norm": 2.768848419189453, "learning_rate": 4.520248099239696e-05, "loss": 0.7113, "mean_token_accuracy": 0.783867233991623, "num_tokens": 124947885.0, "step": 12040 }, { "entropy": 0.74657501578331, "epoch": 0.0964, "grad_norm": 5.914327144622803, "learning_rate": 4.51984793917567e-05, "loss": 0.7341, "mean_token_accuracy": 0.8071175754070282, "num_tokens": 124986024.0, "step": 12050 }, { "entropy": 0.6538295328617096, "epoch": 0.09648, "grad_norm": 1.4518524408340454, "learning_rate": 4.5194477791116446e-05, "loss": 0.6618, "mean_token_accuracy": 0.789905971288681, "num_tokens": 125149864.0, "step": 12060 }, { "entropy": 0.6682251572608948, "epoch": 0.09656, "grad_norm": 3.283581495285034, "learning_rate": 4.51904761904762e-05, "loss": 0.6599, "mean_token_accuracy": 0.8092669486999512, "num_tokens": 125234722.0, "step": 12070 }, { "entropy": 0.6559363305568695, "epoch": 0.09664, "grad_norm": 1.775038242340088, "learning_rate": 4.5186474589835934e-05, "loss": 0.6536, "mean_token_accuracy": 0.8107054173946381, "num_tokens": 125329224.0, "step": 12080 }, { "entropy": 0.6509127914905548, "epoch": 0.09672, "grad_norm": 3.305424690246582, "learning_rate": 4.518247298919568e-05, "loss": 0.6629, "mean_token_accuracy": 0.7975566148757934, "num_tokens": 125468282.0, "step": 12090 }, { "entropy": 0.6547640860080719, "epoch": 0.0968, "grad_norm": 5.604772567749023, "learning_rate": 4.517847138855543e-05, "loss": 0.6493, "mean_token_accuracy": 0.8264977633953094, "num_tokens": 125504039.0, "step": 12100 }, { "entropy": 0.7418457210063935, "epoch": 0.09688, "grad_norm": 1.8405554294586182, "learning_rate": 4.517446978791517e-05, "loss": 0.734, "mean_token_accuracy": 0.7758915722370148, "num_tokens": 125666716.0, "step": 12110 }, { "entropy": 0.6226917207241058, "epoch": 0.09696, "grad_norm": 5.015075206756592, "learning_rate": 4.517046818727491e-05, "loss": 0.6147, "mean_token_accuracy": 0.8188751399517059, "num_tokens": 125745355.0, "step": 12120 }, { "entropy": 0.6813321948051453, "epoch": 0.09704, "grad_norm": 2.693312883377075, "learning_rate": 4.516646658663465e-05, "loss": 0.7005, "mean_token_accuracy": 0.803996104001999, "num_tokens": 125840064.0, "step": 12130 }, { "entropy": 0.671227115392685, "epoch": 0.09712, "grad_norm": 2.5170962810516357, "learning_rate": 4.51624649859944e-05, "loss": 0.6682, "mean_token_accuracy": 0.7943711817264557, "num_tokens": 125986743.0, "step": 12140 }, { "entropy": 0.7342662215232849, "epoch": 0.0972, "grad_norm": 4.645310401916504, "learning_rate": 4.5158463385354146e-05, "loss": 0.7282, "mean_token_accuracy": 0.8016203939914703, "num_tokens": 126025471.0, "step": 12150 }, { "entropy": 0.6218379020690918, "epoch": 0.09728, "grad_norm": 1.6171334981918335, "learning_rate": 4.515446178471388e-05, "loss": 0.6287, "mean_token_accuracy": 0.7999023020267486, "num_tokens": 126189311.0, "step": 12160 }, { "entropy": 0.6933366119861603, "epoch": 0.09736, "grad_norm": 4.126561641693115, "learning_rate": 4.5150460184073634e-05, "loss": 0.6737, "mean_token_accuracy": 0.8062314569950104, "num_tokens": 126261438.0, "step": 12170 }, { "entropy": 0.7443219780921936, "epoch": 0.09744, "grad_norm": 1.927187442779541, "learning_rate": 4.514645858343338e-05, "loss": 0.7511, "mean_token_accuracy": 0.7898460745811462, "num_tokens": 126353512.0, "step": 12180 }, { "entropy": 0.7198218464851379, "epoch": 0.09752, "grad_norm": 3.35771107673645, "learning_rate": 4.514245698279312e-05, "loss": 0.7146, "mean_token_accuracy": 0.7878176450729371, "num_tokens": 126481106.0, "step": 12190 }, { "entropy": 0.6399552762508393, "epoch": 0.0976, "grad_norm": 4.575778007507324, "learning_rate": 4.513845538215286e-05, "loss": 0.6469, "mean_token_accuracy": 0.8190474808216095, "num_tokens": 126519593.0, "step": 12200 }, { "entropy": 0.6609292387962341, "epoch": 0.09768, "grad_norm": 1.8650808334350586, "learning_rate": 4.513445378151261e-05, "loss": 0.6607, "mean_token_accuracy": 0.7880557119846344, "num_tokens": 126683433.0, "step": 12210 }, { "entropy": 0.7572290867567062, "epoch": 0.09776, "grad_norm": 3.2663354873657227, "learning_rate": 4.513045218087235e-05, "loss": 0.7554, "mean_token_accuracy": 0.7893149495124817, "num_tokens": 126768801.0, "step": 12220 }, { "entropy": 0.7117557048797607, "epoch": 0.09784, "grad_norm": 1.8610304594039917, "learning_rate": 4.5126450580232096e-05, "loss": 0.7166, "mean_token_accuracy": 0.792972069978714, "num_tokens": 126863952.0, "step": 12230 }, { "entropy": 0.7665158927440643, "epoch": 0.09792, "grad_norm": 2.406564712524414, "learning_rate": 4.512244897959184e-05, "loss": 0.7599, "mean_token_accuracy": 0.7737414300441742, "num_tokens": 127001981.0, "step": 12240 }, { "entropy": 0.6568793803453445, "epoch": 0.098, "grad_norm": 6.072785377502441, "learning_rate": 4.5118447378951583e-05, "loss": 0.6446, "mean_token_accuracy": 0.8236245691776276, "num_tokens": 127037656.0, "step": 12250 }, { "entropy": 0.6827451586723328, "epoch": 0.09808, "grad_norm": 2.086777687072754, "learning_rate": 4.511444577831133e-05, "loss": 0.6816, "mean_token_accuracy": 0.7851245760917663, "num_tokens": 127201496.0, "step": 12260 }, { "entropy": 0.6266341865062713, "epoch": 0.09816, "grad_norm": 3.639012336730957, "learning_rate": 4.511044417767107e-05, "loss": 0.6225, "mean_token_accuracy": 0.8217160046100617, "num_tokens": 127283916.0, "step": 12270 }, { "entropy": 0.7137272477149963, "epoch": 0.09824, "grad_norm": 2.028744697570801, "learning_rate": 4.5106442577030815e-05, "loss": 0.7133, "mean_token_accuracy": 0.8009771943092346, "num_tokens": 127376354.0, "step": 12280 }, { "entropy": 0.7109237670898437, "epoch": 0.09832, "grad_norm": 3.2945432662963867, "learning_rate": 4.510244097639056e-05, "loss": 0.702, "mean_token_accuracy": 0.7832902610301972, "num_tokens": 127526145.0, "step": 12290 }, { "entropy": 0.6307065814733506, "epoch": 0.0984, "grad_norm": 4.823207855224609, "learning_rate": 4.50984393757503e-05, "loss": 0.6412, "mean_token_accuracy": 0.8230814516544342, "num_tokens": 127570388.0, "step": 12300 }, { "entropy": 0.6551395416259765, "epoch": 0.09848, "grad_norm": 2.4374585151672363, "learning_rate": 4.5094437775110046e-05, "loss": 0.6535, "mean_token_accuracy": 0.7912860333919525, "num_tokens": 127734228.0, "step": 12310 }, { "entropy": 0.6351367115974427, "epoch": 0.09856, "grad_norm": 3.7398734092712402, "learning_rate": 4.509043617446979e-05, "loss": 0.6284, "mean_token_accuracy": 0.8151117563247681, "num_tokens": 127822723.0, "step": 12320 }, { "entropy": 0.696767395734787, "epoch": 0.09864, "grad_norm": 2.034909248352051, "learning_rate": 4.508643457382953e-05, "loss": 0.7098, "mean_token_accuracy": 0.8025914072990418, "num_tokens": 127916529.0, "step": 12330 }, { "entropy": 0.7171893656253815, "epoch": 0.09872, "grad_norm": 2.78106689453125, "learning_rate": 4.508243297318928e-05, "loss": 0.707, "mean_token_accuracy": 0.7850117743015289, "num_tokens": 128060983.0, "step": 12340 }, { "entropy": 0.6608348220586777, "epoch": 0.0988, "grad_norm": 4.303549289703369, "learning_rate": 4.507843137254902e-05, "loss": 0.6548, "mean_token_accuracy": 0.8219602465629577, "num_tokens": 128105049.0, "step": 12350 }, { "entropy": 0.6800325393676758, "epoch": 0.09888, "grad_norm": 2.1182150840759277, "learning_rate": 4.5074429771908764e-05, "loss": 0.689, "mean_token_accuracy": 0.7835124731063843, "num_tokens": 128268889.0, "step": 12360 }, { "entropy": 0.7105641007423401, "epoch": 0.09896, "grad_norm": 2.722794532775879, "learning_rate": 4.507042817126851e-05, "loss": 0.7059, "mean_token_accuracy": 0.7948489129543305, "num_tokens": 128352949.0, "step": 12370 }, { "entropy": 0.6972728788852691, "epoch": 0.09904, "grad_norm": 1.627509355545044, "learning_rate": 4.506642657062825e-05, "loss": 0.7059, "mean_token_accuracy": 0.7938578367233277, "num_tokens": 128448269.0, "step": 12380 }, { "entropy": 0.7627748787403107, "epoch": 0.09912, "grad_norm": 3.717304229736328, "learning_rate": 4.5062424969987995e-05, "loss": 0.7585, "mean_token_accuracy": 0.7720279216766357, "num_tokens": 128585374.0, "step": 12390 }, { "entropy": 0.6759436279535294, "epoch": 0.0992, "grad_norm": 4.892455577850342, "learning_rate": 4.505842336934774e-05, "loss": 0.6682, "mean_token_accuracy": 0.8209575176239013, "num_tokens": 128622021.0, "step": 12400 }, { "entropy": 0.7033761024475098, "epoch": 0.09928, "grad_norm": 2.7513489723205566, "learning_rate": 4.505442176870748e-05, "loss": 0.7064, "mean_token_accuracy": 0.7827125072479248, "num_tokens": 128785861.0, "step": 12410 }, { "entropy": 0.7566007435321808, "epoch": 0.09936, "grad_norm": 3.1413819789886475, "learning_rate": 4.5050420168067233e-05, "loss": 0.7534, "mean_token_accuracy": 0.7878461718559265, "num_tokens": 128864286.0, "step": 12420 }, { "entropy": 0.707434892654419, "epoch": 0.09944, "grad_norm": 1.6819686889648438, "learning_rate": 4.504641856742697e-05, "loss": 0.704, "mean_token_accuracy": 0.8021671772003174, "num_tokens": 128956010.0, "step": 12430 }, { "entropy": 0.7325850069522858, "epoch": 0.09952, "grad_norm": 3.585714101791382, "learning_rate": 4.5042416966786714e-05, "loss": 0.7264, "mean_token_accuracy": 0.7814997255802154, "num_tokens": 129083817.0, "step": 12440 }, { "entropy": 0.6820701122283935, "epoch": 0.0996, "grad_norm": 5.429965019226074, "learning_rate": 4.503841536614646e-05, "loss": 0.6821, "mean_token_accuracy": 0.8167386591434479, "num_tokens": 129118218.0, "step": 12450 }, { "entropy": 0.6570056796073913, "epoch": 0.09968, "grad_norm": 1.5381500720977783, "learning_rate": 4.503441376550621e-05, "loss": 0.6553, "mean_token_accuracy": 0.7934049904346466, "num_tokens": 129282058.0, "step": 12460 }, { "entropy": 0.7374698877334595, "epoch": 0.09976, "grad_norm": 3.5610761642456055, "learning_rate": 4.5030412164865945e-05, "loss": 0.7429, "mean_token_accuracy": 0.7922768771648407, "num_tokens": 129363023.0, "step": 12470 }, { "entropy": 0.7107998609542847, "epoch": 0.09984, "grad_norm": 1.737786054611206, "learning_rate": 4.502641056422569e-05, "loss": 0.7093, "mean_token_accuracy": 0.7953412115573884, "num_tokens": 129455673.0, "step": 12480 }, { "entropy": 0.6688226819038391, "epoch": 0.09992, "grad_norm": 2.846475601196289, "learning_rate": 4.502240896358544e-05, "loss": 0.6629, "mean_token_accuracy": 0.8013504207134247, "num_tokens": 129582414.0, "step": 12490 }, { "entropy": 0.6744177281856537, "epoch": 0.1, "grad_norm": 6.531871795654297, "learning_rate": 4.501840736294518e-05, "loss": 0.6771, "mean_token_accuracy": 0.8156371355056763, "num_tokens": 129618879.0, "step": 12500 }, { "entropy": 0.6706964910030365, "epoch": 0.10008, "grad_norm": 1.6207964420318604, "learning_rate": 4.501440576230492e-05, "loss": 0.6721, "mean_token_accuracy": 0.7894886374473572, "num_tokens": 129782386.0, "step": 12510 }, { "entropy": 0.688320991396904, "epoch": 0.10016, "grad_norm": 3.1115562915802, "learning_rate": 4.5010404161664664e-05, "loss": 0.6754, "mean_token_accuracy": 0.8056094169616699, "num_tokens": 129857142.0, "step": 12520 }, { "entropy": 0.718123060464859, "epoch": 0.10024, "grad_norm": 2.1369144916534424, "learning_rate": 4.5006402561024414e-05, "loss": 0.7285, "mean_token_accuracy": 0.7934321343898774, "num_tokens": 129949506.0, "step": 12530 }, { "entropy": 0.7106368333101273, "epoch": 0.10032, "grad_norm": 3.3810226917266846, "learning_rate": 4.500240096038416e-05, "loss": 0.7083, "mean_token_accuracy": 0.7846802055835724, "num_tokens": 130097201.0, "step": 12540 }, { "entropy": 0.7115706175565719, "epoch": 0.1004, "grad_norm": 5.293138027191162, "learning_rate": 4.4998399359743895e-05, "loss": 0.7017, "mean_token_accuracy": 0.8114662170410156, "num_tokens": 130135514.0, "step": 12550 }, { "entropy": 0.6640343785285949, "epoch": 0.10048, "grad_norm": 1.749829649925232, "learning_rate": 4.4994397759103645e-05, "loss": 0.666, "mean_token_accuracy": 0.7880190789699555, "num_tokens": 130299284.0, "step": 12560 }, { "entropy": 0.696593564748764, "epoch": 0.10056, "grad_norm": 3.478294610977173, "learning_rate": 4.499039615846339e-05, "loss": 0.6961, "mean_token_accuracy": 0.8028977453708649, "num_tokens": 130379581.0, "step": 12570 }, { "entropy": 0.7187910676002502, "epoch": 0.10064, "grad_norm": 2.3833365440368652, "learning_rate": 4.498639455782313e-05, "loss": 0.7283, "mean_token_accuracy": 0.7945801019668579, "num_tokens": 130471753.0, "step": 12580 }, { "entropy": 0.7260031580924988, "epoch": 0.10072, "grad_norm": 3.7909228801727295, "learning_rate": 4.498239295718287e-05, "loss": 0.7165, "mean_token_accuracy": 0.7823308646678925, "num_tokens": 130609975.0, "step": 12590 }, { "entropy": 0.7203664422035218, "epoch": 0.1008, "grad_norm": 4.522234916687012, "learning_rate": 4.497839135654262e-05, "loss": 0.7095, "mean_token_accuracy": 0.8056998789310456, "num_tokens": 130648265.0, "step": 12600 }, { "entropy": 0.6718537449836731, "epoch": 0.10088, "grad_norm": 1.5843522548675537, "learning_rate": 4.4974389755902364e-05, "loss": 0.6735, "mean_token_accuracy": 0.7877381622791291, "num_tokens": 130812105.0, "step": 12610 }, { "entropy": 0.7196670353412629, "epoch": 0.10096, "grad_norm": 3.5525097846984863, "learning_rate": 4.497038815526211e-05, "loss": 0.7206, "mean_token_accuracy": 0.7932228565216064, "num_tokens": 130898405.0, "step": 12620 }, { "entropy": 0.8022441446781159, "epoch": 0.10104, "grad_norm": 1.6344757080078125, "learning_rate": 4.496638655462185e-05, "loss": 0.7973, "mean_token_accuracy": 0.776925790309906, "num_tokens": 130990782.0, "step": 12630 }, { "entropy": 0.6526824414730072, "epoch": 0.10112, "grad_norm": 2.3302903175354004, "learning_rate": 4.4962384953981595e-05, "loss": 0.652, "mean_token_accuracy": 0.8000631093978882, "num_tokens": 131127387.0, "step": 12640 }, { "entropy": 0.7396165430545807, "epoch": 0.1012, "grad_norm": 6.849343776702881, "learning_rate": 4.495838335334134e-05, "loss": 0.7247, "mean_token_accuracy": 0.8068825960159302, "num_tokens": 131163775.0, "step": 12650 }, { "entropy": 0.7068598747253418, "epoch": 0.10128, "grad_norm": 2.8482563495635986, "learning_rate": 4.495438175270108e-05, "loss": 0.7102, "mean_token_accuracy": 0.7768258452415466, "num_tokens": 131327615.0, "step": 12660 }, { "entropy": 0.7491983473300934, "epoch": 0.10136, "grad_norm": 4.221765995025635, "learning_rate": 4.4950380152060826e-05, "loss": 0.7505, "mean_token_accuracy": 0.7883798956871033, "num_tokens": 131420230.0, "step": 12670 }, { "entropy": 0.7807986557483673, "epoch": 0.10144, "grad_norm": 2.182746171951294, "learning_rate": 4.494637855142057e-05, "loss": 0.7754, "mean_token_accuracy": 0.780015480518341, "num_tokens": 131514806.0, "step": 12680 }, { "entropy": 0.7177556931972504, "epoch": 0.10152, "grad_norm": 2.2751529216766357, "learning_rate": 4.4942376950780314e-05, "loss": 0.7107, "mean_token_accuracy": 0.7890841126441955, "num_tokens": 131644841.0, "step": 12690 }, { "entropy": 0.7104016661643981, "epoch": 0.1016, "grad_norm": 5.468401908874512, "learning_rate": 4.493837535014006e-05, "loss": 0.7135, "mean_token_accuracy": 0.8095165848731994, "num_tokens": 131683585.0, "step": 12700 }, { "entropy": 0.6928245842456817, "epoch": 0.10168, "grad_norm": 1.8216612339019775, "learning_rate": 4.49343737494998e-05, "loss": 0.6973, "mean_token_accuracy": 0.7849584996700287, "num_tokens": 131845160.0, "step": 12710 }, { "entropy": 0.6837215602397919, "epoch": 0.10176, "grad_norm": 4.0238471031188965, "learning_rate": 4.4930372148859545e-05, "loss": 0.6729, "mean_token_accuracy": 0.803305697441101, "num_tokens": 131918683.0, "step": 12720 }, { "entropy": 0.7198755621910096, "epoch": 0.10184, "grad_norm": 2.0437114238739014, "learning_rate": 4.492637054821929e-05, "loss": 0.7302, "mean_token_accuracy": 0.7947301387786865, "num_tokens": 132010805.0, "step": 12730 }, { "entropy": 0.6897163927555084, "epoch": 0.10192, "grad_norm": 2.800447463989258, "learning_rate": 4.492236894757903e-05, "loss": 0.6937, "mean_token_accuracy": 0.7866669476032258, "num_tokens": 132155311.0, "step": 12740 }, { "entropy": 0.7653539836406708, "epoch": 0.102, "grad_norm": 5.7476582527160645, "learning_rate": 4.4918367346938776e-05, "loss": 0.7563, "mean_token_accuracy": 0.8023430824279785, "num_tokens": 132195992.0, "step": 12750 }, { "entropy": 0.6645751237869263, "epoch": 0.10208, "grad_norm": 1.8159748315811157, "learning_rate": 4.491436574629852e-05, "loss": 0.6613, "mean_token_accuracy": 0.792666107416153, "num_tokens": 132359832.0, "step": 12760 }, { "entropy": 0.7256938517093658, "epoch": 0.10216, "grad_norm": 3.8898422718048096, "learning_rate": 4.491036414565827e-05, "loss": 0.7324, "mean_token_accuracy": 0.7913804531097413, "num_tokens": 132456941.0, "step": 12770 }, { "entropy": 0.683496767282486, "epoch": 0.10224, "grad_norm": 2.4720771312713623, "learning_rate": 4.490636254501801e-05, "loss": 0.6978, "mean_token_accuracy": 0.8020108461380004, "num_tokens": 132551933.0, "step": 12780 }, { "entropy": 0.6769181758165359, "epoch": 0.10232, "grad_norm": 2.6750106811523438, "learning_rate": 4.490236094437775e-05, "loss": 0.6653, "mean_token_accuracy": 0.7966098845005035, "num_tokens": 132685134.0, "step": 12790 }, { "entropy": 0.7761743724346161, "epoch": 0.1024, "grad_norm": 5.3057732582092285, "learning_rate": 4.4898359343737495e-05, "loss": 0.7615, "mean_token_accuracy": 0.8062630534172058, "num_tokens": 132718740.0, "step": 12800 }, { "entropy": 0.6137708693742752, "epoch": 0.10248, "grad_norm": 1.7794442176818848, "learning_rate": 4.4894357743097245e-05, "loss": 0.6158, "mean_token_accuracy": 0.8052112460136414, "num_tokens": 132882257.0, "step": 12810 }, { "entropy": 0.6924181282520294, "epoch": 0.10256, "grad_norm": 4.430352210998535, "learning_rate": 4.489035614245698e-05, "loss": 0.682, "mean_token_accuracy": 0.7997621357440948, "num_tokens": 132965626.0, "step": 12820 }, { "entropy": 0.710853773355484, "epoch": 0.10264, "grad_norm": 1.5340431928634644, "learning_rate": 4.4886354541816726e-05, "loss": 0.7181, "mean_token_accuracy": 0.7932642519474029, "num_tokens": 133059907.0, "step": 12830 }, { "entropy": 0.7064619481563568, "epoch": 0.10272, "grad_norm": 2.2568633556365967, "learning_rate": 4.4882352941176476e-05, "loss": 0.6975, "mean_token_accuracy": 0.7835655748844147, "num_tokens": 133212369.0, "step": 12840 }, { "entropy": 0.6919672220945359, "epoch": 0.1028, "grad_norm": 7.117642879486084, "learning_rate": 4.487835134053622e-05, "loss": 0.7065, "mean_token_accuracy": 0.8088865041732788, "num_tokens": 133253484.0, "step": 12850 }, { "entropy": 0.6629799664020538, "epoch": 0.10288, "grad_norm": 1.7557865381240845, "learning_rate": 4.487434973989596e-05, "loss": 0.6674, "mean_token_accuracy": 0.7897254824638367, "num_tokens": 133415617.0, "step": 12860 }, { "entropy": 0.7218241393566132, "epoch": 0.10296, "grad_norm": 3.326099395751953, "learning_rate": 4.48703481392557e-05, "loss": 0.7074, "mean_token_accuracy": 0.7958970487117767, "num_tokens": 133484800.0, "step": 12870 }, { "entropy": 0.7605546653270722, "epoch": 0.10304, "grad_norm": 1.6608340740203857, "learning_rate": 4.486634653861545e-05, "loss": 0.773, "mean_token_accuracy": 0.7853981494903565, "num_tokens": 133576249.0, "step": 12880 }, { "entropy": 0.7097661197185516, "epoch": 0.10312, "grad_norm": 2.7184345722198486, "learning_rate": 4.4862344937975195e-05, "loss": 0.7078, "mean_token_accuracy": 0.7843159437179565, "num_tokens": 133714362.0, "step": 12890 }, { "entropy": 0.7087177157402038, "epoch": 0.1032, "grad_norm": 4.6317620277404785, "learning_rate": 4.485834333733493e-05, "loss": 0.6987, "mean_token_accuracy": 0.8100135624408722, "num_tokens": 133755089.0, "step": 12900 }, { "entropy": 0.6612863302230835, "epoch": 0.10328, "grad_norm": 1.5584053993225098, "learning_rate": 4.485434173669468e-05, "loss": 0.6563, "mean_token_accuracy": 0.7933191597461701, "num_tokens": 133918842.0, "step": 12910 }, { "entropy": 0.6399185359477997, "epoch": 0.10336, "grad_norm": 4.085634708404541, "learning_rate": 4.4850340136054426e-05, "loss": 0.6373, "mean_token_accuracy": 0.8143190324306488, "num_tokens": 133995573.0, "step": 12920 }, { "entropy": 0.6784547150135041, "epoch": 0.10344, "grad_norm": 1.5235143899917603, "learning_rate": 4.484633853541417e-05, "loss": 0.6812, "mean_token_accuracy": 0.8063700795173645, "num_tokens": 134088705.0, "step": 12930 }, { "entropy": 0.7024430006742477, "epoch": 0.10352, "grad_norm": 2.1227011680603027, "learning_rate": 4.4842336934773907e-05, "loss": 0.6921, "mean_token_accuracy": 0.7905481517314911, "num_tokens": 134228029.0, "step": 12940 }, { "entropy": 0.7387674361467361, "epoch": 0.1036, "grad_norm": 6.861995697021484, "learning_rate": 4.483833533413366e-05, "loss": 0.7479, "mean_token_accuracy": 0.8008465588092804, "num_tokens": 134265355.0, "step": 12950 }, { "entropy": 0.6670814573764801, "epoch": 0.10368, "grad_norm": 2.563389778137207, "learning_rate": 4.48343337334934e-05, "loss": 0.6666, "mean_token_accuracy": 0.7881067812442779, "num_tokens": 134428686.0, "step": 12960 }, { "entropy": 0.717086809873581, "epoch": 0.10376, "grad_norm": 3.8248136043548584, "learning_rate": 4.4830332132853144e-05, "loss": 0.7115, "mean_token_accuracy": 0.7965283870697022, "num_tokens": 134506406.0, "step": 12970 }, { "entropy": 0.6639619827270508, "epoch": 0.10384, "grad_norm": 1.6763818264007568, "learning_rate": 4.482633053221288e-05, "loss": 0.6562, "mean_token_accuracy": 0.8094716250896454, "num_tokens": 134599761.0, "step": 12980 }, { "entropy": 0.7487948179244995, "epoch": 0.10392, "grad_norm": 4.181972503662109, "learning_rate": 4.482232893157263e-05, "loss": 0.7573, "mean_token_accuracy": 0.7766994118690491, "num_tokens": 134727599.0, "step": 12990 }, { "entropy": 0.7549664855003357, "epoch": 0.104, "grad_norm": 5.710048198699951, "learning_rate": 4.4818327330932376e-05, "loss": 0.7511, "mean_token_accuracy": 0.7978043377399444, "num_tokens": 134763729.0, "step": 13000 }, { "entropy": 0.6851869881153106, "epoch": 0.10408, "grad_norm": 2.0495405197143555, "learning_rate": 4.481432573029212e-05, "loss": 0.6835, "mean_token_accuracy": 0.7832926332950592, "num_tokens": 134927569.0, "step": 13010 }, { "entropy": 0.6863488614559173, "epoch": 0.10416, "grad_norm": 3.682401418685913, "learning_rate": 4.481032412965186e-05, "loss": 0.6766, "mean_token_accuracy": 0.8038320362567901, "num_tokens": 135023468.0, "step": 13020 }, { "entropy": 0.8072285771369934, "epoch": 0.10424, "grad_norm": 2.5130531787872314, "learning_rate": 4.480632252901161e-05, "loss": 0.8128, "mean_token_accuracy": 0.7773088634014129, "num_tokens": 135118656.0, "step": 13030 }, { "entropy": 0.7526118338108063, "epoch": 0.10432, "grad_norm": 2.909991502761841, "learning_rate": 4.480232092837135e-05, "loss": 0.7512, "mean_token_accuracy": 0.7724208831787109, "num_tokens": 135264430.0, "step": 13040 }, { "entropy": 0.6168117165565491, "epoch": 0.1044, "grad_norm": 5.359276294708252, "learning_rate": 4.4798319327731094e-05, "loss": 0.5991, "mean_token_accuracy": 0.8324706375598907, "num_tokens": 135309379.0, "step": 13050 }, { "entropy": 0.7010535061359405, "epoch": 0.10448, "grad_norm": 1.9213318824768066, "learning_rate": 4.479431772709084e-05, "loss": 0.706, "mean_token_accuracy": 0.7803004384040833, "num_tokens": 135473219.0, "step": 13060 }, { "entropy": 0.8030283510684967, "epoch": 0.10456, "grad_norm": 3.4494686126708984, "learning_rate": 4.479031612645058e-05, "loss": 0.7976, "mean_token_accuracy": 0.7784128427505493, "num_tokens": 135562702.0, "step": 13070 }, { "entropy": 0.7121814787387848, "epoch": 0.10464, "grad_norm": 1.727669358253479, "learning_rate": 4.4786314525810325e-05, "loss": 0.714, "mean_token_accuracy": 0.797942727804184, "num_tokens": 135657267.0, "step": 13080 }, { "entropy": 0.6678097724914551, "epoch": 0.10472, "grad_norm": 2.7924537658691406, "learning_rate": 4.4782312925170076e-05, "loss": 0.6536, "mean_token_accuracy": 0.7994554698467254, "num_tokens": 135809004.0, "step": 13090 }, { "entropy": 0.7882618010044098, "epoch": 0.1048, "grad_norm": 4.630250930786133, "learning_rate": 4.477831132452981e-05, "loss": 0.7879, "mean_token_accuracy": 0.7905635297298431, "num_tokens": 135859193.0, "step": 13100 }, { "entropy": 0.6700955748558044, "epoch": 0.10488, "grad_norm": 1.4312490224838257, "learning_rate": 4.4774309723889556e-05, "loss": 0.6673, "mean_token_accuracy": 0.7883915483951569, "num_tokens": 136023033.0, "step": 13110 }, { "entropy": 0.6641165852546692, "epoch": 0.10496, "grad_norm": 3.6883928775787354, "learning_rate": 4.47703081232493e-05, "loss": 0.6536, "mean_token_accuracy": 0.8064209461212158, "num_tokens": 136114773.0, "step": 13120 }, { "entropy": 0.7302615582942963, "epoch": 0.10504, "grad_norm": 3.1899964809417725, "learning_rate": 4.476630652260905e-05, "loss": 0.7458, "mean_token_accuracy": 0.7886205792427063, "num_tokens": 136208753.0, "step": 13130 }, { "entropy": 0.6839607119560241, "epoch": 0.10512, "grad_norm": 3.423858165740967, "learning_rate": 4.476230492196879e-05, "loss": 0.6685, "mean_token_accuracy": 0.7989146113395691, "num_tokens": 136349544.0, "step": 13140 }, { "entropy": 0.6698643237352371, "epoch": 0.1052, "grad_norm": 6.191068649291992, "learning_rate": 4.475830332132853e-05, "loss": 0.6723, "mean_token_accuracy": 0.8184991836547851, "num_tokens": 136389433.0, "step": 13150 }, { "entropy": 0.6512589812278747, "epoch": 0.10528, "grad_norm": 2.088247299194336, "learning_rate": 4.475430172068828e-05, "loss": 0.6529, "mean_token_accuracy": 0.7945896506309509, "num_tokens": 136553273.0, "step": 13160 }, { "entropy": 0.7174988090991974, "epoch": 0.10536, "grad_norm": 2.69903826713562, "learning_rate": 4.4750300120048025e-05, "loss": 0.7124, "mean_token_accuracy": 0.8001075565814972, "num_tokens": 136637302.0, "step": 13170 }, { "entropy": 0.697393947839737, "epoch": 0.10544, "grad_norm": 1.491231918334961, "learning_rate": 4.474629851940776e-05, "loss": 0.6917, "mean_token_accuracy": 0.8001511454582214, "num_tokens": 136731577.0, "step": 13180 }, { "entropy": 0.6931809723377228, "epoch": 0.10552, "grad_norm": 2.8941800594329834, "learning_rate": 4.4742296918767506e-05, "loss": 0.6828, "mean_token_accuracy": 0.7880883634090423, "num_tokens": 136872299.0, "step": 13190 }, { "entropy": 0.6376683264970779, "epoch": 0.1056, "grad_norm": 5.898960590362549, "learning_rate": 4.4738295318127257e-05, "loss": 0.6509, "mean_token_accuracy": 0.8223832011222839, "num_tokens": 136914064.0, "step": 13200 }, { "entropy": 0.6907670736312866, "epoch": 0.10568, "grad_norm": 1.7550300359725952, "learning_rate": 4.4734293717487e-05, "loss": 0.6882, "mean_token_accuracy": 0.784495609998703, "num_tokens": 137077904.0, "step": 13210 }, { "entropy": 0.6676172465085983, "epoch": 0.10576, "grad_norm": 3.567953586578369, "learning_rate": 4.473029211684674e-05, "loss": 0.6644, "mean_token_accuracy": 0.8103749394416809, "num_tokens": 137160844.0, "step": 13220 }, { "entropy": 0.7147712886333466, "epoch": 0.10584, "grad_norm": 2.0208847522735596, "learning_rate": 4.472629051620649e-05, "loss": 0.7305, "mean_token_accuracy": 0.792438405752182, "num_tokens": 137257024.0, "step": 13230 }, { "entropy": 0.7369763731956482, "epoch": 0.10592, "grad_norm": 2.8922886848449707, "learning_rate": 4.472228891556623e-05, "loss": 0.7199, "mean_token_accuracy": 0.7853587687015533, "num_tokens": 137377286.0, "step": 13240 }, { "entropy": 0.8292653679847717, "epoch": 0.106, "grad_norm": 4.982941150665283, "learning_rate": 4.4718287314925975e-05, "loss": 0.8333, "mean_token_accuracy": 0.7844678342342377, "num_tokens": 137410218.0, "step": 13250 }, { "entropy": 0.6810658931732178, "epoch": 0.10608, "grad_norm": 1.561933994293213, "learning_rate": 4.471428571428571e-05, "loss": 0.6788, "mean_token_accuracy": 0.7898927628993988, "num_tokens": 137573726.0, "step": 13260 }, { "entropy": 0.6519725143909454, "epoch": 0.10616, "grad_norm": 3.178067684173584, "learning_rate": 4.471028411364546e-05, "loss": 0.6434, "mean_token_accuracy": 0.8122871100902558, "num_tokens": 137655141.0, "step": 13270 }, { "entropy": 0.7609924614429474, "epoch": 0.10624, "grad_norm": 1.581850290298462, "learning_rate": 4.4706282513005206e-05, "loss": 0.7723, "mean_token_accuracy": 0.7811664819717408, "num_tokens": 137748813.0, "step": 13280 }, { "entropy": 0.6797802329063416, "epoch": 0.10632, "grad_norm": 2.2988717555999756, "learning_rate": 4.470228091236495e-05, "loss": 0.6714, "mean_token_accuracy": 0.7937829375267029, "num_tokens": 137880009.0, "step": 13290 }, { "entropy": 0.8198541581630707, "epoch": 0.1064, "grad_norm": 4.843915939331055, "learning_rate": 4.4698279311724694e-05, "loss": 0.81, "mean_token_accuracy": 0.7846676707267761, "num_tokens": 137918722.0, "step": 13300 }, { "entropy": 0.6489015102386475, "epoch": 0.10648, "grad_norm": 2.196213722229004, "learning_rate": 4.469427771108444e-05, "loss": 0.6512, "mean_token_accuracy": 0.7957683563232422, "num_tokens": 138078575.0, "step": 13310 }, { "entropy": 0.7006651937961579, "epoch": 0.10656, "grad_norm": 3.8214938640594482, "learning_rate": 4.469027611044418e-05, "loss": 0.7001, "mean_token_accuracy": 0.8054262280464173, "num_tokens": 138152280.0, "step": 13320 }, { "entropy": 0.7421425223350525, "epoch": 0.10664, "grad_norm": 1.883183479309082, "learning_rate": 4.4686274509803925e-05, "loss": 0.7365, "mean_token_accuracy": 0.7948309540748596, "num_tokens": 138246093.0, "step": 13330 }, { "entropy": 0.6678024709224701, "epoch": 0.10672, "grad_norm": 2.480055570602417, "learning_rate": 4.468227290916367e-05, "loss": 0.6738, "mean_token_accuracy": 0.787718516588211, "num_tokens": 138389047.0, "step": 13340 }, { "entropy": 0.7057091176509858, "epoch": 0.1068, "grad_norm": 6.749701023101807, "learning_rate": 4.467827130852341e-05, "loss": 0.7106, "mean_token_accuracy": 0.813012319803238, "num_tokens": 138428249.0, "step": 13350 }, { "entropy": 0.6869760155677795, "epoch": 0.10688, "grad_norm": 1.924485683441162, "learning_rate": 4.4674269707883156e-05, "loss": 0.6892, "mean_token_accuracy": 0.7839887619018555, "num_tokens": 138592089.0, "step": 13360 }, { "entropy": 0.7195118486881256, "epoch": 0.10696, "grad_norm": 2.867978096008301, "learning_rate": 4.46702681072429e-05, "loss": 0.7149, "mean_token_accuracy": 0.7930991590023041, "num_tokens": 138693102.0, "step": 13370 }, { "entropy": 0.6600325286388398, "epoch": 0.10704, "grad_norm": 1.8824542760849, "learning_rate": 4.4666266506602643e-05, "loss": 0.6579, "mean_token_accuracy": 0.8079476416110992, "num_tokens": 138788547.0, "step": 13380 }, { "entropy": 0.6341437339782715, "epoch": 0.10712, "grad_norm": 3.6646602153778076, "learning_rate": 4.466226490596239e-05, "loss": 0.6255, "mean_token_accuracy": 0.8058064818382263, "num_tokens": 138922332.0, "step": 13390 }, { "entropy": 0.6724238961935043, "epoch": 0.1072, "grad_norm": 3.9376540184020996, "learning_rate": 4.465826330532213e-05, "loss": 0.6693, "mean_token_accuracy": 0.8169586062431335, "num_tokens": 138959996.0, "step": 13400 }, { "entropy": 0.6287719011306763, "epoch": 0.10728, "grad_norm": 1.5317541360855103, "learning_rate": 4.4654261704681875e-05, "loss": 0.6322, "mean_token_accuracy": 0.7975288271903992, "num_tokens": 139123806.0, "step": 13410 }, { "entropy": 0.7139669835567475, "epoch": 0.10736, "grad_norm": 3.4152605533599854, "learning_rate": 4.465026010404162e-05, "loss": 0.7108, "mean_token_accuracy": 0.7988036692142486, "num_tokens": 139207407.0, "step": 13420 }, { "entropy": 0.7087276697158813, "epoch": 0.10744, "grad_norm": 1.5343018770217896, "learning_rate": 4.464625850340136e-05, "loss": 0.7007, "mean_token_accuracy": 0.79453906416893, "num_tokens": 139302643.0, "step": 13430 }, { "entropy": 0.7114257872104645, "epoch": 0.10752, "grad_norm": 2.2290706634521484, "learning_rate": 4.464225690276111e-05, "loss": 0.7139, "mean_token_accuracy": 0.7875915706157685, "num_tokens": 139437161.0, "step": 13440 }, { "entropy": 0.742443984746933, "epoch": 0.1076, "grad_norm": 5.8118510246276855, "learning_rate": 4.463825530212085e-05, "loss": 0.7375, "mean_token_accuracy": 0.8099598705768585, "num_tokens": 139474100.0, "step": 13450 }, { "entropy": 0.7052219927310943, "epoch": 0.10768, "grad_norm": 1.4226816892623901, "learning_rate": 4.463425370148059e-05, "loss": 0.6945, "mean_token_accuracy": 0.7842330276966095, "num_tokens": 139637940.0, "step": 13460 }, { "entropy": 0.7397510766983032, "epoch": 0.10776, "grad_norm": 5.591158866882324, "learning_rate": 4.463025210084034e-05, "loss": 0.7375, "mean_token_accuracy": 0.7887979567050933, "num_tokens": 139727813.0, "step": 13470 }, { "entropy": 0.7249106347560883, "epoch": 0.10784, "grad_norm": 2.0629029273986816, "learning_rate": 4.462625050020009e-05, "loss": 0.7354, "mean_token_accuracy": 0.7920652091503143, "num_tokens": 139820743.0, "step": 13480 }, { "entropy": 0.7306534826755524, "epoch": 0.10792, "grad_norm": 2.398099184036255, "learning_rate": 4.4622248899559824e-05, "loss": 0.7256, "mean_token_accuracy": 0.7848012089729309, "num_tokens": 139956230.0, "step": 13490 }, { "entropy": 0.6802960395812988, "epoch": 0.108, "grad_norm": 4.387956619262695, "learning_rate": 4.461824729891957e-05, "loss": 0.6756, "mean_token_accuracy": 0.8208883047103882, "num_tokens": 139989506.0, "step": 13500 }, { "entropy": 0.6716837346553802, "epoch": 0.10808, "grad_norm": 1.973076581954956, "learning_rate": 4.461424569827931e-05, "loss": 0.675, "mean_token_accuracy": 0.7853871583938599, "num_tokens": 140153346.0, "step": 13510 }, { "entropy": 0.6670971870422363, "epoch": 0.10816, "grad_norm": 3.779250383377075, "learning_rate": 4.461024409763906e-05, "loss": 0.6619, "mean_token_accuracy": 0.802994179725647, "num_tokens": 140241615.0, "step": 13520 }, { "entropy": 0.8041805446147918, "epoch": 0.10824, "grad_norm": 1.8028676509857178, "learning_rate": 4.46062424969988e-05, "loss": 0.8115, "mean_token_accuracy": 0.7753578901290894, "num_tokens": 140337169.0, "step": 13530 }, { "entropy": 0.7099717736244202, "epoch": 0.10832, "grad_norm": 3.327911853790283, "learning_rate": 4.460224089635854e-05, "loss": 0.701, "mean_token_accuracy": 0.7849129676818848, "num_tokens": 140478295.0, "step": 13540 }, { "entropy": 0.6699130326509476, "epoch": 0.1084, "grad_norm": 4.284481525421143, "learning_rate": 4.459823929571829e-05, "loss": 0.6652, "mean_token_accuracy": 0.8217287480831146, "num_tokens": 140520444.0, "step": 13550 }, { "entropy": 0.702704393863678, "epoch": 0.10848, "grad_norm": 2.180697202682495, "learning_rate": 4.459423769507804e-05, "loss": 0.7029, "mean_token_accuracy": 0.7797630786895752, "num_tokens": 140684284.0, "step": 13560 }, { "entropy": 0.609523069858551, "epoch": 0.10856, "grad_norm": 4.056252956390381, "learning_rate": 4.4590236094437774e-05, "loss": 0.6118, "mean_token_accuracy": 0.8193258285522461, "num_tokens": 140768125.0, "step": 13570 }, { "entropy": 0.7716795921325683, "epoch": 0.10864, "grad_norm": 2.0011706352233887, "learning_rate": 4.458623449379752e-05, "loss": 0.8007, "mean_token_accuracy": 0.7776495814323425, "num_tokens": 140863264.0, "step": 13580 }, { "entropy": 0.710615074634552, "epoch": 0.10872, "grad_norm": 2.9999051094055176, "learning_rate": 4.458223289315727e-05, "loss": 0.6926, "mean_token_accuracy": 0.7885177493095398, "num_tokens": 141000617.0, "step": 13590 }, { "entropy": 0.6420779585838318, "epoch": 0.1088, "grad_norm": 5.27156400680542, "learning_rate": 4.457823129251701e-05, "loss": 0.6617, "mean_token_accuracy": 0.8207698583602905, "num_tokens": 141039683.0, "step": 13600 }, { "entropy": 0.7179996728897095, "epoch": 0.10888, "grad_norm": 1.6806533336639404, "learning_rate": 4.457422969187675e-05, "loss": 0.7183, "mean_token_accuracy": 0.777155601978302, "num_tokens": 141203523.0, "step": 13610 }, { "entropy": 0.6440317869186402, "epoch": 0.10896, "grad_norm": 3.259068727493286, "learning_rate": 4.45702280912365e-05, "loss": 0.6309, "mean_token_accuracy": 0.8135764420032501, "num_tokens": 141286918.0, "step": 13620 }, { "entropy": 0.7723020255565644, "epoch": 0.10904, "grad_norm": 2.223850965499878, "learning_rate": 4.456622649059624e-05, "loss": 0.7798, "mean_token_accuracy": 0.7808313012123108, "num_tokens": 141381494.0, "step": 13630 }, { "entropy": 0.7448275685310364, "epoch": 0.10912, "grad_norm": 3.577585220336914, "learning_rate": 4.456222488995599e-05, "loss": 0.7431, "mean_token_accuracy": 0.7795851349830627, "num_tokens": 141509111.0, "step": 13640 }, { "entropy": 0.7436928689479828, "epoch": 0.1092, "grad_norm": 7.137475490570068, "learning_rate": 4.4558223289315724e-05, "loss": 0.7282, "mean_token_accuracy": 0.8132046461105347, "num_tokens": 141541487.0, "step": 13650 }, { "entropy": 0.7231678247451783, "epoch": 0.10928, "grad_norm": 2.044989585876465, "learning_rate": 4.4554221688675474e-05, "loss": 0.7315, "mean_token_accuracy": 0.7757205784320831, "num_tokens": 141705327.0, "step": 13660 }, { "entropy": 0.7021678030490875, "epoch": 0.10936, "grad_norm": 4.673795700073242, "learning_rate": 4.455022008803522e-05, "loss": 0.6761, "mean_token_accuracy": 0.8084500432014465, "num_tokens": 141780973.0, "step": 13670 }, { "entropy": 0.6758604407310486, "epoch": 0.10944, "grad_norm": 1.8211995363235474, "learning_rate": 4.454621848739496e-05, "loss": 0.6733, "mean_token_accuracy": 0.803162407875061, "num_tokens": 141873997.0, "step": 13680 }, { "entropy": 0.7123992204666137, "epoch": 0.10952, "grad_norm": 3.2815349102020264, "learning_rate": 4.4542216886754705e-05, "loss": 0.7177, "mean_token_accuracy": 0.7822850286960602, "num_tokens": 142018875.0, "step": 13690 }, { "entropy": 0.6768414348363876, "epoch": 0.1096, "grad_norm": 5.956862449645996, "learning_rate": 4.453821528611445e-05, "loss": 0.6585, "mean_token_accuracy": 0.8221333503723145, "num_tokens": 142062054.0, "step": 13700 }, { "entropy": 0.6605724632740021, "epoch": 0.10968, "grad_norm": 1.570881724357605, "learning_rate": 4.453421368547419e-05, "loss": 0.6596, "mean_token_accuracy": 0.7908097326755523, "num_tokens": 142225894.0, "step": 13710 }, { "entropy": 0.6772403717041016, "epoch": 0.10976, "grad_norm": 3.5056025981903076, "learning_rate": 4.4530212084833936e-05, "loss": 0.6753, "mean_token_accuracy": 0.8063795626163482, "num_tokens": 142316056.0, "step": 13720 }, { "entropy": 0.7558443248271942, "epoch": 0.10984, "grad_norm": 1.8553317785263062, "learning_rate": 4.452621048419368e-05, "loss": 0.7599, "mean_token_accuracy": 0.7866153717041016, "num_tokens": 142411016.0, "step": 13730 }, { "entropy": 0.6906444102525711, "epoch": 0.10992, "grad_norm": 2.3616862297058105, "learning_rate": 4.4522208883553424e-05, "loss": 0.6922, "mean_token_accuracy": 0.7896946489810943, "num_tokens": 142550616.0, "step": 13740 }, { "entropy": 0.7157075583934784, "epoch": 0.11, "grad_norm": 6.260347843170166, "learning_rate": 4.451820728291317e-05, "loss": 0.7118, "mean_token_accuracy": 0.8103891134262085, "num_tokens": 142590683.0, "step": 13750 }, { "entropy": 0.6680937767028808, "epoch": 0.11008, "grad_norm": 1.864138126373291, "learning_rate": 4.451420568227291e-05, "loss": 0.6674, "mean_token_accuracy": 0.7904189169406891, "num_tokens": 142754523.0, "step": 13760 }, { "entropy": 0.7727029263973236, "epoch": 0.11016, "grad_norm": 3.159268379211426, "learning_rate": 4.4510204081632655e-05, "loss": 0.7693, "mean_token_accuracy": 0.787707382440567, "num_tokens": 142838857.0, "step": 13770 }, { "entropy": 0.768585467338562, "epoch": 0.11024, "grad_norm": 1.9234226942062378, "learning_rate": 4.45062024809924e-05, "loss": 0.7674, "mean_token_accuracy": 0.7871525883674622, "num_tokens": 142931553.0, "step": 13780 }, { "entropy": 0.7354207038879395, "epoch": 0.11032, "grad_norm": 2.735771417617798, "learning_rate": 4.450220088035214e-05, "loss": 0.7335, "mean_token_accuracy": 0.7798019468784332, "num_tokens": 143064197.0, "step": 13790 }, { "entropy": 0.714363980293274, "epoch": 0.1104, "grad_norm": 4.494022369384766, "learning_rate": 4.4498199279711886e-05, "loss": 0.7054, "mean_token_accuracy": 0.8096810936927795, "num_tokens": 143098561.0, "step": 13800 }, { "entropy": 0.7307667374610901, "epoch": 0.11048, "grad_norm": 1.6420880556106567, "learning_rate": 4.449419767907163e-05, "loss": 0.7315, "mean_token_accuracy": 0.7783468306064606, "num_tokens": 143261433.0, "step": 13810 }, { "entropy": 0.638895896077156, "epoch": 0.11056, "grad_norm": 3.7772347927093506, "learning_rate": 4.4490196078431374e-05, "loss": 0.6372, "mean_token_accuracy": 0.8147620916366577, "num_tokens": 143340550.0, "step": 13820 }, { "entropy": 0.7962376415729523, "epoch": 0.11064, "grad_norm": 1.462742567062378, "learning_rate": 4.4486194477791124e-05, "loss": 0.7885, "mean_token_accuracy": 0.7806614935398102, "num_tokens": 143434440.0, "step": 13830 }, { "entropy": 0.729513156414032, "epoch": 0.11072, "grad_norm": 2.983980894088745, "learning_rate": 4.448219287715086e-05, "loss": 0.7299, "mean_token_accuracy": 0.7801513493061065, "num_tokens": 143575225.0, "step": 13840 }, { "entropy": 0.7747701287269593, "epoch": 0.1108, "grad_norm": 5.510197639465332, "learning_rate": 4.4478191276510605e-05, "loss": 0.7692, "mean_token_accuracy": 0.7958510398864747, "num_tokens": 143614493.0, "step": 13850 }, { "entropy": 0.7338309347629547, "epoch": 0.11088, "grad_norm": 1.8589494228363037, "learning_rate": 4.447418967587035e-05, "loss": 0.7338, "mean_token_accuracy": 0.7777784645557404, "num_tokens": 143778333.0, "step": 13860 }, { "entropy": 0.6903732240200042, "epoch": 0.11096, "grad_norm": 3.9449782371520996, "learning_rate": 4.44701880752301e-05, "loss": 0.69, "mean_token_accuracy": 0.8022910833358765, "num_tokens": 143859766.0, "step": 13870 }, { "entropy": 0.7279296040534973, "epoch": 0.11104, "grad_norm": 2.600619077682495, "learning_rate": 4.4466186474589836e-05, "loss": 0.734, "mean_token_accuracy": 0.7916768372058869, "num_tokens": 143952390.0, "step": 13880 }, { "entropy": 0.7081697225570679, "epoch": 0.11112, "grad_norm": 3.3202836513519287, "learning_rate": 4.446218487394958e-05, "loss": 0.6952, "mean_token_accuracy": 0.7879089295864106, "num_tokens": 144092906.0, "step": 13890 }, { "entropy": 0.6096508383750916, "epoch": 0.1112, "grad_norm": 5.378994464874268, "learning_rate": 4.445818327330933e-05, "loss": 0.6196, "mean_token_accuracy": 0.8293969511985779, "num_tokens": 144130598.0, "step": 13900 }, { "entropy": 0.6627937495708466, "epoch": 0.11128, "grad_norm": 1.5754203796386719, "learning_rate": 4.4454181672669074e-05, "loss": 0.6637, "mean_token_accuracy": 0.7921409487724305, "num_tokens": 144294438.0, "step": 13910 }, { "entropy": 0.7078159987926483, "epoch": 0.11136, "grad_norm": 3.7972891330718994, "learning_rate": 4.445018007202881e-05, "loss": 0.7118, "mean_token_accuracy": 0.8000567495822907, "num_tokens": 144380345.0, "step": 13920 }, { "entropy": 0.7439123928546906, "epoch": 0.11144, "grad_norm": 2.2515578269958496, "learning_rate": 4.4446178471388554e-05, "loss": 0.7303, "mean_token_accuracy": 0.7952271044254303, "num_tokens": 144473819.0, "step": 13930 }, { "entropy": 0.6859265804290772, "epoch": 0.11152, "grad_norm": 3.52822208404541, "learning_rate": 4.4442176870748305e-05, "loss": 0.6755, "mean_token_accuracy": 0.794916981458664, "num_tokens": 144591596.0, "step": 13940 }, { "entropy": 0.7431561827659607, "epoch": 0.1116, "grad_norm": 6.824023723602295, "learning_rate": 4.443817527010805e-05, "loss": 0.75, "mean_token_accuracy": 0.8086226582527161, "num_tokens": 144622302.0, "step": 13950 }, { "entropy": 0.6780237436294556, "epoch": 0.11168, "grad_norm": 1.5918776988983154, "learning_rate": 4.4434173669467786e-05, "loss": 0.675, "mean_token_accuracy": 0.7868649303913117, "num_tokens": 144786142.0, "step": 13960 }, { "entropy": 0.6718255788087845, "epoch": 0.11176, "grad_norm": 2.495197057723999, "learning_rate": 4.4430172068827536e-05, "loss": 0.6716, "mean_token_accuracy": 0.8072348594665527, "num_tokens": 144880314.0, "step": 13970 }, { "entropy": 0.6773240089416503, "epoch": 0.11184, "grad_norm": 1.581639051437378, "learning_rate": 4.442617046818728e-05, "loss": 0.686, "mean_token_accuracy": 0.8004550695419311, "num_tokens": 144975026.0, "step": 13980 }, { "entropy": 0.7018521428108215, "epoch": 0.11192, "grad_norm": 3.7071564197540283, "learning_rate": 4.4422168867547024e-05, "loss": 0.6907, "mean_token_accuracy": 0.787895393371582, "num_tokens": 145119243.0, "step": 13990 }, { "entropy": 0.7615679740905762, "epoch": 0.112, "grad_norm": 4.488683700561523, "learning_rate": 4.441816726690676e-05, "loss": 0.7749, "mean_token_accuracy": 0.7952431261539459, "num_tokens": 145162778.0, "step": 14000 }, { "entropy": 0.7534519314765931, "epoch": 0.11208, "grad_norm": 2.5940351486206055, "learning_rate": 4.441416566626651e-05, "loss": 0.7506, "mean_token_accuracy": 0.7722607016563415, "num_tokens": 145326183.0, "step": 14010 }, { "entropy": 0.7013078302145004, "epoch": 0.11216, "grad_norm": 3.271075963973999, "learning_rate": 4.4410164065626255e-05, "loss": 0.6924, "mean_token_accuracy": 0.7974757730960846, "num_tokens": 145415471.0, "step": 14020 }, { "entropy": 0.7032326221466064, "epoch": 0.11224, "grad_norm": 2.266655445098877, "learning_rate": 4.4406162464986e-05, "loss": 0.703, "mean_token_accuracy": 0.7993625342845917, "num_tokens": 145510055.0, "step": 14030 }, { "entropy": 0.7215204775333405, "epoch": 0.11232, "grad_norm": 2.5365347862243652, "learning_rate": 4.440216086434574e-05, "loss": 0.7174, "mean_token_accuracy": 0.7876333773136139, "num_tokens": 145649492.0, "step": 14040 }, { "entropy": 0.774131965637207, "epoch": 0.1124, "grad_norm": 5.307178974151611, "learning_rate": 4.4398159263705486e-05, "loss": 0.7785, "mean_token_accuracy": 0.7985602915287018, "num_tokens": 145687710.0, "step": 14050 }, { "entropy": 0.7158747494220734, "epoch": 0.11248, "grad_norm": 1.606046438217163, "learning_rate": 4.439415766306523e-05, "loss": 0.7168, "mean_token_accuracy": 0.7766487121582031, "num_tokens": 145851505.0, "step": 14060 }, { "entropy": 0.8297863751649857, "epoch": 0.11256, "grad_norm": 3.377916097640991, "learning_rate": 4.439015606242497e-05, "loss": 0.814, "mean_token_accuracy": 0.7782183945178985, "num_tokens": 145925055.0, "step": 14070 }, { "entropy": 0.7025668859481812, "epoch": 0.11264, "grad_norm": 1.4892218112945557, "learning_rate": 4.438615446178472e-05, "loss": 0.7081, "mean_token_accuracy": 0.8016690671443939, "num_tokens": 146016721.0, "step": 14080 }, { "entropy": 0.6920515835285187, "epoch": 0.11272, "grad_norm": 2.729424238204956, "learning_rate": 4.438215286114446e-05, "loss": 0.6879, "mean_token_accuracy": 0.7927304446697235, "num_tokens": 146137047.0, "step": 14090 }, { "entropy": 0.6700530052185059, "epoch": 0.1128, "grad_norm": 4.637156963348389, "learning_rate": 4.4378151260504204e-05, "loss": 0.6677, "mean_token_accuracy": 0.8203172147274017, "num_tokens": 146177673.0, "step": 14100 }, { "entropy": 0.6858945369720459, "epoch": 0.11288, "grad_norm": 1.8907978534698486, "learning_rate": 4.437414965986395e-05, "loss": 0.6859, "mean_token_accuracy": 0.7861565768718719, "num_tokens": 146341513.0, "step": 14110 }, { "entropy": 0.6125393152236939, "epoch": 0.11296, "grad_norm": 3.8439176082611084, "learning_rate": 4.437014805922369e-05, "loss": 0.6154, "mean_token_accuracy": 0.8201582252979278, "num_tokens": 146423071.0, "step": 14120 }, { "entropy": 0.7328998863697052, "epoch": 0.11304, "grad_norm": 2.2699148654937744, "learning_rate": 4.4366146458583436e-05, "loss": 0.7288, "mean_token_accuracy": 0.7911107003688812, "num_tokens": 146516029.0, "step": 14130 }, { "entropy": 0.7400680363178254, "epoch": 0.11312, "grad_norm": 2.3878395557403564, "learning_rate": 4.436214485794318e-05, "loss": 0.7259, "mean_token_accuracy": 0.7762546598911285, "num_tokens": 146660023.0, "step": 14140 }, { "entropy": 0.7149584949016571, "epoch": 0.1132, "grad_norm": 6.981825828552246, "learning_rate": 4.435814325730292e-05, "loss": 0.7158, "mean_token_accuracy": 0.813197809457779, "num_tokens": 146697170.0, "step": 14150 }, { "entropy": 0.6912393867969513, "epoch": 0.11328, "grad_norm": 1.8225270509719849, "learning_rate": 4.435414165666267e-05, "loss": 0.6969, "mean_token_accuracy": 0.7840165317058563, "num_tokens": 146860819.0, "step": 14160 }, { "entropy": 0.6773286283016204, "epoch": 0.11336, "grad_norm": 3.2511584758758545, "learning_rate": 4.435014005602241e-05, "loss": 0.6632, "mean_token_accuracy": 0.8096414268016815, "num_tokens": 146941344.0, "step": 14170 }, { "entropy": 0.6717011570930481, "epoch": 0.11344, "grad_norm": 1.9554314613342285, "learning_rate": 4.4346138455382154e-05, "loss": 0.6723, "mean_token_accuracy": 0.804544848203659, "num_tokens": 147036165.0, "step": 14180 }, { "entropy": 0.6456089496612549, "epoch": 0.11352, "grad_norm": 1.9241539239883423, "learning_rate": 4.43421368547419e-05, "loss": 0.6373, "mean_token_accuracy": 0.8008222460746766, "num_tokens": 147184668.0, "step": 14190 }, { "entropy": 0.7230643272399903, "epoch": 0.1136, "grad_norm": 5.714893817901611, "learning_rate": 4.433813525410164e-05, "loss": 0.7168, "mean_token_accuracy": 0.809415590763092, "num_tokens": 147224154.0, "step": 14200 }, { "entropy": 0.7337169647216797, "epoch": 0.11368, "grad_norm": 2.5163609981536865, "learning_rate": 4.4334133653461385e-05, "loss": 0.7396, "mean_token_accuracy": 0.7705605804920197, "num_tokens": 147387994.0, "step": 14210 }, { "entropy": 0.7566720485687256, "epoch": 0.11376, "grad_norm": 4.856236934661865, "learning_rate": 4.4330132052821136e-05, "loss": 0.7539, "mean_token_accuracy": 0.7879972338676453, "num_tokens": 147467306.0, "step": 14220 }, { "entropy": 0.7022146284580231, "epoch": 0.11384, "grad_norm": 1.807555913925171, "learning_rate": 4.432613045218087e-05, "loss": 0.7066, "mean_token_accuracy": 0.8015453398227692, "num_tokens": 147558672.0, "step": 14230 }, { "entropy": 0.6759166717529297, "epoch": 0.11392, "grad_norm": 2.9465270042419434, "learning_rate": 4.4322128851540616e-05, "loss": 0.6672, "mean_token_accuracy": 0.7908577203750611, "num_tokens": 147711042.0, "step": 14240 }, { "entropy": 0.7246960639953614, "epoch": 0.114, "grad_norm": 6.091980457305908, "learning_rate": 4.431812725090036e-05, "loss": 0.7335, "mean_token_accuracy": 0.8021434843540192, "num_tokens": 147754229.0, "step": 14250 }, { "entropy": 0.6479903399944306, "epoch": 0.11408, "grad_norm": 2.1164658069610596, "learning_rate": 4.431412565026011e-05, "loss": 0.6475, "mean_token_accuracy": 0.7951646327972413, "num_tokens": 147916985.0, "step": 14260 }, { "entropy": 0.6937673568725586, "epoch": 0.11416, "grad_norm": 4.147453784942627, "learning_rate": 4.431012404961985e-05, "loss": 0.6871, "mean_token_accuracy": 0.8046681523323059, "num_tokens": 147996893.0, "step": 14270 }, { "entropy": 0.7375596940517426, "epoch": 0.11424, "grad_norm": 2.6468100547790527, "learning_rate": 4.430612244897959e-05, "loss": 0.7447, "mean_token_accuracy": 0.7848497450351715, "num_tokens": 148091090.0, "step": 14280 }, { "entropy": 0.7154874444007874, "epoch": 0.11432, "grad_norm": 2.6503758430480957, "learning_rate": 4.430212084833934e-05, "loss": 0.7118, "mean_token_accuracy": 0.7803176581859589, "num_tokens": 148228972.0, "step": 14290 }, { "entropy": 0.7446654498577118, "epoch": 0.1144, "grad_norm": 5.8292412757873535, "learning_rate": 4.4298119247699085e-05, "loss": 0.731, "mean_token_accuracy": 0.8046973705291748, "num_tokens": 148268940.0, "step": 14300 }, { "entropy": 0.6704065680503846, "epoch": 0.11448, "grad_norm": 1.8059366941452026, "learning_rate": 4.429411764705882e-05, "loss": 0.6696, "mean_token_accuracy": 0.7886724472045898, "num_tokens": 148432780.0, "step": 14310 }, { "entropy": 0.6835756182670594, "epoch": 0.11456, "grad_norm": 3.5079550743103027, "learning_rate": 4.4290116046418566e-05, "loss": 0.6752, "mean_token_accuracy": 0.8016940891742707, "num_tokens": 148525236.0, "step": 14320 }, { "entropy": 0.7794900596141815, "epoch": 0.11464, "grad_norm": 2.301759719848633, "learning_rate": 4.4286114445778317e-05, "loss": 0.7968, "mean_token_accuracy": 0.7727092862129211, "num_tokens": 148619175.0, "step": 14330 }, { "entropy": 0.7232289552688599, "epoch": 0.11472, "grad_norm": 2.575477123260498, "learning_rate": 4.428211284513806e-05, "loss": 0.7133, "mean_token_accuracy": 0.7855717957019805, "num_tokens": 148758815.0, "step": 14340 }, { "entropy": 0.7472393095493317, "epoch": 0.1148, "grad_norm": 5.4523138999938965, "learning_rate": 4.42781112444978e-05, "loss": 0.7506, "mean_token_accuracy": 0.8029707491397857, "num_tokens": 148798369.0, "step": 14350 }, { "entropy": 0.6884359061717987, "epoch": 0.11488, "grad_norm": 2.000256299972534, "learning_rate": 4.427410964385755e-05, "loss": 0.6849, "mean_token_accuracy": 0.783778065443039, "num_tokens": 148962033.0, "step": 14360 }, { "entropy": 0.7163511753082276, "epoch": 0.11496, "grad_norm": 2.8980836868286133, "learning_rate": 4.427010804321729e-05, "loss": 0.7126, "mean_token_accuracy": 0.8009560465812683, "num_tokens": 149041271.0, "step": 14370 }, { "entropy": 0.6849063336849213, "epoch": 0.11504, "grad_norm": 2.3728084564208984, "learning_rate": 4.4266106442577035e-05, "loss": 0.6969, "mean_token_accuracy": 0.7985885441303253, "num_tokens": 149134855.0, "step": 14380 }, { "entropy": 0.6453555405139924, "epoch": 0.11512, "grad_norm": 2.9263830184936523, "learning_rate": 4.426210484193677e-05, "loss": 0.6391, "mean_token_accuracy": 0.7995188593864441, "num_tokens": 149268306.0, "step": 14390 }, { "entropy": 0.6896302103996277, "epoch": 0.1152, "grad_norm": 5.07102108001709, "learning_rate": 4.425810324129652e-05, "loss": 0.6894, "mean_token_accuracy": 0.8093934834003449, "num_tokens": 149308875.0, "step": 14400 }, { "entropy": 0.6415322601795197, "epoch": 0.11528, "grad_norm": 1.6381442546844482, "learning_rate": 4.4254101640656266e-05, "loss": 0.6493, "mean_token_accuracy": 0.7944797217845917, "num_tokens": 149472715.0, "step": 14410 }, { "entropy": 0.651087862253189, "epoch": 0.11536, "grad_norm": 3.0312612056732178, "learning_rate": 4.425010004001601e-05, "loss": 0.638, "mean_token_accuracy": 0.8135944545269013, "num_tokens": 149561183.0, "step": 14420 }, { "entropy": 0.7230640649795532, "epoch": 0.11544, "grad_norm": 1.773868441581726, "learning_rate": 4.4246098439375754e-05, "loss": 0.7469, "mean_token_accuracy": 0.7902368128299713, "num_tokens": 149656277.0, "step": 14430 }, { "entropy": 0.7080495059490204, "epoch": 0.11552, "grad_norm": 2.7376701831817627, "learning_rate": 4.42420968387355e-05, "loss": 0.7028, "mean_token_accuracy": 0.7846698760986328, "num_tokens": 149808250.0, "step": 14440 }, { "entropy": 0.6966323792934418, "epoch": 0.1156, "grad_norm": 4.3460693359375, "learning_rate": 4.423809523809524e-05, "loss": 0.6844, "mean_token_accuracy": 0.8098299384117127, "num_tokens": 149860798.0, "step": 14450 }, { "entropy": 0.6609305322170258, "epoch": 0.11568, "grad_norm": 2.1107821464538574, "learning_rate": 4.4234093637454985e-05, "loss": 0.6584, "mean_token_accuracy": 0.7910112380981446, "num_tokens": 150024638.0, "step": 14460 }, { "entropy": 0.7167943239212036, "epoch": 0.11576, "grad_norm": 3.0979342460632324, "learning_rate": 4.423009203681473e-05, "loss": 0.714, "mean_token_accuracy": 0.7945050120353698, "num_tokens": 150126180.0, "step": 14470 }, { "entropy": 0.7188735246658325, "epoch": 0.11584, "grad_norm": 1.5726532936096191, "learning_rate": 4.422609043617447e-05, "loss": 0.7268, "mean_token_accuracy": 0.7920298933982849, "num_tokens": 150222083.0, "step": 14480 }, { "entropy": 0.7654588162899018, "epoch": 0.11592, "grad_norm": 3.0299274921417236, "learning_rate": 4.4222088835534216e-05, "loss": 0.7506, "mean_token_accuracy": 0.7762788653373718, "num_tokens": 150355587.0, "step": 14490 }, { "entropy": 0.6850974231958389, "epoch": 0.116, "grad_norm": 6.037807464599609, "learning_rate": 4.421808723489396e-05, "loss": 0.674, "mean_token_accuracy": 0.8179493904113769, "num_tokens": 150396228.0, "step": 14500 }, { "entropy": 0.6936413884162903, "epoch": 0.11608, "grad_norm": 1.6470681428909302, "learning_rate": 4.4214085634253703e-05, "loss": 0.7027, "mean_token_accuracy": 0.7801543653011322, "num_tokens": 150559553.0, "step": 14510 }, { "entropy": 0.5805510222911835, "epoch": 0.11616, "grad_norm": 2.903913736343384, "learning_rate": 4.421008403361345e-05, "loss": 0.57, "mean_token_accuracy": 0.8296395301818847, "num_tokens": 150632297.0, "step": 14520 }, { "entropy": 0.7281055092811585, "epoch": 0.11624, "grad_norm": 2.7253940105438232, "learning_rate": 4.420608243297319e-05, "loss": 0.7396, "mean_token_accuracy": 0.7908038914203643, "num_tokens": 150725253.0, "step": 14530 }, { "entropy": 0.7049033164978027, "epoch": 0.11632, "grad_norm": 2.3956048488616943, "learning_rate": 4.4202080832332935e-05, "loss": 0.7063, "mean_token_accuracy": 0.7883127450942993, "num_tokens": 150869269.0, "step": 14540 }, { "entropy": 0.654688423871994, "epoch": 0.1164, "grad_norm": 8.0786714553833, "learning_rate": 4.419807923169268e-05, "loss": 0.6444, "mean_token_accuracy": 0.8280728936195374, "num_tokens": 150905315.0, "step": 14550 }, { "entropy": 0.6647000670433044, "epoch": 0.11648, "grad_norm": 1.665379524230957, "learning_rate": 4.419407763105242e-05, "loss": 0.6606, "mean_token_accuracy": 0.7926844239234925, "num_tokens": 151069155.0, "step": 14560 }, { "entropy": 0.713859674334526, "epoch": 0.11656, "grad_norm": 2.9781219959259033, "learning_rate": 4.419007603041217e-05, "loss": 0.711, "mean_token_accuracy": 0.7916568756103516, "num_tokens": 151167789.0, "step": 14570 }, { "entropy": 0.680623322725296, "epoch": 0.11664, "grad_norm": 2.3997063636779785, "learning_rate": 4.418607442977191e-05, "loss": 0.696, "mean_token_accuracy": 0.7968083024024963, "num_tokens": 151262876.0, "step": 14580 }, { "entropy": 0.700680410861969, "epoch": 0.11672, "grad_norm": 2.1387627124786377, "learning_rate": 4.418207282913165e-05, "loss": 0.6893, "mean_token_accuracy": 0.7878820657730102, "num_tokens": 151415008.0, "step": 14590 }, { "entropy": 0.7220070958137512, "epoch": 0.1168, "grad_norm": 5.25945520401001, "learning_rate": 4.41780712284914e-05, "loss": 0.7134, "mean_token_accuracy": 0.807616651058197, "num_tokens": 151460417.0, "step": 14600 }, { "entropy": 0.6973098754882813, "epoch": 0.11688, "grad_norm": 1.7499048709869385, "learning_rate": 4.417406962785115e-05, "loss": 0.7002, "mean_token_accuracy": 0.7841432094573975, "num_tokens": 151621116.0, "step": 14610 }, { "entropy": 0.6568103462457657, "epoch": 0.11696, "grad_norm": 4.342935562133789, "learning_rate": 4.4170068027210884e-05, "loss": 0.65, "mean_token_accuracy": 0.8145648181438446, "num_tokens": 151692783.0, "step": 14620 }, { "entropy": 0.7246314346790313, "epoch": 0.11704, "grad_norm": 1.4981032609939575, "learning_rate": 4.416606642657063e-05, "loss": 0.7334, "mean_token_accuracy": 0.7939691126346589, "num_tokens": 151788283.0, "step": 14630 }, { "entropy": 0.7251832246780395, "epoch": 0.11712, "grad_norm": 2.3203063011169434, "learning_rate": 4.416206482593037e-05, "loss": 0.7319, "mean_token_accuracy": 0.7793831288814544, "num_tokens": 151928446.0, "step": 14640 }, { "entropy": 0.7221195250749588, "epoch": 0.1172, "grad_norm": 6.288846015930176, "learning_rate": 4.415806322529012e-05, "loss": 0.7161, "mean_token_accuracy": 0.8101840674877167, "num_tokens": 151967453.0, "step": 14650 }, { "entropy": 0.7028827786445617, "epoch": 0.11728, "grad_norm": 1.7562168836593628, "learning_rate": 4.415406162464986e-05, "loss": 0.7012, "mean_token_accuracy": 0.7826025784015656, "num_tokens": 152131293.0, "step": 14660 }, { "entropy": 0.6554537773132324, "epoch": 0.11736, "grad_norm": 2.891988515853882, "learning_rate": 4.41500600240096e-05, "loss": 0.6529, "mean_token_accuracy": 0.8100679874420166, "num_tokens": 152228321.0, "step": 14670 }, { "entropy": 0.6666979908943176, "epoch": 0.11744, "grad_norm": 1.8625670671463013, "learning_rate": 4.414605842336935e-05, "loss": 0.6884, "mean_token_accuracy": 0.799882972240448, "num_tokens": 152322532.0, "step": 14680 }, { "entropy": 0.8012755692005158, "epoch": 0.11752, "grad_norm": 2.098201036453247, "learning_rate": 4.41420568227291e-05, "loss": 0.777, "mean_token_accuracy": 0.7688610076904296, "num_tokens": 152461131.0, "step": 14690 }, { "entropy": 0.6734675586223602, "epoch": 0.1176, "grad_norm": 4.525849342346191, "learning_rate": 4.4138055222088834e-05, "loss": 0.6736, "mean_token_accuracy": 0.8157640039920807, "num_tokens": 152503726.0, "step": 14700 }, { "entropy": 0.66890290081501, "epoch": 0.11768, "grad_norm": 1.7982635498046875, "learning_rate": 4.413405362144858e-05, "loss": 0.6778, "mean_token_accuracy": 0.7876099169254303, "num_tokens": 152667566.0, "step": 14710 }, { "entropy": 0.7087140023708344, "epoch": 0.11776, "grad_norm": 3.5807831287384033, "learning_rate": 4.413005202080833e-05, "loss": 0.7021, "mean_token_accuracy": 0.7994684159755707, "num_tokens": 152747571.0, "step": 14720 }, { "entropy": 0.7397977650165558, "epoch": 0.11784, "grad_norm": 1.554025411605835, "learning_rate": 4.412605042016807e-05, "loss": 0.7558, "mean_token_accuracy": 0.7867425680160522, "num_tokens": 152841475.0, "step": 14730 }, { "entropy": 0.7701398670673371, "epoch": 0.11792, "grad_norm": 2.169642448425293, "learning_rate": 4.412204881952781e-05, "loss": 0.7547, "mean_token_accuracy": 0.7802496612071991, "num_tokens": 152967964.0, "step": 14740 }, { "entropy": 0.693942990899086, "epoch": 0.118, "grad_norm": 5.358114719390869, "learning_rate": 4.411804721888756e-05, "loss": 0.6856, "mean_token_accuracy": 0.8151604235172272, "num_tokens": 153005226.0, "step": 14750 }, { "entropy": 0.697349202632904, "epoch": 0.11808, "grad_norm": 1.9005184173583984, "learning_rate": 4.41140456182473e-05, "loss": 0.7032, "mean_token_accuracy": 0.7853016674518585, "num_tokens": 153169066.0, "step": 14760 }, { "entropy": 0.6717886924743652, "epoch": 0.11816, "grad_norm": 3.8569412231445312, "learning_rate": 4.411004401760705e-05, "loss": 0.6646, "mean_token_accuracy": 0.8082966983318329, "num_tokens": 153255664.0, "step": 14770 }, { "entropy": 0.7319947481155396, "epoch": 0.11824, "grad_norm": 2.487520456314087, "learning_rate": 4.4106042416966784e-05, "loss": 0.7515, "mean_token_accuracy": 0.7910185158252716, "num_tokens": 153349969.0, "step": 14780 }, { "entropy": 0.7251519024372101, "epoch": 0.11832, "grad_norm": 2.57621431350708, "learning_rate": 4.4102040816326534e-05, "loss": 0.7108, "mean_token_accuracy": 0.7873547315597534, "num_tokens": 153472398.0, "step": 14790 }, { "entropy": 0.6720200657844544, "epoch": 0.1184, "grad_norm": 6.328575134277344, "learning_rate": 4.409803921568628e-05, "loss": 0.6509, "mean_token_accuracy": 0.8201287031173706, "num_tokens": 153507975.0, "step": 14800 }, { "entropy": 0.6747949719429016, "epoch": 0.11848, "grad_norm": 2.4004244804382324, "learning_rate": 4.409403761504602e-05, "loss": 0.6819, "mean_token_accuracy": 0.7857840836048127, "num_tokens": 153671815.0, "step": 14810 }, { "entropy": 0.7686196386814117, "epoch": 0.11856, "grad_norm": 3.972884178161621, "learning_rate": 4.4090036014405765e-05, "loss": 0.7631, "mean_token_accuracy": 0.784716111421585, "num_tokens": 153768028.0, "step": 14820 }, { "entropy": 0.684772178530693, "epoch": 0.11864, "grad_norm": 2.5076913833618164, "learning_rate": 4.408603441376551e-05, "loss": 0.6855, "mean_token_accuracy": 0.8070859849452973, "num_tokens": 153861994.0, "step": 14830 }, { "entropy": 0.6757087051868439, "epoch": 0.11872, "grad_norm": 2.039438247680664, "learning_rate": 4.408203281312525e-05, "loss": 0.6686, "mean_token_accuracy": 0.794170743227005, "num_tokens": 154007661.0, "step": 14840 }, { "entropy": 0.6318200469017029, "epoch": 0.1188, "grad_norm": 4.7805705070495605, "learning_rate": 4.4078031212484996e-05, "loss": 0.6519, "mean_token_accuracy": 0.8214571237564087, "num_tokens": 154051409.0, "step": 14850 }, { "entropy": 0.692693966627121, "epoch": 0.11888, "grad_norm": 1.8561394214630127, "learning_rate": 4.407402961184474e-05, "loss": 0.6884, "mean_token_accuracy": 0.7853917002677917, "num_tokens": 154214894.0, "step": 14860 }, { "entropy": 0.7273220658302307, "epoch": 0.11896, "grad_norm": 4.4402666091918945, "learning_rate": 4.4070028011204484e-05, "loss": 0.7246, "mean_token_accuracy": 0.7937520861625671, "num_tokens": 154301565.0, "step": 14870 }, { "entropy": 0.7083954155445099, "epoch": 0.11904, "grad_norm": 1.877868413925171, "learning_rate": 4.406602641056423e-05, "loss": 0.7011, "mean_token_accuracy": 0.7954654514789581, "num_tokens": 154396935.0, "step": 14880 }, { "entropy": 0.656999796628952, "epoch": 0.11912, "grad_norm": 2.3503804206848145, "learning_rate": 4.406202480992397e-05, "loss": 0.6532, "mean_token_accuracy": 0.7969020664691925, "num_tokens": 154540545.0, "step": 14890 }, { "entropy": 0.7083822548389435, "epoch": 0.1192, "grad_norm": 4.495091438293457, "learning_rate": 4.4058023209283715e-05, "loss": 0.696, "mean_token_accuracy": 0.811091935634613, "num_tokens": 154579643.0, "step": 14900 }, { "entropy": 0.69821697473526, "epoch": 0.11928, "grad_norm": 1.6057088375091553, "learning_rate": 4.405402160864346e-05, "loss": 0.6989, "mean_token_accuracy": 0.7802515923976898, "num_tokens": 154743483.0, "step": 14910 }, { "entropy": 0.6672575861215592, "epoch": 0.11936, "grad_norm": 3.062239646911621, "learning_rate": 4.40500200080032e-05, "loss": 0.6533, "mean_token_accuracy": 0.8093208074569702, "num_tokens": 154829982.0, "step": 14920 }, { "entropy": 0.6996736288070678, "epoch": 0.11944, "grad_norm": 1.7245436906814575, "learning_rate": 4.4046018407362946e-05, "loss": 0.7205, "mean_token_accuracy": 0.7933784544467926, "num_tokens": 154923586.0, "step": 14930 }, { "entropy": 0.7369634836912156, "epoch": 0.11952, "grad_norm": 2.7572021484375, "learning_rate": 4.404201680672269e-05, "loss": 0.7278, "mean_token_accuracy": 0.7810986876487732, "num_tokens": 155069270.0, "step": 14940 }, { "entropy": 0.6766269445419312, "epoch": 0.1196, "grad_norm": 5.386664390563965, "learning_rate": 4.4038015206082434e-05, "loss": 0.6702, "mean_token_accuracy": 0.8197696030139923, "num_tokens": 155117411.0, "step": 14950 }, { "entropy": 0.6780711352825165, "epoch": 0.11968, "grad_norm": 1.9526798725128174, "learning_rate": 4.4034013605442184e-05, "loss": 0.6772, "mean_token_accuracy": 0.7876343488693237, "num_tokens": 155281251.0, "step": 14960 }, { "entropy": 0.6682603120803833, "epoch": 0.11976, "grad_norm": 3.1559388637542725, "learning_rate": 4.403001200480192e-05, "loss": 0.6644, "mean_token_accuracy": 0.8070485055446625, "num_tokens": 155370633.0, "step": 14970 }, { "entropy": 0.7076525092124939, "epoch": 0.11984, "grad_norm": 1.5215338468551636, "learning_rate": 4.4026010404161665e-05, "loss": 0.7221, "mean_token_accuracy": 0.7948191463947296, "num_tokens": 155466363.0, "step": 14980 }, { "entropy": 0.6949032336473465, "epoch": 0.11992, "grad_norm": 2.5140626430511475, "learning_rate": 4.402200880352141e-05, "loss": 0.6871, "mean_token_accuracy": 0.7954028129577637, "num_tokens": 155584535.0, "step": 14990 }, { "entropy": 0.7404985547065734, "epoch": 0.12, "grad_norm": 5.4600911140441895, "learning_rate": 4.401800720288116e-05, "loss": 0.7166, "mean_token_accuracy": 0.8073473334312439, "num_tokens": 155616499.0, "step": 15000 }, { "entropy": 0.6773278057575226, "epoch": 0.12008, "grad_norm": 2.0293400287628174, "learning_rate": 4.4014005602240896e-05, "loss": 0.679, "mean_token_accuracy": 0.7908801257610321, "num_tokens": 155779041.0, "step": 15010 }, { "entropy": 0.7087663531303405, "epoch": 0.12016, "grad_norm": 4.210256099700928, "learning_rate": 4.401000400160064e-05, "loss": 0.703, "mean_token_accuracy": 0.7994710326194763, "num_tokens": 155858968.0, "step": 15020 }, { "entropy": 0.6796430230140686, "epoch": 0.12024, "grad_norm": 2.555490255355835, "learning_rate": 4.400600240096039e-05, "loss": 0.688, "mean_token_accuracy": 0.8035194575786591, "num_tokens": 155952337.0, "step": 15030 }, { "entropy": 0.7666067540645599, "epoch": 0.12032, "grad_norm": 2.7230277061462402, "learning_rate": 4.4002000800320134e-05, "loss": 0.7569, "mean_token_accuracy": 0.775892972946167, "num_tokens": 156097037.0, "step": 15040 }, { "entropy": 0.651194167137146, "epoch": 0.1204, "grad_norm": 4.7484331130981445, "learning_rate": 4.399799919967987e-05, "loss": 0.6458, "mean_token_accuracy": 0.8209010124206543, "num_tokens": 156137073.0, "step": 15050 }, { "entropy": 0.6342995762825012, "epoch": 0.12048, "grad_norm": 1.6891725063323975, "learning_rate": 4.3993997599039614e-05, "loss": 0.6341, "mean_token_accuracy": 0.7983817815780639, "num_tokens": 156300913.0, "step": 15060 }, { "entropy": 0.7321436107158661, "epoch": 0.12056, "grad_norm": 3.1038992404937744, "learning_rate": 4.3989995998399365e-05, "loss": 0.7288, "mean_token_accuracy": 0.7969163835048676, "num_tokens": 156375916.0, "step": 15070 }, { "entropy": 0.7151026129722595, "epoch": 0.12064, "grad_norm": 2.115730047225952, "learning_rate": 4.398599439775911e-05, "loss": 0.7314, "mean_token_accuracy": 0.7946302711963653, "num_tokens": 156467670.0, "step": 15080 }, { "entropy": 0.8127421200275421, "epoch": 0.12072, "grad_norm": 2.5874955654144287, "learning_rate": 4.3981992797118846e-05, "loss": 0.8026, "mean_token_accuracy": 0.7651125729084015, "num_tokens": 156607571.0, "step": 15090 }, { "entropy": 0.6452181279659271, "epoch": 0.1208, "grad_norm": 5.120957851409912, "learning_rate": 4.3977991196478596e-05, "loss": 0.6286, "mean_token_accuracy": 0.8254880249500275, "num_tokens": 156648871.0, "step": 15100 }, { "entropy": 0.7108005940914154, "epoch": 0.12088, "grad_norm": 1.851671576499939, "learning_rate": 4.397398959583834e-05, "loss": 0.7205, "mean_token_accuracy": 0.7740290701389313, "num_tokens": 156812711.0, "step": 15110 }, { "entropy": 0.669784951210022, "epoch": 0.12096, "grad_norm": 4.633795738220215, "learning_rate": 4.3969987995198083e-05, "loss": 0.6624, "mean_token_accuracy": 0.8101229190826416, "num_tokens": 156887409.0, "step": 15120 }, { "entropy": 0.6713050723075866, "epoch": 0.12104, "grad_norm": 1.5717787742614746, "learning_rate": 4.396598639455782e-05, "loss": 0.6701, "mean_token_accuracy": 0.8072080552577973, "num_tokens": 156978774.0, "step": 15130 }, { "entropy": 0.695129805803299, "epoch": 0.12112, "grad_norm": 3.2607874870300293, "learning_rate": 4.396198479391757e-05, "loss": 0.7064, "mean_token_accuracy": 0.7874440014362335, "num_tokens": 157114968.0, "step": 15140 }, { "entropy": 0.6466750860214233, "epoch": 0.1212, "grad_norm": 4.587413787841797, "learning_rate": 4.3957983193277315e-05, "loss": 0.628, "mean_token_accuracy": 0.8264996469020843, "num_tokens": 157156676.0, "step": 15150 }, { "entropy": 0.6799979388713837, "epoch": 0.12128, "grad_norm": 2.215203285217285, "learning_rate": 4.395398159263706e-05, "loss": 0.6808, "mean_token_accuracy": 0.7857107996940613, "num_tokens": 157320516.0, "step": 15160 }, { "entropy": 0.7890536248683929, "epoch": 0.12136, "grad_norm": 3.3017454147338867, "learning_rate": 4.3949979991996795e-05, "loss": 0.788, "mean_token_accuracy": 0.7771360039710998, "num_tokens": 157406845.0, "step": 15170 }, { "entropy": 0.6857055246829986, "epoch": 0.12144, "grad_norm": 2.0413990020751953, "learning_rate": 4.3945978391356546e-05, "loss": 0.6869, "mean_token_accuracy": 0.8015466928482056, "num_tokens": 157499965.0, "step": 15180 }, { "entropy": 0.7272392690181733, "epoch": 0.12152, "grad_norm": 3.0517635345458984, "learning_rate": 4.394197679071629e-05, "loss": 0.7345, "mean_token_accuracy": 0.7815478682518006, "num_tokens": 157635194.0, "step": 15190 }, { "entropy": 0.7019682109355927, "epoch": 0.1216, "grad_norm": 4.708246231079102, "learning_rate": 4.393797519007603e-05, "loss": 0.6871, "mean_token_accuracy": 0.8147933840751648, "num_tokens": 157673124.0, "step": 15200 }, { "entropy": 0.7061561048030853, "epoch": 0.12168, "grad_norm": 2.42496395111084, "learning_rate": 4.393397358943578e-05, "loss": 0.712, "mean_token_accuracy": 0.7773510038852691, "num_tokens": 157836964.0, "step": 15210 }, { "entropy": 0.7354114532470704, "epoch": 0.12176, "grad_norm": 3.588256359100342, "learning_rate": 4.392997198879552e-05, "loss": 0.729, "mean_token_accuracy": 0.791748297214508, "num_tokens": 157921946.0, "step": 15220 }, { "entropy": 0.7350066602230072, "epoch": 0.12184, "grad_norm": 2.6807754039764404, "learning_rate": 4.3925970388155264e-05, "loss": 0.7394, "mean_token_accuracy": 0.7886216342449188, "num_tokens": 158015705.0, "step": 15230 }, { "entropy": 0.7233297169208527, "epoch": 0.12192, "grad_norm": 2.757396697998047, "learning_rate": 4.392196878751501e-05, "loss": 0.711, "mean_token_accuracy": 0.7850882053375244, "num_tokens": 158147719.0, "step": 15240 }, { "entropy": 0.6856438279151916, "epoch": 0.122, "grad_norm": 4.48367166519165, "learning_rate": 4.391796718687475e-05, "loss": 0.6871, "mean_token_accuracy": 0.8154216349124909, "num_tokens": 158184112.0, "step": 15250 }, { "entropy": 0.6859181165695191, "epoch": 0.12208, "grad_norm": 1.6909116506576538, "learning_rate": 4.3913965586234495e-05, "loss": 0.6813, "mean_token_accuracy": 0.7859245300292969, "num_tokens": 158347952.0, "step": 15260 }, { "entropy": 0.6360074490308761, "epoch": 0.12216, "grad_norm": 4.303045272827148, "learning_rate": 4.390996398559424e-05, "loss": 0.6415, "mean_token_accuracy": 0.811501556634903, "num_tokens": 158436335.0, "step": 15270 }, { "entropy": 0.6517377376556397, "epoch": 0.12224, "grad_norm": 1.516862154006958, "learning_rate": 4.390596238495398e-05, "loss": 0.6517, "mean_token_accuracy": 0.809248173236847, "num_tokens": 158530179.0, "step": 15280 }, { "entropy": 0.7311360597610473, "epoch": 0.12232, "grad_norm": 2.7930681705474854, "learning_rate": 4.390196078431373e-05, "loss": 0.7221, "mean_token_accuracy": 0.788478422164917, "num_tokens": 158655257.0, "step": 15290 }, { "entropy": 0.6714531809091568, "epoch": 0.1224, "grad_norm": 5.1848955154418945, "learning_rate": 4.389795918367347e-05, "loss": 0.667, "mean_token_accuracy": 0.8192797601222992, "num_tokens": 158690137.0, "step": 15300 }, { "entropy": 0.6782655477523803, "epoch": 0.12248, "grad_norm": 2.057469367980957, "learning_rate": 4.3893957583033214e-05, "loss": 0.6823, "mean_token_accuracy": 0.7864913582801819, "num_tokens": 158853885.0, "step": 15310 }, { "entropy": 0.7066805422306061, "epoch": 0.12256, "grad_norm": 3.6638569831848145, "learning_rate": 4.388995598239296e-05, "loss": 0.7053, "mean_token_accuracy": 0.8009966075420379, "num_tokens": 158935157.0, "step": 15320 }, { "entropy": 0.7210963428020477, "epoch": 0.12264, "grad_norm": 1.789664626121521, "learning_rate": 4.38859543817527e-05, "loss": 0.709, "mean_token_accuracy": 0.7992099821567535, "num_tokens": 159027901.0, "step": 15330 }, { "entropy": 0.6559651851654053, "epoch": 0.12272, "grad_norm": 2.704598903656006, "learning_rate": 4.3881952781112445e-05, "loss": 0.6554, "mean_token_accuracy": 0.798371434211731, "num_tokens": 159165490.0, "step": 15340 }, { "entropy": 0.7576385796070099, "epoch": 0.1228, "grad_norm": 4.697935581207275, "learning_rate": 4.3877951180472196e-05, "loss": 0.757, "mean_token_accuracy": 0.8032507479190827, "num_tokens": 159203417.0, "step": 15350 }, { "entropy": 0.6862546741962433, "epoch": 0.12288, "grad_norm": 1.7918341159820557, "learning_rate": 4.387394957983193e-05, "loss": 0.6893, "mean_token_accuracy": 0.7893393754959106, "num_tokens": 159365909.0, "step": 15360 }, { "entropy": 0.6886273682117462, "epoch": 0.12296, "grad_norm": 3.547689914703369, "learning_rate": 4.3869947979191676e-05, "loss": 0.6748, "mean_token_accuracy": 0.802646940946579, "num_tokens": 159440816.0, "step": 15370 }, { "entropy": 0.7272379219532012, "epoch": 0.12304, "grad_norm": 2.2453696727752686, "learning_rate": 4.386594637855142e-05, "loss": 0.7278, "mean_token_accuracy": 0.7945460736751556, "num_tokens": 159533321.0, "step": 15380 }, { "entropy": 0.6727491974830627, "epoch": 0.12312, "grad_norm": 2.0410640239715576, "learning_rate": 4.386194477791117e-05, "loss": 0.6739, "mean_token_accuracy": 0.7932830393314362, "num_tokens": 159671042.0, "step": 15390 }, { "entropy": 0.7726581871509552, "epoch": 0.1232, "grad_norm": 6.120673656463623, "learning_rate": 4.385794317727091e-05, "loss": 0.7694, "mean_token_accuracy": 0.8010774314403534, "num_tokens": 159710185.0, "step": 15400 }, { "entropy": 0.623664528131485, "epoch": 0.12328, "grad_norm": 1.5445175170898438, "learning_rate": 4.385394157663065e-05, "loss": 0.6231, "mean_token_accuracy": 0.801139098405838, "num_tokens": 159873706.0, "step": 15410 }, { "entropy": 0.6997706770896912, "epoch": 0.12336, "grad_norm": 2.988966941833496, "learning_rate": 4.38499399759904e-05, "loss": 0.6942, "mean_token_accuracy": 0.8037103831768035, "num_tokens": 159949755.0, "step": 15420 }, { "entropy": 0.7208093047142029, "epoch": 0.12344, "grad_norm": 2.2745816707611084, "learning_rate": 4.3845938375350145e-05, "loss": 0.7363, "mean_token_accuracy": 0.7914779126644135, "num_tokens": 160041624.0, "step": 15430 }, { "entropy": 0.6711148023605347, "epoch": 0.12352, "grad_norm": 2.6471664905548096, "learning_rate": 4.384193677470988e-05, "loss": 0.6554, "mean_token_accuracy": 0.7969404697418213, "num_tokens": 160173579.0, "step": 15440 }, { "entropy": 0.685212242603302, "epoch": 0.1236, "grad_norm": 4.858941078186035, "learning_rate": 4.3837935174069626e-05, "loss": 0.6883, "mean_token_accuracy": 0.8162374675273896, "num_tokens": 160212614.0, "step": 15450 }, { "entropy": 0.6454020321369172, "epoch": 0.12368, "grad_norm": 1.9226670265197754, "learning_rate": 4.3833933573429377e-05, "loss": 0.6496, "mean_token_accuracy": 0.7970105648040772, "num_tokens": 160376004.0, "step": 15460 }, { "entropy": 0.6770611494779587, "epoch": 0.12376, "grad_norm": 4.032089710235596, "learning_rate": 4.382993197278912e-05, "loss": 0.6658, "mean_token_accuracy": 0.8056073188781738, "num_tokens": 160451778.0, "step": 15470 }, { "entropy": 0.735640013217926, "epoch": 0.12384, "grad_norm": 1.6304174661636353, "learning_rate": 4.382593037214886e-05, "loss": 0.7399, "mean_token_accuracy": 0.7907910466194152, "num_tokens": 160545612.0, "step": 15480 }, { "entropy": 0.7207074642181397, "epoch": 0.12392, "grad_norm": 1.7761390209197998, "learning_rate": 4.382192877150861e-05, "loss": 0.7211, "mean_token_accuracy": 0.7827859699726105, "num_tokens": 160680183.0, "step": 15490 }, { "entropy": 0.7193065881729126, "epoch": 0.124, "grad_norm": 6.777990818023682, "learning_rate": 4.381792717086835e-05, "loss": 0.6951, "mean_token_accuracy": 0.8115652203559875, "num_tokens": 160718296.0, "step": 15500 }, { "entropy": 0.6611809670925141, "epoch": 0.12408, "grad_norm": 1.6990197896957397, "learning_rate": 4.3813925570228095e-05, "loss": 0.661, "mean_token_accuracy": 0.7937879979610443, "num_tokens": 160880226.0, "step": 15510 }, { "entropy": 0.6268722474575043, "epoch": 0.12416, "grad_norm": 3.9021010398864746, "learning_rate": 4.380992396958783e-05, "loss": 0.6321, "mean_token_accuracy": 0.8175712943077087, "num_tokens": 160950319.0, "step": 15520 }, { "entropy": 0.7537937939167023, "epoch": 0.12424, "grad_norm": 1.3841667175292969, "learning_rate": 4.380592236894758e-05, "loss": 0.7554, "mean_token_accuracy": 0.789286857843399, "num_tokens": 161043054.0, "step": 15530 }, { "entropy": 0.6670387506484985, "epoch": 0.12432, "grad_norm": 2.7597899436950684, "learning_rate": 4.3801920768307326e-05, "loss": 0.6624, "mean_token_accuracy": 0.7938944578170777, "num_tokens": 161188916.0, "step": 15540 }, { "entropy": 0.7144376456737518, "epoch": 0.1244, "grad_norm": 5.623250961303711, "learning_rate": 4.379791916766707e-05, "loss": 0.7173, "mean_token_accuracy": 0.8077467799186706, "num_tokens": 161228488.0, "step": 15550 }, { "entropy": 0.6630600750446319, "epoch": 0.12448, "grad_norm": 1.7950737476348877, "learning_rate": 4.3793917567026814e-05, "loss": 0.6621, "mean_token_accuracy": 0.7943004190921783, "num_tokens": 161392168.0, "step": 15560 }, { "entropy": 0.7233677715063095, "epoch": 0.12456, "grad_norm": 3.1638295650482178, "learning_rate": 4.378991596638656e-05, "loss": 0.7178, "mean_token_accuracy": 0.7978087246418, "num_tokens": 161472879.0, "step": 15570 }, { "entropy": 0.6948908984661102, "epoch": 0.12464, "grad_norm": 1.5089439153671265, "learning_rate": 4.37859143657463e-05, "loss": 0.6937, "mean_token_accuracy": 0.8013047635555267, "num_tokens": 161567288.0, "step": 15580 }, { "entropy": 0.7040693998336792, "epoch": 0.12472, "grad_norm": 3.3037102222442627, "learning_rate": 4.3781912765106045e-05, "loss": 0.6998, "mean_token_accuracy": 0.7851200044155121, "num_tokens": 161707671.0, "step": 15590 }, { "entropy": 0.6811096459627152, "epoch": 0.1248, "grad_norm": 4.843608379364014, "learning_rate": 4.377791116446579e-05, "loss": 0.6822, "mean_token_accuracy": 0.8174721777439118, "num_tokens": 161745398.0, "step": 15600 }, { "entropy": 0.6608133971691131, "epoch": 0.12488, "grad_norm": 1.545520305633545, "learning_rate": 4.377390956382553e-05, "loss": 0.6555, "mean_token_accuracy": 0.7924096286296844, "num_tokens": 161909238.0, "step": 15610 }, { "entropy": 0.6877905488014221, "epoch": 0.12496, "grad_norm": 3.9575247764587402, "learning_rate": 4.3769907963185276e-05, "loss": 0.6855, "mean_token_accuracy": 0.8069523513317108, "num_tokens": 161991323.0, "step": 15620 }, { "entropy": 0.7317264378070831, "epoch": 0.12504, "grad_norm": 2.025707960128784, "learning_rate": 4.376590636254502e-05, "loss": 0.737, "mean_token_accuracy": 0.7912195026874542, "num_tokens": 162083893.0, "step": 15630 }, { "entropy": 0.6811104834079742, "epoch": 0.12512, "grad_norm": 1.833824872970581, "learning_rate": 4.376190476190476e-05, "loss": 0.6742, "mean_token_accuracy": 0.7881046175956726, "num_tokens": 162234701.0, "step": 15640 }, { "entropy": 0.6556142687797546, "epoch": 0.1252, "grad_norm": 6.695104122161865, "learning_rate": 4.375790316126451e-05, "loss": 0.664, "mean_token_accuracy": 0.8173115193843842, "num_tokens": 162278102.0, "step": 15650 }, { "entropy": 0.6980163931846619, "epoch": 0.12528, "grad_norm": 1.6440143585205078, "learning_rate": 4.375390156062425e-05, "loss": 0.7017, "mean_token_accuracy": 0.7825035393238068, "num_tokens": 162441844.0, "step": 15660 }, { "entropy": 0.6601454317569733, "epoch": 0.12536, "grad_norm": 2.9454996585845947, "learning_rate": 4.3749899959983995e-05, "loss": 0.6506, "mean_token_accuracy": 0.8073014438152313, "num_tokens": 162532631.0, "step": 15670 }, { "entropy": 0.6559243321418762, "epoch": 0.12544, "grad_norm": 1.584830641746521, "learning_rate": 4.374589835934374e-05, "loss": 0.6629, "mean_token_accuracy": 0.80797558426857, "num_tokens": 162628764.0, "step": 15680 }, { "entropy": 0.714905297756195, "epoch": 0.12552, "grad_norm": 2.526925563812256, "learning_rate": 4.374189675870348e-05, "loss": 0.7038, "mean_token_accuracy": 0.7912886679172516, "num_tokens": 162755225.0, "step": 15690 }, { "entropy": 0.7092864990234375, "epoch": 0.1256, "grad_norm": 4.535562992095947, "learning_rate": 4.373789515806323e-05, "loss": 0.7079, "mean_token_accuracy": 0.8137303948402405, "num_tokens": 162790363.0, "step": 15700 }, { "entropy": 0.6450768828392028, "epoch": 0.12568, "grad_norm": 1.679663062095642, "learning_rate": 4.373389355742297e-05, "loss": 0.6436, "mean_token_accuracy": 0.7920065879821777, "num_tokens": 162954203.0, "step": 15710 }, { "entropy": 0.6686464250087738, "epoch": 0.12576, "grad_norm": 2.701185464859009, "learning_rate": 4.372989195678271e-05, "loss": 0.6654, "mean_token_accuracy": 0.8072770774364472, "num_tokens": 163050294.0, "step": 15720 }, { "entropy": 0.6946273446083069, "epoch": 0.12584, "grad_norm": 1.6429927349090576, "learning_rate": 4.372589035614246e-05, "loss": 0.6926, "mean_token_accuracy": 0.8022482514381408, "num_tokens": 163145588.0, "step": 15730 }, { "entropy": 0.6864681541919708, "epoch": 0.12592, "grad_norm": 2.2199792861938477, "learning_rate": 4.372188875550221e-05, "loss": 0.6748, "mean_token_accuracy": 0.7908857882022857, "num_tokens": 163287347.0, "step": 15740 }, { "entropy": 0.6746859312057495, "epoch": 0.126, "grad_norm": 4.605494022369385, "learning_rate": 4.3717887154861944e-05, "loss": 0.671, "mean_token_accuracy": 0.8169751048088074, "num_tokens": 163327698.0, "step": 15750 }, { "entropy": 0.6706317842006684, "epoch": 0.12608, "grad_norm": 1.4730818271636963, "learning_rate": 4.371388555422169e-05, "loss": 0.6726, "mean_token_accuracy": 0.7885038912296295, "num_tokens": 163490988.0, "step": 15760 }, { "entropy": 0.8133762657642365, "epoch": 0.12616, "grad_norm": 4.987318992614746, "learning_rate": 4.370988395358143e-05, "loss": 0.8101, "mean_token_accuracy": 0.7733116805553436, "num_tokens": 163571728.0, "step": 15770 }, { "entropy": 0.7028828144073487, "epoch": 0.12624, "grad_norm": 2.348848342895508, "learning_rate": 4.370588235294118e-05, "loss": 0.7161, "mean_token_accuracy": 0.7948881149291992, "num_tokens": 163663987.0, "step": 15780 }, { "entropy": 0.6703800797462464, "epoch": 0.12632, "grad_norm": 3.3402326107025146, "learning_rate": 4.370188075230092e-05, "loss": 0.6571, "mean_token_accuracy": 0.7983280837535858, "num_tokens": 163798583.0, "step": 15790 }, { "entropy": 0.7422051370143891, "epoch": 0.1264, "grad_norm": 4.862460136413574, "learning_rate": 4.369787915166066e-05, "loss": 0.7359, "mean_token_accuracy": 0.8041999459266662, "num_tokens": 163835643.0, "step": 15800 }, { "entropy": 0.682075023651123, "epoch": 0.12648, "grad_norm": 2.6125144958496094, "learning_rate": 4.369387755102041e-05, "loss": 0.6856, "mean_token_accuracy": 0.781808751821518, "num_tokens": 163999483.0, "step": 15810 }, { "entropy": 0.6916822671890259, "epoch": 0.12656, "grad_norm": 3.209850549697876, "learning_rate": 4.368987595038016e-05, "loss": 0.6769, "mean_token_accuracy": 0.8044514179229736, "num_tokens": 164089478.0, "step": 15820 }, { "entropy": 0.7552872061729431, "epoch": 0.12664, "grad_norm": 1.698830008506775, "learning_rate": 4.3685874349739894e-05, "loss": 0.7658, "mean_token_accuracy": 0.7853499948978424, "num_tokens": 164184695.0, "step": 15830 }, { "entropy": 0.7367381513118744, "epoch": 0.12672, "grad_norm": 2.378694534301758, "learning_rate": 4.368187274909964e-05, "loss": 0.7375, "mean_token_accuracy": 0.7777523100376129, "num_tokens": 164326161.0, "step": 15840 }, { "entropy": 0.7369570910930634, "epoch": 0.1268, "grad_norm": 6.638513565063477, "learning_rate": 4.367787114845939e-05, "loss": 0.7194, "mean_token_accuracy": 0.8064021527767181, "num_tokens": 164370121.0, "step": 15850 }, { "entropy": 0.6768566727638244, "epoch": 0.12688, "grad_norm": 1.6052701473236084, "learning_rate": 4.367386954781913e-05, "loss": 0.6816, "mean_token_accuracy": 0.7902102649211884, "num_tokens": 164533553.0, "step": 15860 }, { "entropy": 0.6995029091835022, "epoch": 0.12696, "grad_norm": 3.5813522338867188, "learning_rate": 4.366986794717887e-05, "loss": 0.6855, "mean_token_accuracy": 0.8032395660877227, "num_tokens": 164600747.0, "step": 15870 }, { "entropy": 0.7163476765155792, "epoch": 0.12704, "grad_norm": 1.989370584487915, "learning_rate": 4.366586634653862e-05, "loss": 0.747, "mean_token_accuracy": 0.7960987389087677, "num_tokens": 164692384.0, "step": 15880 }, { "entropy": 0.7279501080513, "epoch": 0.12712, "grad_norm": 3.33444881439209, "learning_rate": 4.366186474589836e-05, "loss": 0.7092, "mean_token_accuracy": 0.7859350383281708, "num_tokens": 164809712.0, "step": 15890 }, { "entropy": 0.6191178351640702, "epoch": 0.1272, "grad_norm": 4.7624030113220215, "learning_rate": 4.365786314525811e-05, "loss": 0.6124, "mean_token_accuracy": 0.8282551050186158, "num_tokens": 164846568.0, "step": 15900 }, { "entropy": 0.6835008084774017, "epoch": 0.12728, "grad_norm": 1.3449963331222534, "learning_rate": 4.3653861544617844e-05, "loss": 0.6856, "mean_token_accuracy": 0.7822300970554352, "num_tokens": 165010408.0, "step": 15910 }, { "entropy": 0.7703873574733734, "epoch": 0.12736, "grad_norm": 3.4700911045074463, "learning_rate": 4.3649859943977594e-05, "loss": 0.7589, "mean_token_accuracy": 0.7815527737140655, "num_tokens": 165100725.0, "step": 15920 }, { "entropy": 0.716724294424057, "epoch": 0.12744, "grad_norm": 1.956160068511963, "learning_rate": 4.364585834333734e-05, "loss": 0.7309, "mean_token_accuracy": 0.793292784690857, "num_tokens": 165193815.0, "step": 15930 }, { "entropy": 0.6930659472942352, "epoch": 0.12752, "grad_norm": 2.4504711627960205, "learning_rate": 4.364185674269708e-05, "loss": 0.6842, "mean_token_accuracy": 0.792094212770462, "num_tokens": 165320784.0, "step": 15940 }, { "entropy": 0.7165239512920379, "epoch": 0.1276, "grad_norm": 5.581003189086914, "learning_rate": 4.3637855142056825e-05, "loss": 0.7045, "mean_token_accuracy": 0.8165974617004395, "num_tokens": 165355681.0, "step": 15950 }, { "entropy": 0.6606840193271637, "epoch": 0.12768, "grad_norm": 1.8469287157058716, "learning_rate": 4.363385354141657e-05, "loss": 0.6665, "mean_token_accuracy": 0.7903578400611877, "num_tokens": 165519521.0, "step": 15960 }, { "entropy": 0.7674797415733338, "epoch": 0.12776, "grad_norm": 5.062646389007568, "learning_rate": 4.362985194077631e-05, "loss": 0.7672, "mean_token_accuracy": 0.781598699092865, "num_tokens": 165612729.0, "step": 15970 }, { "entropy": 0.6745055913925171, "epoch": 0.12784, "grad_norm": 2.158216714859009, "learning_rate": 4.3625850340136056e-05, "loss": 0.6631, "mean_token_accuracy": 0.8053557455539704, "num_tokens": 165706568.0, "step": 15980 }, { "entropy": 0.6889470994472504, "epoch": 0.12792, "grad_norm": 3.425575017929077, "learning_rate": 4.36218487394958e-05, "loss": 0.6906, "mean_token_accuracy": 0.7870356440544128, "num_tokens": 165852817.0, "step": 15990 }, { "entropy": 0.6943177163600922, "epoch": 0.128, "grad_norm": 4.054020881652832, "learning_rate": 4.3617847138855544e-05, "loss": 0.7092, "mean_token_accuracy": 0.8075175404548645, "num_tokens": 165891019.0, "step": 16000 }, { "entropy": 0.6663215190172196, "epoch": 0.12808, "grad_norm": 2.330023765563965, "learning_rate": 4.361384553821529e-05, "loss": 0.6587, "mean_token_accuracy": 0.79099902510643, "num_tokens": 166054859.0, "step": 16010 }, { "entropy": 0.7479134500026703, "epoch": 0.12816, "grad_norm": 3.2896735668182373, "learning_rate": 4.360984393757503e-05, "loss": 0.7417, "mean_token_accuracy": 0.7862857937812805, "num_tokens": 166154003.0, "step": 16020 }, { "entropy": 0.7639139354228973, "epoch": 0.12824, "grad_norm": 1.5650080442428589, "learning_rate": 4.3605842336934775e-05, "loss": 0.7485, "mean_token_accuracy": 0.789051216840744, "num_tokens": 166248538.0, "step": 16030 }, { "entropy": 0.6647215247154236, "epoch": 0.12832, "grad_norm": 3.8794097900390625, "learning_rate": 4.360184073629452e-05, "loss": 0.6688, "mean_token_accuracy": 0.7933046996593476, "num_tokens": 166381181.0, "step": 16040 }, { "entropy": 0.7715520083904266, "epoch": 0.1284, "grad_norm": 6.5184855461120605, "learning_rate": 4.359783913565426e-05, "loss": 0.7568, "mean_token_accuracy": 0.8044127941131591, "num_tokens": 166414045.0, "step": 16050 }, { "entropy": 0.7576490700244903, "epoch": 0.12848, "grad_norm": 2.8008475303649902, "learning_rate": 4.3593837535014006e-05, "loss": 0.7547, "mean_token_accuracy": 0.7721492052078247, "num_tokens": 166576827.0, "step": 16060 }, { "entropy": 0.7016297936439514, "epoch": 0.12856, "grad_norm": 4.142853260040283, "learning_rate": 4.358983593437375e-05, "loss": 0.7105, "mean_token_accuracy": 0.7998201131820679, "num_tokens": 166649453.0, "step": 16070 }, { "entropy": 0.7406574189662933, "epoch": 0.12864, "grad_norm": 1.661831021308899, "learning_rate": 4.3585834333733494e-05, "loss": 0.7395, "mean_token_accuracy": 0.7895227432250976, "num_tokens": 166742873.0, "step": 16080 }, { "entropy": 0.7056031584739685, "epoch": 0.12872, "grad_norm": 3.2520298957824707, "learning_rate": 4.3581832733093244e-05, "loss": 0.6951, "mean_token_accuracy": 0.7866340637207031, "num_tokens": 166888810.0, "step": 16090 }, { "entropy": 0.6535335004329681, "epoch": 0.1288, "grad_norm": 5.182579040527344, "learning_rate": 4.357783113245298e-05, "loss": 0.6398, "mean_token_accuracy": 0.8245901107788086, "num_tokens": 166931487.0, "step": 16100 }, { "entropy": 0.7000283598899841, "epoch": 0.12888, "grad_norm": 2.042590618133545, "learning_rate": 4.3573829531812725e-05, "loss": 0.7122, "mean_token_accuracy": 0.7819603681564331, "num_tokens": 167094099.0, "step": 16110 }, { "entropy": 0.6963389933109283, "epoch": 0.12896, "grad_norm": 3.3366141319274902, "learning_rate": 4.356982793117247e-05, "loss": 0.6854, "mean_token_accuracy": 0.8059188485145569, "num_tokens": 167174211.0, "step": 16120 }, { "entropy": 0.6480215787887573, "epoch": 0.12904, "grad_norm": 1.8868143558502197, "learning_rate": 4.356582633053222e-05, "loss": 0.6487, "mean_token_accuracy": 0.811056125164032, "num_tokens": 167266319.0, "step": 16130 }, { "entropy": 0.7027740001678466, "epoch": 0.12912, "grad_norm": 2.5563766956329346, "learning_rate": 4.3561824729891956e-05, "loss": 0.6974, "mean_token_accuracy": 0.7887148499488831, "num_tokens": 167412903.0, "step": 16140 }, { "entropy": 0.6338808357715606, "epoch": 0.1292, "grad_norm": 5.13062858581543, "learning_rate": 4.35578231292517e-05, "loss": 0.6297, "mean_token_accuracy": 0.8278517365455628, "num_tokens": 167451828.0, "step": 16150 }, { "entropy": 0.6956696271896362, "epoch": 0.12928, "grad_norm": 2.1179494857788086, "learning_rate": 4.355382152861145e-05, "loss": 0.7009, "mean_token_accuracy": 0.7835979521274566, "num_tokens": 167615668.0, "step": 16160 }, { "entropy": 0.6049758553504944, "epoch": 0.12936, "grad_norm": 3.37272310256958, "learning_rate": 4.3549819927971194e-05, "loss": 0.5889, "mean_token_accuracy": 0.8282467365264893, "num_tokens": 167703252.0, "step": 16170 }, { "entropy": 0.7240542829036712, "epoch": 0.12944, "grad_norm": 1.8099011182785034, "learning_rate": 4.354581832733093e-05, "loss": 0.7395, "mean_token_accuracy": 0.7860257565975189, "num_tokens": 167795782.0, "step": 16180 }, { "entropy": 0.7239032030105591, "epoch": 0.12952, "grad_norm": 2.57125186920166, "learning_rate": 4.3541816726690674e-05, "loss": 0.7161, "mean_token_accuracy": 0.7831739962100983, "num_tokens": 167933609.0, "step": 16190 }, { "entropy": 0.6911595642566681, "epoch": 0.1296, "grad_norm": 5.273447513580322, "learning_rate": 4.3537815126050425e-05, "loss": 0.6923, "mean_token_accuracy": 0.8094763994216919, "num_tokens": 167970726.0, "step": 16200 }, { "entropy": 0.6794971466064453, "epoch": 0.12968, "grad_norm": 1.618104338645935, "learning_rate": 4.353381352541017e-05, "loss": 0.6836, "mean_token_accuracy": 0.7846632599830627, "num_tokens": 168134181.0, "step": 16210 }, { "entropy": 0.8579905152320861, "epoch": 0.12976, "grad_norm": 3.201550006866455, "learning_rate": 4.3529811924769906e-05, "loss": 0.8413, "mean_token_accuracy": 0.7651346683502197, "num_tokens": 168217091.0, "step": 16220 }, { "entropy": 0.7103419184684754, "epoch": 0.12984, "grad_norm": 2.0296213626861572, "learning_rate": 4.3525810324129656e-05, "loss": 0.7121, "mean_token_accuracy": 0.797471433877945, "num_tokens": 168309831.0, "step": 16230 }, { "entropy": 0.7067770391702652, "epoch": 0.12992, "grad_norm": 3.404764175415039, "learning_rate": 4.35218087234894e-05, "loss": 0.7049, "mean_token_accuracy": 0.7865528404712677, "num_tokens": 168446547.0, "step": 16240 }, { "entropy": 0.6742773085832596, "epoch": 0.13, "grad_norm": 4.493910789489746, "learning_rate": 4.3517807122849143e-05, "loss": 0.6746, "mean_token_accuracy": 0.8179567337036133, "num_tokens": 168487146.0, "step": 16250 }, { "entropy": 0.6540602922439576, "epoch": 0.13008, "grad_norm": 2.288848876953125, "learning_rate": 4.351380552220888e-05, "loss": 0.6499, "mean_token_accuracy": 0.7944910943508148, "num_tokens": 168650625.0, "step": 16260 }, { "entropy": 0.71592777967453, "epoch": 0.13016, "grad_norm": 5.098723411560059, "learning_rate": 4.350980392156863e-05, "loss": 0.7148, "mean_token_accuracy": 0.7952527463436126, "num_tokens": 168733693.0, "step": 16270 }, { "entropy": 0.7570150434970856, "epoch": 0.13024, "grad_norm": 1.7267060279846191, "learning_rate": 4.3505802320928375e-05, "loss": 0.7493, "mean_token_accuracy": 0.7885780274868012, "num_tokens": 168826775.0, "step": 16280 }, { "entropy": 0.7331264078617096, "epoch": 0.13032, "grad_norm": 2.8760221004486084, "learning_rate": 4.350180072028812e-05, "loss": 0.7199, "mean_token_accuracy": 0.7840746998786926, "num_tokens": 168957271.0, "step": 16290 }, { "entropy": 0.6777650117874146, "epoch": 0.1304, "grad_norm": 5.402580738067627, "learning_rate": 4.3497799119647855e-05, "loss": 0.6866, "mean_token_accuracy": 0.81805300116539, "num_tokens": 168988881.0, "step": 16300 }, { "entropy": 0.6727192312479019, "epoch": 0.13048, "grad_norm": 1.6977676153182983, "learning_rate": 4.3493797519007606e-05, "loss": 0.6769, "mean_token_accuracy": 0.7872191071510315, "num_tokens": 169152721.0, "step": 16310 }, { "entropy": 0.6799052000045777, "epoch": 0.13056, "grad_norm": 3.444612503051758, "learning_rate": 4.348979591836735e-05, "loss": 0.677, "mean_token_accuracy": 0.8025331914424896, "num_tokens": 169245638.0, "step": 16320 }, { "entropy": 0.680837082862854, "epoch": 0.13064, "grad_norm": 2.244570732116699, "learning_rate": 4.348579431772709e-05, "loss": 0.6827, "mean_token_accuracy": 0.8035716712474823, "num_tokens": 169339435.0, "step": 16330 }, { "entropy": 0.706517469882965, "epoch": 0.13072, "grad_norm": 2.314181327819824, "learning_rate": 4.348179271708684e-05, "loss": 0.695, "mean_token_accuracy": 0.7918171465396882, "num_tokens": 169464108.0, "step": 16340 }, { "entropy": 0.6495481193065643, "epoch": 0.1308, "grad_norm": 5.848228931427002, "learning_rate": 4.347779111644658e-05, "loss": 0.6605, "mean_token_accuracy": 0.8240030407905579, "num_tokens": 169499285.0, "step": 16350 }, { "entropy": 0.6568699240684509, "epoch": 0.13088, "grad_norm": 2.2614903450012207, "learning_rate": 4.3473789515806324e-05, "loss": 0.6563, "mean_token_accuracy": 0.7914264857769012, "num_tokens": 169663125.0, "step": 16360 }, { "entropy": 0.7671568512916564, "epoch": 0.13096, "grad_norm": 2.960545778274536, "learning_rate": 4.346978791516607e-05, "loss": 0.7717, "mean_token_accuracy": 0.7894834041595459, "num_tokens": 169746732.0, "step": 16370 }, { "entropy": 0.688265860080719, "epoch": 0.13104, "grad_norm": 2.515186071395874, "learning_rate": 4.346578631452581e-05, "loss": 0.686, "mean_token_accuracy": 0.8000817835330963, "num_tokens": 169840844.0, "step": 16380 }, { "entropy": 0.7111025512218475, "epoch": 0.13112, "grad_norm": 2.2621004581451416, "learning_rate": 4.3461784713885555e-05, "loss": 0.7054, "mean_token_accuracy": 0.7891093015670776, "num_tokens": 169978208.0, "step": 16390 }, { "entropy": 0.6314911723136902, "epoch": 0.1312, "grad_norm": 5.634739398956299, "learning_rate": 4.34577831132453e-05, "loss": 0.6375, "mean_token_accuracy": 0.8238798439502716, "num_tokens": 170016868.0, "step": 16400 }, { "entropy": 0.6518435001373291, "epoch": 0.13128, "grad_norm": 1.6774705648422241, "learning_rate": 4.345378151260505e-05, "loss": 0.6586, "mean_token_accuracy": 0.790486079454422, "num_tokens": 170180708.0, "step": 16410 }, { "entropy": 0.6724698841571808, "epoch": 0.13136, "grad_norm": 2.996283531188965, "learning_rate": 4.3449779911964787e-05, "loss": 0.6644, "mean_token_accuracy": 0.803514689207077, "num_tokens": 170276461.0, "step": 16420 }, { "entropy": 0.7299825429916382, "epoch": 0.13144, "grad_norm": 1.8909659385681152, "learning_rate": 4.344577831132453e-05, "loss": 0.7512, "mean_token_accuracy": 0.7887938022613525, "num_tokens": 170371338.0, "step": 16430 }, { "entropy": 0.7373366713523865, "epoch": 0.13152, "grad_norm": 2.316483736038208, "learning_rate": 4.3441776710684274e-05, "loss": 0.7247, "mean_token_accuracy": 0.7795604646205903, "num_tokens": 170511391.0, "step": 16440 }, { "entropy": 0.6785898983478547, "epoch": 0.1316, "grad_norm": 4.719942569732666, "learning_rate": 4.3437775110044024e-05, "loss": 0.6574, "mean_token_accuracy": 0.8192229032516479, "num_tokens": 170547906.0, "step": 16450 }, { "entropy": 0.6872938811779022, "epoch": 0.13168, "grad_norm": 2.326714277267456, "learning_rate": 4.343377350940376e-05, "loss": 0.6951, "mean_token_accuracy": 0.7860409080982208, "num_tokens": 170710964.0, "step": 16460 }, { "entropy": 0.7050812035799027, "epoch": 0.13176, "grad_norm": 3.280189037322998, "learning_rate": 4.3429771908763505e-05, "loss": 0.7054, "mean_token_accuracy": 0.8026165425777435, "num_tokens": 170786632.0, "step": 16470 }, { "entropy": 0.7794006049633027, "epoch": 0.13184, "grad_norm": 1.887311339378357, "learning_rate": 4.3425770308123256e-05, "loss": 0.7726, "mean_token_accuracy": 0.7809825122356415, "num_tokens": 170880007.0, "step": 16480 }, { "entropy": 0.7639862358570099, "epoch": 0.13192, "grad_norm": 3.1984429359436035, "learning_rate": 4.3421768707483e-05, "loss": 0.7557, "mean_token_accuracy": 0.7806077063083648, "num_tokens": 171007734.0, "step": 16490 }, { "entropy": 0.6875601977109909, "epoch": 0.132, "grad_norm": 6.7270708084106445, "learning_rate": 4.3417767106842736e-05, "loss": 0.6719, "mean_token_accuracy": 0.8212611198425293, "num_tokens": 171041705.0, "step": 16500 }, { "entropy": 0.7098792731761933, "epoch": 0.13208, "grad_norm": 2.1295011043548584, "learning_rate": 4.341376550620248e-05, "loss": 0.7089, "mean_token_accuracy": 0.7841975629329682, "num_tokens": 171203659.0, "step": 16510 }, { "entropy": 0.6725456774234772, "epoch": 0.13216, "grad_norm": 4.8061370849609375, "learning_rate": 4.340976390556223e-05, "loss": 0.6731, "mean_token_accuracy": 0.8059028804302215, "num_tokens": 171269977.0, "step": 16520 }, { "entropy": 0.7384838163852692, "epoch": 0.13224, "grad_norm": 1.9778046607971191, "learning_rate": 4.3405762304921974e-05, "loss": 0.7284, "mean_token_accuracy": 0.7904045522212982, "num_tokens": 171361729.0, "step": 16530 }, { "entropy": 0.6673194587230682, "epoch": 0.13232, "grad_norm": 2.5105690956115723, "learning_rate": 4.340176070428171e-05, "loss": 0.6595, "mean_token_accuracy": 0.8008254051208497, "num_tokens": 171490523.0, "step": 16540 }, { "entropy": 0.7616602063179017, "epoch": 0.1324, "grad_norm": 4.713784694671631, "learning_rate": 4.339775910364146e-05, "loss": 0.7745, "mean_token_accuracy": 0.8017818033695221, "num_tokens": 171523698.0, "step": 16550 }, { "entropy": 0.7178698360919953, "epoch": 0.13248, "grad_norm": 2.559288501739502, "learning_rate": 4.3393757503001205e-05, "loss": 0.7224, "mean_token_accuracy": 0.778724980354309, "num_tokens": 171687538.0, "step": 16560 }, { "entropy": 0.7698008239269256, "epoch": 0.13256, "grad_norm": 3.806236982345581, "learning_rate": 4.338975590236095e-05, "loss": 0.7597, "mean_token_accuracy": 0.7853487849235534, "num_tokens": 171776337.0, "step": 16570 }, { "entropy": 0.6621124804019928, "epoch": 0.13264, "grad_norm": 2.4208905696868896, "learning_rate": 4.3385754301720686e-05, "loss": 0.6546, "mean_token_accuracy": 0.8104467034339905, "num_tokens": 171868798.0, "step": 16580 }, { "entropy": 0.6527595698833466, "epoch": 0.13272, "grad_norm": 2.9481377601623535, "learning_rate": 4.3381752701080436e-05, "loss": 0.6526, "mean_token_accuracy": 0.7967582285404206, "num_tokens": 172011165.0, "step": 16590 }, { "entropy": 0.7113822519779205, "epoch": 0.1328, "grad_norm": 6.336725234985352, "learning_rate": 4.337775110044018e-05, "loss": 0.7251, "mean_token_accuracy": 0.811243611574173, "num_tokens": 172053019.0, "step": 16600 }, { "entropy": 0.6511559963226319, "epoch": 0.13288, "grad_norm": 1.6538914442062378, "learning_rate": 4.3373749499799924e-05, "loss": 0.6402, "mean_token_accuracy": 0.8013861835002899, "num_tokens": 172216859.0, "step": 16610 }, { "entropy": 0.6840826511383057, "epoch": 0.13296, "grad_norm": 3.5037431716918945, "learning_rate": 4.336974789915967e-05, "loss": 0.6761, "mean_token_accuracy": 0.7981440246105194, "num_tokens": 172308198.0, "step": 16620 }, { "entropy": 0.6668298363685607, "epoch": 0.13304, "grad_norm": 1.8397554159164429, "learning_rate": 4.336574629851941e-05, "loss": 0.6767, "mean_token_accuracy": 0.8042551100254058, "num_tokens": 172402335.0, "step": 16630 }, { "entropy": 0.6650022804737091, "epoch": 0.13312, "grad_norm": 3.5123283863067627, "learning_rate": 4.3361744697879155e-05, "loss": 0.6537, "mean_token_accuracy": 0.7987867176532746, "num_tokens": 172544780.0, "step": 16640 }, { "entropy": 0.6955614566802979, "epoch": 0.1332, "grad_norm": 4.6150102615356445, "learning_rate": 4.33577430972389e-05, "loss": 0.7036, "mean_token_accuracy": 0.8109398365020752, "num_tokens": 172583115.0, "step": 16650 }, { "entropy": 0.7058170914649964, "epoch": 0.13328, "grad_norm": 2.272512197494507, "learning_rate": 4.335374149659864e-05, "loss": 0.7091, "mean_token_accuracy": 0.7823400139808655, "num_tokens": 172746955.0, "step": 16660 }, { "entropy": 0.737828654050827, "epoch": 0.13336, "grad_norm": 3.8877971172332764, "learning_rate": 4.3349739895958386e-05, "loss": 0.7224, "mean_token_accuracy": 0.7944940030574799, "num_tokens": 172832986.0, "step": 16670 }, { "entropy": 0.7156781852245331, "epoch": 0.13344, "grad_norm": 3.011486768722534, "learning_rate": 4.334573829531813e-05, "loss": 0.7377, "mean_token_accuracy": 0.791131991147995, "num_tokens": 172927124.0, "step": 16680 }, { "entropy": 0.7514347493648529, "epoch": 0.13352, "grad_norm": 2.51627516746521, "learning_rate": 4.3341736694677874e-05, "loss": 0.7419, "mean_token_accuracy": 0.7782246887683868, "num_tokens": 173061140.0, "step": 16690 }, { "entropy": 0.7669072449207306, "epoch": 0.1336, "grad_norm": 4.247099876403809, "learning_rate": 4.333773509403762e-05, "loss": 0.7658, "mean_token_accuracy": 0.7920004785060882, "num_tokens": 173098896.0, "step": 16700 }, { "entropy": 0.624168461561203, "epoch": 0.13368, "grad_norm": 2.1068949699401855, "learning_rate": 4.333373349339736e-05, "loss": 0.6251, "mean_token_accuracy": 0.8017334938049316, "num_tokens": 173262624.0, "step": 16710 }, { "entropy": 0.6799476563930511, "epoch": 0.13376, "grad_norm": 3.299687147140503, "learning_rate": 4.3329731892757105e-05, "loss": 0.6709, "mean_token_accuracy": 0.805624270439148, "num_tokens": 173345309.0, "step": 16720 }, { "entropy": 0.6820218503475189, "epoch": 0.13384, "grad_norm": 2.931389808654785, "learning_rate": 4.332573029211685e-05, "loss": 0.6807, "mean_token_accuracy": 0.8006594240665436, "num_tokens": 173441110.0, "step": 16730 }, { "entropy": 0.7366493880748749, "epoch": 0.13392, "grad_norm": 2.5877912044525146, "learning_rate": 4.332172869147659e-05, "loss": 0.7372, "mean_token_accuracy": 0.784689599275589, "num_tokens": 173562679.0, "step": 16740 }, { "entropy": 0.7299908548593521, "epoch": 0.134, "grad_norm": 4.43053674697876, "learning_rate": 4.3317727090836336e-05, "loss": 0.7114, "mean_token_accuracy": 0.8129949271678925, "num_tokens": 173595123.0, "step": 16750 }, { "entropy": 0.6752152442932129, "epoch": 0.13408, "grad_norm": 2.1820247173309326, "learning_rate": 4.3313725490196086e-05, "loss": 0.6786, "mean_token_accuracy": 0.7900708496570588, "num_tokens": 173758958.0, "step": 16760 }, { "entropy": 0.707959309220314, "epoch": 0.13416, "grad_norm": 3.86738920211792, "learning_rate": 4.330972388955582e-05, "loss": 0.6958, "mean_token_accuracy": 0.8016533076763153, "num_tokens": 173847210.0, "step": 16770 }, { "entropy": 0.7317847192287446, "epoch": 0.13424, "grad_norm": 1.8228553533554077, "learning_rate": 4.330572228891557e-05, "loss": 0.723, "mean_token_accuracy": 0.7949450135231018, "num_tokens": 173941451.0, "step": 16780 }, { "entropy": 0.7014293968677521, "epoch": 0.13432, "grad_norm": 2.541348934173584, "learning_rate": 4.330172068827531e-05, "loss": 0.7164, "mean_token_accuracy": 0.7811103284358978, "num_tokens": 174077073.0, "step": 16790 }, { "entropy": 0.6865160912275314, "epoch": 0.1344, "grad_norm": 6.226760387420654, "learning_rate": 4.329771908763506e-05, "loss": 0.6758, "mean_token_accuracy": 0.8184459567070007, "num_tokens": 174112172.0, "step": 16800 }, { "entropy": 0.6609237313270568, "epoch": 0.13448, "grad_norm": 1.4146173000335693, "learning_rate": 4.32937174869948e-05, "loss": 0.6644, "mean_token_accuracy": 0.7885564267635345, "num_tokens": 174276012.0, "step": 16810 }, { "entropy": 0.6818812102079391, "epoch": 0.13456, "grad_norm": 3.2994112968444824, "learning_rate": 4.328971588635454e-05, "loss": 0.6699, "mean_token_accuracy": 0.8084677219390869, "num_tokens": 174363596.0, "step": 16820 }, { "entropy": 0.6624255239963531, "epoch": 0.13464, "grad_norm": 1.6655735969543457, "learning_rate": 4.328571428571429e-05, "loss": 0.6667, "mean_token_accuracy": 0.8087518692016602, "num_tokens": 174457189.0, "step": 16830 }, { "entropy": 0.7452205777168274, "epoch": 0.13472, "grad_norm": 2.4845998287200928, "learning_rate": 4.3281712685074036e-05, "loss": 0.7387, "mean_token_accuracy": 0.7799679040908813, "num_tokens": 174597911.0, "step": 16840 }, { "entropy": 0.747558730840683, "epoch": 0.1348, "grad_norm": 4.70809268951416, "learning_rate": 4.327771108443377e-05, "loss": 0.7537, "mean_token_accuracy": 0.798326563835144, "num_tokens": 174637878.0, "step": 16850 }, { "entropy": 0.733792120218277, "epoch": 0.13488, "grad_norm": 1.3932157754898071, "learning_rate": 4.327370948379352e-05, "loss": 0.7316, "mean_token_accuracy": 0.7764106035232544, "num_tokens": 174801718.0, "step": 16860 }, { "entropy": 0.6914155900478363, "epoch": 0.13496, "grad_norm": 3.4075746536254883, "learning_rate": 4.326970788315327e-05, "loss": 0.6823, "mean_token_accuracy": 0.8043722212314606, "num_tokens": 174884397.0, "step": 16870 }, { "entropy": 0.722374963760376, "epoch": 0.13504, "grad_norm": 1.6623389720916748, "learning_rate": 4.326570628251301e-05, "loss": 0.7104, "mean_token_accuracy": 0.7955937385559082, "num_tokens": 174978105.0, "step": 16880 }, { "entropy": 0.7162272512912751, "epoch": 0.13512, "grad_norm": 3.1674535274505615, "learning_rate": 4.326170468187275e-05, "loss": 0.7208, "mean_token_accuracy": 0.783701604604721, "num_tokens": 175119535.0, "step": 16890 }, { "entropy": 0.6323157846927643, "epoch": 0.1352, "grad_norm": 5.003846645355225, "learning_rate": 4.325770308123249e-05, "loss": 0.6128, "mean_token_accuracy": 0.8316928446292877, "num_tokens": 175156924.0, "step": 16900 }, { "entropy": 0.6805218756198883, "epoch": 0.13528, "grad_norm": 2.8337390422821045, "learning_rate": 4.325370148059224e-05, "loss": 0.6862, "mean_token_accuracy": 0.7842208266258239, "num_tokens": 175320764.0, "step": 16910 }, { "entropy": 0.6861242443323136, "epoch": 0.13536, "grad_norm": 3.3056206703186035, "learning_rate": 4.3249699879951986e-05, "loss": 0.6826, "mean_token_accuracy": 0.8013746082782746, "num_tokens": 175414959.0, "step": 16920 }, { "entropy": 0.6746872007846832, "epoch": 0.13544, "grad_norm": 2.6022422313690186, "learning_rate": 4.324569827931172e-05, "loss": 0.672, "mean_token_accuracy": 0.8079774618148804, "num_tokens": 175509861.0, "step": 16930 }, { "entropy": 0.743207859992981, "epoch": 0.13552, "grad_norm": 3.300058126449585, "learning_rate": 4.324169667867147e-05, "loss": 0.7396, "mean_token_accuracy": 0.7797751128673553, "num_tokens": 175649661.0, "step": 16940 }, { "entropy": 0.5803765475749969, "epoch": 0.1356, "grad_norm": 5.40296745300293, "learning_rate": 4.323769507803122e-05, "loss": 0.5748, "mean_token_accuracy": 0.8418874382972718, "num_tokens": 175691250.0, "step": 16950 }, { "entropy": 0.644460654258728, "epoch": 0.13568, "grad_norm": 1.5405014753341675, "learning_rate": 4.323369347739096e-05, "loss": 0.6416, "mean_token_accuracy": 0.796171224117279, "num_tokens": 175855090.0, "step": 16960 }, { "entropy": 0.745524337887764, "epoch": 0.13576, "grad_norm": 3.18379807472229, "learning_rate": 4.32296918767507e-05, "loss": 0.7387, "mean_token_accuracy": 0.7958160459995269, "num_tokens": 175945201.0, "step": 16970 }, { "entropy": 0.6866646885871888, "epoch": 0.13584, "grad_norm": 1.7367054224014282, "learning_rate": 4.322569027611045e-05, "loss": 0.6861, "mean_token_accuracy": 0.7992458462715148, "num_tokens": 176039639.0, "step": 16980 }, { "entropy": 0.7766000032424927, "epoch": 0.13592, "grad_norm": 3.456190824508667, "learning_rate": 4.322168867547019e-05, "loss": 0.7747, "mean_token_accuracy": 0.7747850060462952, "num_tokens": 176177188.0, "step": 16990 }, { "entropy": 0.6896273881196976, "epoch": 0.136, "grad_norm": 5.106326103210449, "learning_rate": 4.3217687074829936e-05, "loss": 0.6869, "mean_token_accuracy": 0.8164631366729737, "num_tokens": 176216418.0, "step": 17000 }, { "entropy": 0.71977299451828, "epoch": 0.13608, "grad_norm": 1.664036750793457, "learning_rate": 4.321368547418968e-05, "loss": 0.7246, "mean_token_accuracy": 0.7802581489086151, "num_tokens": 176377617.0, "step": 17010 }, { "entropy": 0.7536481618881226, "epoch": 0.13616, "grad_norm": 4.576295852661133, "learning_rate": 4.320968387354942e-05, "loss": 0.7354, "mean_token_accuracy": 0.7943808555603027, "num_tokens": 176444900.0, "step": 17020 }, { "entropy": 0.6921536147594451, "epoch": 0.13624, "grad_norm": 1.7269543409347534, "learning_rate": 4.320568227290917e-05, "loss": 0.6928, "mean_token_accuracy": 0.8012397289276123, "num_tokens": 176537504.0, "step": 17030 }, { "entropy": 0.7175227880477906, "epoch": 0.13632, "grad_norm": 2.6459126472473145, "learning_rate": 4.320168067226891e-05, "loss": 0.7115, "mean_token_accuracy": 0.7882783532142639, "num_tokens": 176672271.0, "step": 17040 }, { "entropy": 0.7313525021076203, "epoch": 0.1364, "grad_norm": 4.112053394317627, "learning_rate": 4.3197679071628654e-05, "loss": 0.7246, "mean_token_accuracy": 0.8117791712284088, "num_tokens": 176707080.0, "step": 17050 }, { "entropy": 0.7004928708076477, "epoch": 0.13648, "grad_norm": 1.7965370416641235, "learning_rate": 4.31936774709884e-05, "loss": 0.7091, "mean_token_accuracy": 0.7798424541950226, "num_tokens": 176870920.0, "step": 17060 }, { "entropy": 0.6928335309028626, "epoch": 0.13656, "grad_norm": 4.661550045013428, "learning_rate": 4.318967587034814e-05, "loss": 0.6909, "mean_token_accuracy": 0.8016665935516357, "num_tokens": 176954217.0, "step": 17070 }, { "entropy": 0.7592776417732239, "epoch": 0.13664, "grad_norm": 1.5642154216766357, "learning_rate": 4.3185674269707885e-05, "loss": 0.7382, "mean_token_accuracy": 0.7889100849628449, "num_tokens": 177049803.0, "step": 17080 }, { "entropy": 0.719226884841919, "epoch": 0.13672, "grad_norm": 3.129890203475952, "learning_rate": 4.318167266906763e-05, "loss": 0.732, "mean_token_accuracy": 0.7779904007911682, "num_tokens": 177187263.0, "step": 17090 }, { "entropy": 0.6696033418178559, "epoch": 0.1368, "grad_norm": 4.72597599029541, "learning_rate": 4.317767106842737e-05, "loss": 0.6639, "mean_token_accuracy": 0.821194076538086, "num_tokens": 177226949.0, "step": 17100 }, { "entropy": 0.696859461069107, "epoch": 0.13688, "grad_norm": 2.9793100357055664, "learning_rate": 4.3173669467787116e-05, "loss": 0.6977, "mean_token_accuracy": 0.7804470062255859, "num_tokens": 177390789.0, "step": 17110 }, { "entropy": 0.6873875856399536, "epoch": 0.13696, "grad_norm": 3.351240873336792, "learning_rate": 4.316966786714686e-05, "loss": 0.6858, "mean_token_accuracy": 0.8010689198970795, "num_tokens": 177487851.0, "step": 17120 }, { "entropy": 0.6597778856754303, "epoch": 0.13704, "grad_norm": 2.5957653522491455, "learning_rate": 4.3165666266506604e-05, "loss": 0.6388, "mean_token_accuracy": 0.8130904138088226, "num_tokens": 177583199.0, "step": 17130 }, { "entropy": 0.7019754350185394, "epoch": 0.13712, "grad_norm": 2.8899166584014893, "learning_rate": 4.316166466586635e-05, "loss": 0.6992, "mean_token_accuracy": 0.7841838419437408, "num_tokens": 177726370.0, "step": 17140 }, { "entropy": 0.7041898429393768, "epoch": 0.1372, "grad_norm": 5.65083122253418, "learning_rate": 4.31576630652261e-05, "loss": 0.7028, "mean_token_accuracy": 0.810396945476532, "num_tokens": 177764291.0, "step": 17150 }, { "entropy": 0.6800226867198944, "epoch": 0.13728, "grad_norm": 2.0298240184783936, "learning_rate": 4.3153661464585835e-05, "loss": 0.6793, "mean_token_accuracy": 0.7874618232250213, "num_tokens": 177925000.0, "step": 17160 }, { "entropy": 0.6997896432876587, "epoch": 0.13736, "grad_norm": 3.6187257766723633, "learning_rate": 4.314965986394558e-05, "loss": 0.6955, "mean_token_accuracy": 0.8072743356227875, "num_tokens": 177990820.0, "step": 17170 }, { "entropy": 0.7249401926994323, "epoch": 0.13744, "grad_norm": 1.5484740734100342, "learning_rate": 4.314565826330532e-05, "loss": 0.725, "mean_token_accuracy": 0.7894684970378876, "num_tokens": 178082959.0, "step": 17180 }, { "entropy": 0.6717774510383606, "epoch": 0.13752, "grad_norm": 2.150719404220581, "learning_rate": 4.314165666266507e-05, "loss": 0.6607, "mean_token_accuracy": 0.7980327486991883, "num_tokens": 178221477.0, "step": 17190 }, { "entropy": 0.7424629628658295, "epoch": 0.1376, "grad_norm": 4.988081455230713, "learning_rate": 4.313765506202481e-05, "loss": 0.7461, "mean_token_accuracy": 0.799429327249527, "num_tokens": 178259306.0, "step": 17200 }, { "entropy": 0.6638097822666168, "epoch": 0.13768, "grad_norm": 1.6760743856430054, "learning_rate": 4.3133653461384553e-05, "loss": 0.6795, "mean_token_accuracy": 0.7914448082447052, "num_tokens": 178423146.0, "step": 17210 }, { "entropy": 0.6966695934534073, "epoch": 0.13776, "grad_norm": 2.80950665473938, "learning_rate": 4.3129651860744304e-05, "loss": 0.6915, "mean_token_accuracy": 0.8015419363975524, "num_tokens": 178516314.0, "step": 17220 }, { "entropy": 0.7616559505462647, "epoch": 0.13784, "grad_norm": 1.960002064704895, "learning_rate": 4.312565026010405e-05, "loss": 0.7614, "mean_token_accuracy": 0.785990834236145, "num_tokens": 178611851.0, "step": 17230 }, { "entropy": 0.7182759523391724, "epoch": 0.13792, "grad_norm": 2.819300889968872, "learning_rate": 4.3121648659463785e-05, "loss": 0.7199, "mean_token_accuracy": 0.7823790311813354, "num_tokens": 178747521.0, "step": 17240 }, { "entropy": 0.6994527697563171, "epoch": 0.138, "grad_norm": 5.049473285675049, "learning_rate": 4.311764705882353e-05, "loss": 0.6837, "mean_token_accuracy": 0.815009069442749, "num_tokens": 178788508.0, "step": 17250 }, { "entropy": 0.6902012348175048, "epoch": 0.13808, "grad_norm": 1.6236774921417236, "learning_rate": 4.311364545818328e-05, "loss": 0.6916, "mean_token_accuracy": 0.7841353297233582, "num_tokens": 178952348.0, "step": 17260 }, { "entropy": 0.6655533611774445, "epoch": 0.13816, "grad_norm": 3.7427406311035156, "learning_rate": 4.310964385754302e-05, "loss": 0.6736, "mean_token_accuracy": 0.8054049909114838, "num_tokens": 179035171.0, "step": 17270 }, { "entropy": 0.6567423701286316, "epoch": 0.13824, "grad_norm": 2.31432843208313, "learning_rate": 4.310564225690276e-05, "loss": 0.6604, "mean_token_accuracy": 0.8064133524894714, "num_tokens": 179129097.0, "step": 17280 }, { "entropy": 0.7072276949882508, "epoch": 0.13832, "grad_norm": 3.6841001510620117, "learning_rate": 4.310164065626251e-05, "loss": 0.6902, "mean_token_accuracy": 0.7890251576900482, "num_tokens": 179265308.0, "step": 17290 }, { "entropy": 0.6729838430881501, "epoch": 0.1384, "grad_norm": 4.641575813293457, "learning_rate": 4.3097639055622254e-05, "loss": 0.6807, "mean_token_accuracy": 0.8142461836338043, "num_tokens": 179303823.0, "step": 17300 }, { "entropy": 0.6764416098594666, "epoch": 0.13848, "grad_norm": 2.1424636840820312, "learning_rate": 4.3093637454982e-05, "loss": 0.6735, "mean_token_accuracy": 0.7878114342689514, "num_tokens": 179467663.0, "step": 17310 }, { "entropy": 0.6440086901187897, "epoch": 0.13856, "grad_norm": 2.9281816482543945, "learning_rate": 4.3089635854341734e-05, "loss": 0.6406, "mean_token_accuracy": 0.8118469893932343, "num_tokens": 179557299.0, "step": 17320 }, { "entropy": 0.7050687313079834, "epoch": 0.13864, "grad_norm": 2.427354335784912, "learning_rate": 4.3085634253701485e-05, "loss": 0.7069, "mean_token_accuracy": 0.7943887352943421, "num_tokens": 179653480.0, "step": 17330 }, { "entropy": 0.659016290307045, "epoch": 0.13872, "grad_norm": 2.6697025299072266, "learning_rate": 4.308163265306123e-05, "loss": 0.6516, "mean_token_accuracy": 0.8069973886013031, "num_tokens": 179780019.0, "step": 17340 }, { "entropy": 0.6944721519947052, "epoch": 0.1388, "grad_norm": 4.403016567230225, "learning_rate": 4.307763105242097e-05, "loss": 0.6823, "mean_token_accuracy": 0.8149316787719727, "num_tokens": 179814787.0, "step": 17350 }, { "entropy": 0.6601154029369354, "epoch": 0.13888, "grad_norm": 2.5491364002227783, "learning_rate": 4.3073629451780716e-05, "loss": 0.6717, "mean_token_accuracy": 0.7895823180675506, "num_tokens": 179978627.0, "step": 17360 }, { "entropy": 0.7189982235431671, "epoch": 0.13896, "grad_norm": 4.059207916259766, "learning_rate": 4.306962785114046e-05, "loss": 0.7107, "mean_token_accuracy": 0.7958482086658478, "num_tokens": 180074692.0, "step": 17370 }, { "entropy": 0.7242931067943573, "epoch": 0.13904, "grad_norm": 1.9038769006729126, "learning_rate": 4.3065626250500203e-05, "loss": 0.7363, "mean_token_accuracy": 0.7929324686527253, "num_tokens": 180167898.0, "step": 17380 }, { "entropy": 0.7033732891082763, "epoch": 0.13912, "grad_norm": 2.762826681137085, "learning_rate": 4.306162464985995e-05, "loss": 0.6921, "mean_token_accuracy": 0.7872276723384857, "num_tokens": 180302576.0, "step": 17390 }, { "entropy": 0.727443641424179, "epoch": 0.1392, "grad_norm": 7.038637638092041, "learning_rate": 4.305762304921969e-05, "loss": 0.756, "mean_token_accuracy": 0.7980082750320434, "num_tokens": 180339273.0, "step": 17400 }, { "entropy": 0.7008262872695923, "epoch": 0.13928, "grad_norm": 1.8887145519256592, "learning_rate": 4.3053621448579435e-05, "loss": 0.7023, "mean_token_accuracy": 0.7786653757095336, "num_tokens": 180502683.0, "step": 17410 }, { "entropy": 0.6614748775959015, "epoch": 0.13936, "grad_norm": 3.8125245571136475, "learning_rate": 4.304961984793918e-05, "loss": 0.6364, "mean_token_accuracy": 0.8131756603717804, "num_tokens": 180583963.0, "step": 17420 }, { "entropy": 0.7136732339859009, "epoch": 0.13944, "grad_norm": 1.5575672388076782, "learning_rate": 4.304561824729892e-05, "loss": 0.7186, "mean_token_accuracy": 0.7945256710052491, "num_tokens": 180676119.0, "step": 17430 }, { "entropy": 0.6880929291248321, "epoch": 0.13952, "grad_norm": 4.07110595703125, "learning_rate": 4.3041616646658666e-05, "loss": 0.6763, "mean_token_accuracy": 0.7986730337142944, "num_tokens": 180795952.0, "step": 17440 }, { "entropy": 0.7860702514648438, "epoch": 0.1396, "grad_norm": 5.326326847076416, "learning_rate": 4.303761504601841e-05, "loss": 0.7856, "mean_token_accuracy": 0.7978846728801727, "num_tokens": 180828327.0, "step": 17450 }, { "entropy": 0.6424562871456146, "epoch": 0.13968, "grad_norm": 2.4919533729553223, "learning_rate": 4.303361344537815e-05, "loss": 0.6461, "mean_token_accuracy": 0.7965168058872223, "num_tokens": 180991203.0, "step": 17460 }, { "entropy": 0.7094282388687134, "epoch": 0.13976, "grad_norm": 3.253448009490967, "learning_rate": 4.30296118447379e-05, "loss": 0.7023, "mean_token_accuracy": 0.7983050405979156, "num_tokens": 181070012.0, "step": 17470 }, { "entropy": 0.7811993300914765, "epoch": 0.13984, "grad_norm": 2.6303064823150635, "learning_rate": 4.302561024409764e-05, "loss": 0.7817, "mean_token_accuracy": 0.7801544725894928, "num_tokens": 181162536.0, "step": 17480 }, { "entropy": 0.7128535211086273, "epoch": 0.13992, "grad_norm": 3.3005380630493164, "learning_rate": 4.3021608643457384e-05, "loss": 0.7154, "mean_token_accuracy": 0.7810164034366608, "num_tokens": 181307335.0, "step": 17490 }, { "entropy": 0.6632479965686798, "epoch": 0.14, "grad_norm": 5.1730828285217285, "learning_rate": 4.301760704281713e-05, "loss": 0.6522, "mean_token_accuracy": 0.8274513900279998, "num_tokens": 181348135.0, "step": 17500 }, { "entropy": 0.7346741914749145, "epoch": 0.14008, "grad_norm": 2.63568115234375, "learning_rate": 4.301360544217687e-05, "loss": 0.7368, "mean_token_accuracy": 0.778616976737976, "num_tokens": 181511423.0, "step": 17510 }, { "entropy": 0.7177897036075592, "epoch": 0.14016, "grad_norm": 4.317636966705322, "learning_rate": 4.3009603841536615e-05, "loss": 0.71, "mean_token_accuracy": 0.8002582609653472, "num_tokens": 181586062.0, "step": 17520 }, { "entropy": 0.7259449064731598, "epoch": 0.14024, "grad_norm": 1.603270173072815, "learning_rate": 4.300560224089636e-05, "loss": 0.7223, "mean_token_accuracy": 0.7968622088432312, "num_tokens": 181678625.0, "step": 17530 }, { "entropy": 0.7066905677318573, "epoch": 0.14032, "grad_norm": 2.0756726264953613, "learning_rate": 4.300160064025611e-05, "loss": 0.7132, "mean_token_accuracy": 0.7881742238998413, "num_tokens": 181810125.0, "step": 17540 }, { "entropy": 0.7085338413715363, "epoch": 0.1404, "grad_norm": 5.359453201293945, "learning_rate": 4.2997599039615847e-05, "loss": 0.7049, "mean_token_accuracy": 0.8138736128807068, "num_tokens": 181843610.0, "step": 17550 }, { "entropy": 0.6664656460285187, "epoch": 0.14048, "grad_norm": 1.4979742765426636, "learning_rate": 4.299359743897559e-05, "loss": 0.6703, "mean_token_accuracy": 0.7924767971038819, "num_tokens": 182007450.0, "step": 17560 }, { "entropy": 0.713438093662262, "epoch": 0.14056, "grad_norm": 2.9842493534088135, "learning_rate": 4.2989595838335334e-05, "loss": 0.7052, "mean_token_accuracy": 0.7972316741943359, "num_tokens": 182103829.0, "step": 17570 }, { "entropy": 0.7355520546436309, "epoch": 0.14064, "grad_norm": 1.5166593790054321, "learning_rate": 4.2985594237695084e-05, "loss": 0.7479, "mean_token_accuracy": 0.7858101546764373, "num_tokens": 182197166.0, "step": 17580 }, { "entropy": 0.758878618478775, "epoch": 0.14072, "grad_norm": 2.2860708236694336, "learning_rate": 4.298159263705482e-05, "loss": 0.7492, "mean_token_accuracy": 0.7772657334804535, "num_tokens": 182336760.0, "step": 17590 }, { "entropy": 0.6931428790092469, "epoch": 0.1408, "grad_norm": 6.47324275970459, "learning_rate": 4.2977591036414565e-05, "loss": 0.6982, "mean_token_accuracy": 0.8149317860603332, "num_tokens": 182373551.0, "step": 17600 }, { "entropy": 0.659984827041626, "epoch": 0.14088, "grad_norm": 1.943917155265808, "learning_rate": 4.2973589435774316e-05, "loss": 0.6548, "mean_token_accuracy": 0.794560432434082, "num_tokens": 182537059.0, "step": 17610 }, { "entropy": 0.6313802003860474, "epoch": 0.14096, "grad_norm": 3.267662763595581, "learning_rate": 4.296958783513406e-05, "loss": 0.6319, "mean_token_accuracy": 0.8157367169857025, "num_tokens": 182614358.0, "step": 17620 }, { "entropy": 0.7391306161880493, "epoch": 0.14104, "grad_norm": 1.4249701499938965, "learning_rate": 4.2965586234493796e-05, "loss": 0.7477, "mean_token_accuracy": 0.7906284928321838, "num_tokens": 182708839.0, "step": 17630 }, { "entropy": 0.7039712309837342, "epoch": 0.14112, "grad_norm": 3.076892852783203, "learning_rate": 4.296158463385354e-05, "loss": 0.6893, "mean_token_accuracy": 0.790319812297821, "num_tokens": 182849868.0, "step": 17640 }, { "entropy": 0.5901391744613648, "epoch": 0.1412, "grad_norm": 5.371332168579102, "learning_rate": 4.295758303321329e-05, "loss": 0.5907, "mean_token_accuracy": 0.8331241250038147, "num_tokens": 182893772.0, "step": 17650 }, { "entropy": 0.6232833921909332, "epoch": 0.14128, "grad_norm": 2.160724401473999, "learning_rate": 4.2953581432573034e-05, "loss": 0.6286, "mean_token_accuracy": 0.79963920712471, "num_tokens": 183057268.0, "step": 17660 }, { "entropy": 0.7267306327819825, "epoch": 0.14136, "grad_norm": 2.945072650909424, "learning_rate": 4.294957983193277e-05, "loss": 0.7232, "mean_token_accuracy": 0.7957399845123291, "num_tokens": 183135119.0, "step": 17670 }, { "entropy": 0.6956215411424637, "epoch": 0.14144, "grad_norm": 1.7760695219039917, "learning_rate": 4.294557823129252e-05, "loss": 0.6823, "mean_token_accuracy": 0.8045681715011597, "num_tokens": 183230860.0, "step": 17680 }, { "entropy": 0.7428258895874024, "epoch": 0.14152, "grad_norm": 3.4639651775360107, "learning_rate": 4.2941576630652265e-05, "loss": 0.7491, "mean_token_accuracy": 0.7760684490203857, "num_tokens": 183373010.0, "step": 17690 }, { "entropy": 0.7247849345207215, "epoch": 0.1416, "grad_norm": 5.277599811553955, "learning_rate": 4.293757503001201e-05, "loss": 0.7189, "mean_token_accuracy": 0.8035703182220459, "num_tokens": 183416516.0, "step": 17700 }, { "entropy": 0.670319789648056, "epoch": 0.14168, "grad_norm": 3.191375970840454, "learning_rate": 4.2933573429371746e-05, "loss": 0.6701, "mean_token_accuracy": 0.7876343488693237, "num_tokens": 183580356.0, "step": 17710 }, { "entropy": 0.6784152984619141, "epoch": 0.14176, "grad_norm": 3.1157970428466797, "learning_rate": 4.2929571828731496e-05, "loss": 0.6712, "mean_token_accuracy": 0.8059970438480377, "num_tokens": 183671904.0, "step": 17720 }, { "entropy": 0.6695863723754882, "epoch": 0.14184, "grad_norm": 1.9475409984588623, "learning_rate": 4.292557022809124e-05, "loss": 0.676, "mean_token_accuracy": 0.7992732405662537, "num_tokens": 183767504.0, "step": 17730 }, { "entropy": 0.7474957168102264, "epoch": 0.14192, "grad_norm": 2.0891547203063965, "learning_rate": 4.2921568627450984e-05, "loss": 0.7477, "mean_token_accuracy": 0.7753039002418518, "num_tokens": 183915023.0, "step": 17740 }, { "entropy": 0.6764218300580979, "epoch": 0.142, "grad_norm": 4.752055644989014, "learning_rate": 4.291756702681073e-05, "loss": 0.6588, "mean_token_accuracy": 0.8206803679466248, "num_tokens": 183959554.0, "step": 17750 }, { "entropy": 0.713705575466156, "epoch": 0.14208, "grad_norm": 2.338366746902466, "learning_rate": 4.291356542617047e-05, "loss": 0.7202, "mean_token_accuracy": 0.7807314336299896, "num_tokens": 184123341.0, "step": 17760 }, { "entropy": 0.6387258172035217, "epoch": 0.14216, "grad_norm": 3.2214338779449463, "learning_rate": 4.2909563825530215e-05, "loss": 0.6262, "mean_token_accuracy": 0.8149839699268341, "num_tokens": 184203184.0, "step": 17770 }, { "entropy": 0.7314336955547333, "epoch": 0.14224, "grad_norm": 1.60294771194458, "learning_rate": 4.290556222488996e-05, "loss": 0.7458, "mean_token_accuracy": 0.787243378162384, "num_tokens": 184296100.0, "step": 17780 }, { "entropy": 0.7267613768577575, "epoch": 0.14232, "grad_norm": 3.5609188079833984, "learning_rate": 4.29015606242497e-05, "loss": 0.7261, "mean_token_accuracy": 0.782683128118515, "num_tokens": 184436334.0, "step": 17790 }, { "entropy": 0.6911878913640976, "epoch": 0.1424, "grad_norm": 4.508495330810547, "learning_rate": 4.2897559023609446e-05, "loss": 0.6901, "mean_token_accuracy": 0.816694313287735, "num_tokens": 184474612.0, "step": 17800 }, { "entropy": 0.7299909055233001, "epoch": 0.14248, "grad_norm": 1.6339030265808105, "learning_rate": 4.289355742296919e-05, "loss": 0.7324, "mean_token_accuracy": 0.7784196317195893, "num_tokens": 184638452.0, "step": 17810 }, { "entropy": 0.677353122830391, "epoch": 0.14256, "grad_norm": 3.2216131687164307, "learning_rate": 4.2889555822328934e-05, "loss": 0.6707, "mean_token_accuracy": 0.8039319694042206, "num_tokens": 184729321.0, "step": 17820 }, { "entropy": 0.6849507033824921, "epoch": 0.14264, "grad_norm": 1.9618197679519653, "learning_rate": 4.288555422168868e-05, "loss": 0.682, "mean_token_accuracy": 0.801938134431839, "num_tokens": 184823895.0, "step": 17830 }, { "entropy": 0.6950380116701126, "epoch": 0.14272, "grad_norm": 2.7634544372558594, "learning_rate": 4.288155262104842e-05, "loss": 0.6835, "mean_token_accuracy": 0.7937691688537598, "num_tokens": 184954917.0, "step": 17840 }, { "entropy": 0.7399749517440796, "epoch": 0.1428, "grad_norm": 4.700033664703369, "learning_rate": 4.2877551020408165e-05, "loss": 0.7359, "mean_token_accuracy": 0.8022678136825562, "num_tokens": 184995067.0, "step": 17850 }, { "entropy": 0.649728786945343, "epoch": 0.14288, "grad_norm": 1.8652160167694092, "learning_rate": 4.287354941976791e-05, "loss": 0.6535, "mean_token_accuracy": 0.7939728915691375, "num_tokens": 185158907.0, "step": 17860 }, { "entropy": 0.7378979861736298, "epoch": 0.14296, "grad_norm": 3.6058990955352783, "learning_rate": 4.286954781912765e-05, "loss": 0.7401, "mean_token_accuracy": 0.7886099576950073, "num_tokens": 185244763.0, "step": 17870 }, { "entropy": 0.7394471764564514, "epoch": 0.14304, "grad_norm": 2.268789529800415, "learning_rate": 4.2865546218487396e-05, "loss": 0.7316, "mean_token_accuracy": 0.7956243813037872, "num_tokens": 185339135.0, "step": 17880 }, { "entropy": 0.7285175561904907, "epoch": 0.14312, "grad_norm": 3.600632905960083, "learning_rate": 4.2861544617847146e-05, "loss": 0.7309, "mean_token_accuracy": 0.7789521157741547, "num_tokens": 185484017.0, "step": 17890 }, { "entropy": 0.8138603270053864, "epoch": 0.1432, "grad_norm": 5.118010520935059, "learning_rate": 4.285754301720688e-05, "loss": 0.8134, "mean_token_accuracy": 0.7866022169589997, "num_tokens": 185519948.0, "step": 17900 }, { "entropy": 0.7196021318435669, "epoch": 0.14328, "grad_norm": 2.0809333324432373, "learning_rate": 4.285354141656663e-05, "loss": 0.7205, "mean_token_accuracy": 0.777405959367752, "num_tokens": 185683788.0, "step": 17910 }, { "entropy": 0.7603550732135773, "epoch": 0.14336, "grad_norm": 3.6679017543792725, "learning_rate": 4.284953981592637e-05, "loss": 0.7601, "mean_token_accuracy": 0.7871981561183929, "num_tokens": 185771504.0, "step": 17920 }, { "entropy": 0.6981227338314057, "epoch": 0.14344, "grad_norm": 1.7911609411239624, "learning_rate": 4.284553821528612e-05, "loss": 0.7102, "mean_token_accuracy": 0.7979333579540253, "num_tokens": 185865860.0, "step": 17930 }, { "entropy": 0.692850923538208, "epoch": 0.14352, "grad_norm": 2.272995948791504, "learning_rate": 4.284153661464586e-05, "loss": 0.6849, "mean_token_accuracy": 0.790515398979187, "num_tokens": 186007152.0, "step": 17940 }, { "entropy": 0.7398914635181427, "epoch": 0.1436, "grad_norm": 4.5462517738342285, "learning_rate": 4.28375350140056e-05, "loss": 0.7504, "mean_token_accuracy": 0.8017347633838654, "num_tokens": 186046814.0, "step": 17950 }, { "entropy": 0.6555184721946716, "epoch": 0.14368, "grad_norm": 1.95183265209198, "learning_rate": 4.2833533413365346e-05, "loss": 0.6587, "mean_token_accuracy": 0.795371276140213, "num_tokens": 186210654.0, "step": 17960 }, { "entropy": 0.6913642883300781, "epoch": 0.14376, "grad_norm": 3.3755249977111816, "learning_rate": 4.2829531812725096e-05, "loss": 0.6766, "mean_token_accuracy": 0.8016327857971192, "num_tokens": 186300774.0, "step": 17970 }, { "entropy": 0.7149376988410949, "epoch": 0.14384, "grad_norm": 1.96938955783844, "learning_rate": 4.282553021208483e-05, "loss": 0.7272, "mean_token_accuracy": 0.7969446003437042, "num_tokens": 186393601.0, "step": 17980 }, { "entropy": 0.6979980230331421, "epoch": 0.14392, "grad_norm": 1.958343744277954, "learning_rate": 4.282152861144458e-05, "loss": 0.6821, "mean_token_accuracy": 0.7914879083633423, "num_tokens": 186534839.0, "step": 17990 }, { "entropy": 0.6919957995414734, "epoch": 0.144, "grad_norm": 5.367003917694092, "learning_rate": 4.281752701080433e-05, "loss": 0.6989, "mean_token_accuracy": 0.8145815014839173, "num_tokens": 186573635.0, "step": 18000 }, { "entropy": 0.6646011650562287, "epoch": 0.14408, "grad_norm": 1.4692102670669556, "learning_rate": 4.281352541016407e-05, "loss": 0.6618, "mean_token_accuracy": 0.7919034481048584, "num_tokens": 186736974.0, "step": 18010 }, { "entropy": 0.7525195896625518, "epoch": 0.14416, "grad_norm": 2.7450649738311768, "learning_rate": 4.280952380952381e-05, "loss": 0.7499, "mean_token_accuracy": 0.7886128842830658, "num_tokens": 186823264.0, "step": 18020 }, { "entropy": 0.674865996837616, "epoch": 0.14424, "grad_norm": 1.7210381031036377, "learning_rate": 4.280552220888355e-05, "loss": 0.6708, "mean_token_accuracy": 0.8056804001331329, "num_tokens": 186918189.0, "step": 18030 }, { "entropy": 0.6993119418621063, "epoch": 0.14432, "grad_norm": 3.116954803466797, "learning_rate": 4.28015206082433e-05, "loss": 0.6915, "mean_token_accuracy": 0.7871573805809021, "num_tokens": 187064210.0, "step": 18040 }, { "entropy": 0.6765283644199371, "epoch": 0.1444, "grad_norm": 4.494393348693848, "learning_rate": 4.2797519007603046e-05, "loss": 0.6596, "mean_token_accuracy": 0.8141628801822662, "num_tokens": 187108904.0, "step": 18050 }, { "entropy": 0.6434291660785675, "epoch": 0.14448, "grad_norm": 1.3172253370285034, "learning_rate": 4.279351740696278e-05, "loss": 0.6472, "mean_token_accuracy": 0.7929164648056031, "num_tokens": 187272744.0, "step": 18060 }, { "entropy": 0.7225076466798782, "epoch": 0.14456, "grad_norm": 3.2487969398498535, "learning_rate": 4.278951580632253e-05, "loss": 0.7147, "mean_token_accuracy": 0.7961399495601654, "num_tokens": 187358282.0, "step": 18070 }, { "entropy": 0.7479647159576416, "epoch": 0.14464, "grad_norm": 1.8958531618118286, "learning_rate": 4.278551420568228e-05, "loss": 0.7556, "mean_token_accuracy": 0.7860848486423493, "num_tokens": 187453663.0, "step": 18080 }, { "entropy": 0.7236427247524262, "epoch": 0.14472, "grad_norm": 2.367692708969116, "learning_rate": 4.278151260504202e-05, "loss": 0.7195, "mean_token_accuracy": 0.7861464738845825, "num_tokens": 187592584.0, "step": 18090 }, { "entropy": 0.6455498337745667, "epoch": 0.1448, "grad_norm": 5.7664008140563965, "learning_rate": 4.277751100440176e-05, "loss": 0.6331, "mean_token_accuracy": 0.828170657157898, "num_tokens": 187635905.0, "step": 18100 }, { "entropy": 0.6692795634269715, "epoch": 0.14488, "grad_norm": 1.6660337448120117, "learning_rate": 4.277350940376151e-05, "loss": 0.6714, "mean_token_accuracy": 0.7845994174480438, "num_tokens": 187799745.0, "step": 18110 }, { "entropy": 0.686090499162674, "epoch": 0.14496, "grad_norm": 2.794273614883423, "learning_rate": 4.276950780312125e-05, "loss": 0.6793, "mean_token_accuracy": 0.8053637564182281, "num_tokens": 187889057.0, "step": 18120 }, { "entropy": 0.8158165276050567, "epoch": 0.14504, "grad_norm": 1.5879884958267212, "learning_rate": 4.2765506202480995e-05, "loss": 0.8357, "mean_token_accuracy": 0.772424328327179, "num_tokens": 187984999.0, "step": 18130 }, { "entropy": 0.7030171751976013, "epoch": 0.14512, "grad_norm": 2.0355536937713623, "learning_rate": 4.276150460184074e-05, "loss": 0.7015, "mean_token_accuracy": 0.7827440857887268, "num_tokens": 188137105.0, "step": 18140 }, { "entropy": 0.7473817288875579, "epoch": 0.1452, "grad_norm": 4.451993465423584, "learning_rate": 4.275750300120048e-05, "loss": 0.7451, "mean_token_accuracy": 0.800970607995987, "num_tokens": 188184940.0, "step": 18150 }, { "entropy": 0.6455277144908905, "epoch": 0.14528, "grad_norm": 1.4451037645339966, "learning_rate": 4.2753501400560227e-05, "loss": 0.6434, "mean_token_accuracy": 0.7949560344219208, "num_tokens": 188348780.0, "step": 18160 }, { "entropy": 0.6379847198724746, "epoch": 0.14536, "grad_norm": 3.693110466003418, "learning_rate": 4.274949979991997e-05, "loss": 0.6399, "mean_token_accuracy": 0.8138350009918213, "num_tokens": 188433527.0, "step": 18170 }, { "entropy": 0.7638520359992981, "epoch": 0.14544, "grad_norm": 1.975597858428955, "learning_rate": 4.2745498199279714e-05, "loss": 0.7686, "mean_token_accuracy": 0.781257712841034, "num_tokens": 188529001.0, "step": 18180 }, { "entropy": 0.6942445755004882, "epoch": 0.14552, "grad_norm": 2.6074655055999756, "learning_rate": 4.274149659863946e-05, "loss": 0.6813, "mean_token_accuracy": 0.7936256647109985, "num_tokens": 188657642.0, "step": 18190 }, { "entropy": 0.6937342613935471, "epoch": 0.1456, "grad_norm": 4.36482572555542, "learning_rate": 4.27374949979992e-05, "loss": 0.7038, "mean_token_accuracy": 0.8094977378845215, "num_tokens": 188691835.0, "step": 18200 }, { "entropy": 0.7396616816520691, "epoch": 0.14568, "grad_norm": 1.999043583869934, "learning_rate": 4.2733493397358945e-05, "loss": 0.7391, "mean_token_accuracy": 0.7726250410079956, "num_tokens": 188855238.0, "step": 18210 }, { "entropy": 0.653659337759018, "epoch": 0.14576, "grad_norm": 3.183258295059204, "learning_rate": 4.272949179671869e-05, "loss": 0.6423, "mean_token_accuracy": 0.8124013483524323, "num_tokens": 188946083.0, "step": 18220 }, { "entropy": 0.6578094512224197, "epoch": 0.14584, "grad_norm": 1.5603227615356445, "learning_rate": 4.272549019607843e-05, "loss": 0.6681, "mean_token_accuracy": 0.8076582670211792, "num_tokens": 189039893.0, "step": 18230 }, { "entropy": 0.7733281016349792, "epoch": 0.14592, "grad_norm": 2.6879639625549316, "learning_rate": 4.2721488595438176e-05, "loss": 0.7638, "mean_token_accuracy": 0.7766210377216339, "num_tokens": 189170492.0, "step": 18240 }, { "entropy": 0.7496064841747284, "epoch": 0.146, "grad_norm": 5.602749347686768, "learning_rate": 4.271748699479792e-05, "loss": 0.7496, "mean_token_accuracy": 0.8074563264846801, "num_tokens": 189204844.0, "step": 18250 }, { "entropy": 0.7173964381217957, "epoch": 0.14608, "grad_norm": 2.4732484817504883, "learning_rate": 4.2713485394157664e-05, "loss": 0.7193, "mean_token_accuracy": 0.7776706099510193, "num_tokens": 189365834.0, "step": 18260 }, { "entropy": 0.6679727405309677, "epoch": 0.14616, "grad_norm": 3.4548866748809814, "learning_rate": 4.270948379351741e-05, "loss": 0.655, "mean_token_accuracy": 0.8126446306705475, "num_tokens": 189439795.0, "step": 18270 }, { "entropy": 0.6823953211307525, "epoch": 0.14624, "grad_norm": 1.6411288976669312, "learning_rate": 4.270548219287716e-05, "loss": 0.6754, "mean_token_accuracy": 0.804510623216629, "num_tokens": 189532954.0, "step": 18280 }, { "entropy": 0.7070669829845428, "epoch": 0.14632, "grad_norm": 2.162632703781128, "learning_rate": 4.2701480592236895e-05, "loss": 0.7097, "mean_token_accuracy": 0.7807902038097382, "num_tokens": 189686191.0, "step": 18290 }, { "entropy": 0.6889329075813293, "epoch": 0.1464, "grad_norm": 5.163381576538086, "learning_rate": 4.269747899159664e-05, "loss": 0.6813, "mean_token_accuracy": 0.817198920249939, "num_tokens": 189733075.0, "step": 18300 }, { "entropy": 0.6841817498207092, "epoch": 0.14648, "grad_norm": 1.968133807182312, "learning_rate": 4.269347739095638e-05, "loss": 0.6855, "mean_token_accuracy": 0.7830056250095367, "num_tokens": 189896915.0, "step": 18310 }, { "entropy": 0.7886098623275757, "epoch": 0.14656, "grad_norm": 4.9059624671936035, "learning_rate": 4.268947579031613e-05, "loss": 0.7714, "mean_token_accuracy": 0.7880832493305207, "num_tokens": 189979851.0, "step": 18320 }, { "entropy": 0.7536599695682525, "epoch": 0.14664, "grad_norm": 1.9752401113510132, "learning_rate": 4.268547418967587e-05, "loss": 0.7606, "mean_token_accuracy": 0.7860564172267914, "num_tokens": 190072889.0, "step": 18330 }, { "entropy": 0.6941265344619751, "epoch": 0.14672, "grad_norm": 2.4250595569610596, "learning_rate": 4.2681472589035613e-05, "loss": 0.6923, "mean_token_accuracy": 0.7891578614711762, "num_tokens": 190211488.0, "step": 18340 }, { "entropy": 0.6486434280872345, "epoch": 0.1468, "grad_norm": 6.588010311126709, "learning_rate": 4.2677470988395364e-05, "loss": 0.6383, "mean_token_accuracy": 0.8308093905448913, "num_tokens": 190249425.0, "step": 18350 }, { "entropy": 0.6599126338958741, "epoch": 0.14688, "grad_norm": 1.5604344606399536, "learning_rate": 4.267346938775511e-05, "loss": 0.6659, "mean_token_accuracy": 0.7891426503658294, "num_tokens": 190413265.0, "step": 18360 }, { "entropy": 0.6996788501739502, "epoch": 0.14696, "grad_norm": 3.1813206672668457, "learning_rate": 4.2669467787114845e-05, "loss": 0.695, "mean_token_accuracy": 0.7950066924095154, "num_tokens": 190520794.0, "step": 18370 }, { "entropy": 0.7303160548210144, "epoch": 0.14704, "grad_norm": 2.031789779663086, "learning_rate": 4.266546618647459e-05, "loss": 0.7181, "mean_token_accuracy": 0.7967051327228546, "num_tokens": 190618209.0, "step": 18380 }, { "entropy": 0.7421167910099029, "epoch": 0.14712, "grad_norm": 2.6520328521728516, "learning_rate": 4.266146458583434e-05, "loss": 0.7476, "mean_token_accuracy": 0.7703051686286926, "num_tokens": 190767091.0, "step": 18390 }, { "entropy": 0.6801874458789825, "epoch": 0.1472, "grad_norm": 6.082298278808594, "learning_rate": 4.265746298519408e-05, "loss": 0.6721, "mean_token_accuracy": 0.8231822848320007, "num_tokens": 190808680.0, "step": 18400 }, { "entropy": 0.6958725810050964, "epoch": 0.14728, "grad_norm": 2.39281964302063, "learning_rate": 4.265346138455382e-05, "loss": 0.6886, "mean_token_accuracy": 0.7879263579845428, "num_tokens": 190971495.0, "step": 18410 }, { "entropy": 0.5823374271392823, "epoch": 0.14736, "grad_norm": 4.031869888305664, "learning_rate": 4.264945978391357e-05, "loss": 0.5806, "mean_token_accuracy": 0.8282972872257233, "num_tokens": 191046136.0, "step": 18420 }, { "entropy": 0.661176335811615, "epoch": 0.14744, "grad_norm": 1.950272560119629, "learning_rate": 4.2645458183273314e-05, "loss": 0.6633, "mean_token_accuracy": 0.807181441783905, "num_tokens": 191138979.0, "step": 18430 }, { "entropy": 0.6866606652736664, "epoch": 0.14752, "grad_norm": 2.104179859161377, "learning_rate": 4.264145658263306e-05, "loss": 0.6801, "mean_token_accuracy": 0.7905325412750244, "num_tokens": 191279305.0, "step": 18440 }, { "entropy": 0.6701322138309479, "epoch": 0.1476, "grad_norm": 5.212357521057129, "learning_rate": 4.2637454981992794e-05, "loss": 0.6737, "mean_token_accuracy": 0.8171834945678711, "num_tokens": 191314357.0, "step": 18450 }, { "entropy": 0.7371676445007325, "epoch": 0.14768, "grad_norm": 1.7535126209259033, "learning_rate": 4.2633453381352545e-05, "loss": 0.7289, "mean_token_accuracy": 0.7763434410095215, "num_tokens": 191478197.0, "step": 18460 }, { "entropy": 0.7086591780185699, "epoch": 0.14776, "grad_norm": 3.249513626098633, "learning_rate": 4.262945178071229e-05, "loss": 0.7119, "mean_token_accuracy": 0.794546765089035, "num_tokens": 191566099.0, "step": 18470 }, { "entropy": 0.7037501275539398, "epoch": 0.14784, "grad_norm": 1.4489717483520508, "learning_rate": 4.262545018007203e-05, "loss": 0.7051, "mean_token_accuracy": 0.7946808993816376, "num_tokens": 191660177.0, "step": 18480 }, { "entropy": 0.7326873242855072, "epoch": 0.14792, "grad_norm": 3.0485432147979736, "learning_rate": 4.2621448579431776e-05, "loss": 0.7263, "mean_token_accuracy": 0.7795525968074799, "num_tokens": 191800082.0, "step": 18490 }, { "entropy": 0.6792185515165329, "epoch": 0.148, "grad_norm": 5.027985095977783, "learning_rate": 4.261744697879152e-05, "loss": 0.6863, "mean_token_accuracy": 0.8171028554439544, "num_tokens": 191837194.0, "step": 18500 }, { "entropy": 0.657631254196167, "epoch": 0.14808, "grad_norm": 2.3246474266052246, "learning_rate": 4.261344537815126e-05, "loss": 0.6565, "mean_token_accuracy": 0.7930569112300873, "num_tokens": 192001034.0, "step": 18510 }, { "entropy": 0.5926489204168319, "epoch": 0.14816, "grad_norm": 2.837730884552002, "learning_rate": 4.260944377751101e-05, "loss": 0.5752, "mean_token_accuracy": 0.8322206139564514, "num_tokens": 192080015.0, "step": 18520 }, { "entropy": 0.7116489589214325, "epoch": 0.14824, "grad_norm": 2.2960774898529053, "learning_rate": 4.260544217687075e-05, "loss": 0.7257, "mean_token_accuracy": 0.79299556016922, "num_tokens": 192171729.0, "step": 18530 }, { "entropy": 0.7381775319576264, "epoch": 0.14832, "grad_norm": 3.062760829925537, "learning_rate": 4.2601440576230494e-05, "loss": 0.7308, "mean_token_accuracy": 0.7781553030014038, "num_tokens": 192311045.0, "step": 18540 }, { "entropy": 0.6532236367464066, "epoch": 0.1484, "grad_norm": 4.872953414916992, "learning_rate": 4.259743897559024e-05, "loss": 0.6472, "mean_token_accuracy": 0.8242260694503785, "num_tokens": 192348947.0, "step": 18550 }, { "entropy": 0.6528898894786834, "epoch": 0.14848, "grad_norm": 1.5640556812286377, "learning_rate": 4.259343737494998e-05, "loss": 0.657, "mean_token_accuracy": 0.7887396216392517, "num_tokens": 192512787.0, "step": 18560 }, { "entropy": 0.7295106947422028, "epoch": 0.14856, "grad_norm": 3.5993812084198, "learning_rate": 4.2589435774309726e-05, "loss": 0.7213, "mean_token_accuracy": 0.7970360398292542, "num_tokens": 192606502.0, "step": 18570 }, { "entropy": 0.722634208202362, "epoch": 0.14864, "grad_norm": 2.839035987854004, "learning_rate": 4.258543417366947e-05, "loss": 0.7265, "mean_token_accuracy": 0.7931990921497345, "num_tokens": 192699340.0, "step": 18580 }, { "entropy": 0.6721846044063569, "epoch": 0.14872, "grad_norm": 2.395382881164551, "learning_rate": 4.258143257302921e-05, "loss": 0.661, "mean_token_accuracy": 0.7962628304958344, "num_tokens": 192833836.0, "step": 18590 }, { "entropy": 0.7208921670913696, "epoch": 0.1488, "grad_norm": 4.378078937530518, "learning_rate": 4.257743097238896e-05, "loss": 0.7192, "mean_token_accuracy": 0.8107449948787689, "num_tokens": 192871030.0, "step": 18600 }, { "entropy": 0.6742242455482483, "epoch": 0.14888, "grad_norm": 1.9025988578796387, "learning_rate": 4.25734293717487e-05, "loss": 0.6796, "mean_token_accuracy": 0.7915881097316741, "num_tokens": 193031554.0, "step": 18610 }, { "entropy": 0.6438063740730285, "epoch": 0.14896, "grad_norm": 2.7700579166412354, "learning_rate": 4.2569427771108444e-05, "loss": 0.6424, "mean_token_accuracy": 0.8095751345157624, "num_tokens": 193108456.0, "step": 18620 }, { "entropy": 0.7621070921421051, "epoch": 0.14904, "grad_norm": 2.202056884765625, "learning_rate": 4.256542617046819e-05, "loss": 0.7577, "mean_token_accuracy": 0.7914729356765747, "num_tokens": 193201814.0, "step": 18630 }, { "entropy": 0.7375538110733032, "epoch": 0.14912, "grad_norm": 3.1859166622161865, "learning_rate": 4.256142456982793e-05, "loss": 0.7406, "mean_token_accuracy": 0.7809139430522919, "num_tokens": 193343702.0, "step": 18640 }, { "entropy": 0.738964518904686, "epoch": 0.1492, "grad_norm": 5.434534549713135, "learning_rate": 4.2557422969187675e-05, "loss": 0.7283, "mean_token_accuracy": 0.8083572745323181, "num_tokens": 193384419.0, "step": 18650 }, { "entropy": 0.6557590305805207, "epoch": 0.14928, "grad_norm": 2.5137245655059814, "learning_rate": 4.255342136854742e-05, "loss": 0.6539, "mean_token_accuracy": 0.7920631766319275, "num_tokens": 193548184.0, "step": 18660 }, { "entropy": 0.6944803953170776, "epoch": 0.14936, "grad_norm": 3.546778440475464, "learning_rate": 4.254941976790717e-05, "loss": 0.6821, "mean_token_accuracy": 0.805289214849472, "num_tokens": 193629072.0, "step": 18670 }, { "entropy": 0.7223029971122742, "epoch": 0.14944, "grad_norm": 2.968050956726074, "learning_rate": 4.2545418167266906e-05, "loss": 0.7299, "mean_token_accuracy": 0.794168221950531, "num_tokens": 193720990.0, "step": 18680 }, { "entropy": 0.6880286276340485, "epoch": 0.14952, "grad_norm": 2.5110530853271484, "learning_rate": 4.254141656662665e-05, "loss": 0.6783, "mean_token_accuracy": 0.7911867260932922, "num_tokens": 193861524.0, "step": 18690 }, { "entropy": 0.7047180891036987, "epoch": 0.1496, "grad_norm": 5.739253520965576, "learning_rate": 4.2537414965986394e-05, "loss": 0.6969, "mean_token_accuracy": 0.8127654969692231, "num_tokens": 193904808.0, "step": 18700 }, { "entropy": 0.6836039662361145, "epoch": 0.14968, "grad_norm": 1.611159086227417, "learning_rate": 4.2533413365346144e-05, "loss": 0.6938, "mean_token_accuracy": 0.7843429386615753, "num_tokens": 194068648.0, "step": 18710 }, { "entropy": 0.6831209063529968, "epoch": 0.14976, "grad_norm": 5.149444103240967, "learning_rate": 4.252941176470588e-05, "loss": 0.6644, "mean_token_accuracy": 0.805613923072815, "num_tokens": 194170815.0, "step": 18720 }, { "entropy": 0.7409308552742004, "epoch": 0.14984, "grad_norm": 2.1351728439331055, "learning_rate": 4.2525410164065625e-05, "loss": 0.7432, "mean_token_accuracy": 0.7905377507209778, "num_tokens": 194265809.0, "step": 18730 }, { "entropy": 0.7644416093826294, "epoch": 0.14992, "grad_norm": 2.758338689804077, "learning_rate": 4.2521408563425376e-05, "loss": 0.7758, "mean_token_accuracy": 0.767414003610611, "num_tokens": 194408047.0, "step": 18740 }, { "entropy": 0.704254886507988, "epoch": 0.15, "grad_norm": 5.55208683013916, "learning_rate": 4.251740696278512e-05, "loss": 0.6892, "mean_token_accuracy": 0.8144555747509002, "num_tokens": 194449307.0, "step": 18750 }, { "entropy": 0.6592402338981629, "epoch": 0.15008, "grad_norm": 2.534287452697754, "learning_rate": 4.2513405362144856e-05, "loss": 0.6628, "mean_token_accuracy": 0.7897301912307739, "num_tokens": 194613127.0, "step": 18760 }, { "entropy": 0.6485491096973419, "epoch": 0.15016, "grad_norm": 3.2929108142852783, "learning_rate": 4.25094037615046e-05, "loss": 0.6495, "mean_token_accuracy": 0.8134187519550323, "num_tokens": 194695903.0, "step": 18770 }, { "entropy": 0.7434994518756867, "epoch": 0.15024, "grad_norm": 1.6057041883468628, "learning_rate": 4.250540216086435e-05, "loss": 0.7159, "mean_token_accuracy": 0.7978842616081238, "num_tokens": 194788940.0, "step": 18780 }, { "entropy": 0.688283383846283, "epoch": 0.15032, "grad_norm": 2.4404923915863037, "learning_rate": 4.2501400560224094e-05, "loss": 0.689, "mean_token_accuracy": 0.7911949157714844, "num_tokens": 194916386.0, "step": 18790 }, { "entropy": 0.8100628733634949, "epoch": 0.1504, "grad_norm": 4.466749668121338, "learning_rate": 4.249739895958383e-05, "loss": 0.8107, "mean_token_accuracy": 0.7955973625183106, "num_tokens": 194950130.0, "step": 18800 }, { "entropy": 0.6351143300533295, "epoch": 0.15048, "grad_norm": 2.019073009490967, "learning_rate": 4.249339735894358e-05, "loss": 0.6326, "mean_token_accuracy": 0.7981497406959533, "num_tokens": 195113970.0, "step": 18810 }, { "entropy": 0.713024452328682, "epoch": 0.15056, "grad_norm": 6.120118618011475, "learning_rate": 4.2489395758303325e-05, "loss": 0.7136, "mean_token_accuracy": 0.7919516146183014, "num_tokens": 195207716.0, "step": 18820 }, { "entropy": 0.7377299249172211, "epoch": 0.15064, "grad_norm": 1.8586524724960327, "learning_rate": 4.248539415766307e-05, "loss": 0.725, "mean_token_accuracy": 0.7956167995929718, "num_tokens": 195300600.0, "step": 18830 }, { "entropy": 0.7466140806674957, "epoch": 0.15072, "grad_norm": 2.282235622406006, "learning_rate": 4.2481392557022806e-05, "loss": 0.7458, "mean_token_accuracy": 0.7780785858631134, "num_tokens": 195437287.0, "step": 18840 }, { "entropy": 0.7257640540599823, "epoch": 0.1508, "grad_norm": 4.834536552429199, "learning_rate": 4.2477390956382556e-05, "loss": 0.736, "mean_token_accuracy": 0.8087976992130279, "num_tokens": 195473355.0, "step": 18850 }, { "entropy": 0.6335210621356964, "epoch": 0.15088, "grad_norm": 2.2238168716430664, "learning_rate": 4.24733893557423e-05, "loss": 0.632, "mean_token_accuracy": 0.7979970753192902, "num_tokens": 195637195.0, "step": 18860 }, { "entropy": 0.7397562026977539, "epoch": 0.15096, "grad_norm": 3.7101635932922363, "learning_rate": 4.2469387755102044e-05, "loss": 0.7349, "mean_token_accuracy": 0.7917893409729004, "num_tokens": 195715751.0, "step": 18870 }, { "entropy": 0.6919900894165039, "epoch": 0.15104, "grad_norm": 2.2636332511901855, "learning_rate": 4.246538615446179e-05, "loss": 0.7015, "mean_token_accuracy": 0.795883733034134, "num_tokens": 195809708.0, "step": 18880 }, { "entropy": 0.7179597795009613, "epoch": 0.15112, "grad_norm": 2.5499370098114014, "learning_rate": 4.246138455382153e-05, "loss": 0.7029, "mean_token_accuracy": 0.7851691603660583, "num_tokens": 195957435.0, "step": 18890 }, { "entropy": 0.7035428136587143, "epoch": 0.1512, "grad_norm": 4.988234996795654, "learning_rate": 4.2457382953181275e-05, "loss": 0.7099, "mean_token_accuracy": 0.8059496223926544, "num_tokens": 196005707.0, "step": 18900 }, { "entropy": 0.6873391628265381, "epoch": 0.15128, "grad_norm": 1.5640641450881958, "learning_rate": 4.245338135254102e-05, "loss": 0.6856, "mean_token_accuracy": 0.7872862756252289, "num_tokens": 196169547.0, "step": 18910 }, { "entropy": 0.7396698713302612, "epoch": 0.15136, "grad_norm": 3.5405170917510986, "learning_rate": 4.244937975190076e-05, "loss": 0.7322, "mean_token_accuracy": 0.793604850769043, "num_tokens": 196252881.0, "step": 18920 }, { "entropy": 0.7080940783023835, "epoch": 0.15144, "grad_norm": 2.48294997215271, "learning_rate": 4.2445378151260506e-05, "loss": 0.7254, "mean_token_accuracy": 0.7928863525390625, "num_tokens": 196346418.0, "step": 18930 }, { "entropy": 0.7091092467308044, "epoch": 0.15152, "grad_norm": 3.4301013946533203, "learning_rate": 4.244137655062025e-05, "loss": 0.6931, "mean_token_accuracy": 0.792174756526947, "num_tokens": 196478770.0, "step": 18940 }, { "entropy": 0.713330739736557, "epoch": 0.1516, "grad_norm": 5.105082988739014, "learning_rate": 4.2437374949979994e-05, "loss": 0.7121, "mean_token_accuracy": 0.8125756978988647, "num_tokens": 196515176.0, "step": 18950 }, { "entropy": 0.6464562892913819, "epoch": 0.15168, "grad_norm": 1.581397533416748, "learning_rate": 4.243337334933974e-05, "loss": 0.6536, "mean_token_accuracy": 0.7951503872871399, "num_tokens": 196678323.0, "step": 18960 }, { "entropy": 0.698421198129654, "epoch": 0.15176, "grad_norm": 3.3461697101593018, "learning_rate": 4.242937174869948e-05, "loss": 0.6922, "mean_token_accuracy": 0.8040730357170105, "num_tokens": 196751744.0, "step": 18970 }, { "entropy": 0.7868056535720825, "epoch": 0.15184, "grad_norm": 2.1324665546417236, "learning_rate": 4.2425370148059225e-05, "loss": 0.7818, "mean_token_accuracy": 0.785459703207016, "num_tokens": 196847306.0, "step": 18980 }, { "entropy": 0.7112065970897674, "epoch": 0.15192, "grad_norm": 2.653071403503418, "learning_rate": 4.242136854741897e-05, "loss": 0.7172, "mean_token_accuracy": 0.7813547492027283, "num_tokens": 196988920.0, "step": 18990 }, { "entropy": 0.7264602482318878, "epoch": 0.152, "grad_norm": 4.949150562286377, "learning_rate": 4.241736694677871e-05, "loss": 0.714, "mean_token_accuracy": 0.8084556519985199, "num_tokens": 197028320.0, "step": 19000 }, { "entropy": 0.6633932411670684, "epoch": 0.15208, "grad_norm": 1.676558494567871, "learning_rate": 4.2413365346138456e-05, "loss": 0.6615, "mean_token_accuracy": 0.7951200783252717, "num_tokens": 197189113.0, "step": 19010 }, { "entropy": 0.7065504640340805, "epoch": 0.15216, "grad_norm": 3.3805596828460693, "learning_rate": 4.2409363745498206e-05, "loss": 0.7037, "mean_token_accuracy": 0.8025711834430694, "num_tokens": 197256707.0, "step": 19020 }, { "entropy": 0.677063661813736, "epoch": 0.15224, "grad_norm": 1.676101565361023, "learning_rate": 4.240536214485794e-05, "loss": 0.6744, "mean_token_accuracy": 0.8026287913322449, "num_tokens": 197349075.0, "step": 19030 }, { "entropy": 0.6776481896638871, "epoch": 0.15232, "grad_norm": 2.8877198696136475, "learning_rate": 4.240136054421769e-05, "loss": 0.6765, "mean_token_accuracy": 0.7947644352912903, "num_tokens": 197488133.0, "step": 19040 }, { "entropy": 0.6748267084360122, "epoch": 0.1524, "grad_norm": 5.164557456970215, "learning_rate": 4.239735894357743e-05, "loss": 0.6738, "mean_token_accuracy": 0.8236766576766967, "num_tokens": 197528932.0, "step": 19050 }, { "entropy": 0.6359987080097198, "epoch": 0.15248, "grad_norm": 1.9818384647369385, "learning_rate": 4.239335734293718e-05, "loss": 0.6366, "mean_token_accuracy": 0.7969345510005951, "num_tokens": 197692772.0, "step": 19060 }, { "entropy": 0.6792991012334824, "epoch": 0.15256, "grad_norm": 3.6587953567504883, "learning_rate": 4.238935574229692e-05, "loss": 0.6727, "mean_token_accuracy": 0.800058513879776, "num_tokens": 197800190.0, "step": 19070 }, { "entropy": 0.7119033336639404, "epoch": 0.15264, "grad_norm": 1.6803432703018188, "learning_rate": 4.238535414165666e-05, "loss": 0.7147, "mean_token_accuracy": 0.7966416597366333, "num_tokens": 197896168.0, "step": 19080 }, { "entropy": 0.6679206490516663, "epoch": 0.15272, "grad_norm": 4.193465232849121, "learning_rate": 4.2381352541016406e-05, "loss": 0.6609, "mean_token_accuracy": 0.8014414668083191, "num_tokens": 198020100.0, "step": 19090 }, { "entropy": 0.6405259430408478, "epoch": 0.1528, "grad_norm": 5.6212639808654785, "learning_rate": 4.2377350940376156e-05, "loss": 0.6417, "mean_token_accuracy": 0.82822545170784, "num_tokens": 198054581.0, "step": 19100 }, { "entropy": 0.6395948946475982, "epoch": 0.15288, "grad_norm": 1.7270840406417847, "learning_rate": 4.237334933973589e-05, "loss": 0.6407, "mean_token_accuracy": 0.7966846108436585, "num_tokens": 198217317.0, "step": 19110 }, { "entropy": 0.6689329862594604, "epoch": 0.15296, "grad_norm": 3.001988410949707, "learning_rate": 4.236934773909564e-05, "loss": 0.6631, "mean_token_accuracy": 0.8121066212654113, "num_tokens": 198296794.0, "step": 19120 }, { "entropy": 0.7236101567745209, "epoch": 0.15304, "grad_norm": 1.6768136024475098, "learning_rate": 4.236534613845539e-05, "loss": 0.7296, "mean_token_accuracy": 0.7948003768920898, "num_tokens": 198389673.0, "step": 19130 }, { "entropy": 0.701026451587677, "epoch": 0.15312, "grad_norm": 2.5580360889434814, "learning_rate": 4.236134453781513e-05, "loss": 0.6882, "mean_token_accuracy": 0.7959494709968566, "num_tokens": 198520384.0, "step": 19140 }, { "entropy": 0.6730273187160491, "epoch": 0.1532, "grad_norm": 5.400633335113525, "learning_rate": 4.235734293717487e-05, "loss": 0.6833, "mean_token_accuracy": 0.8185671031475067, "num_tokens": 198551923.0, "step": 19150 }, { "entropy": 0.6697177171707154, "epoch": 0.15328, "grad_norm": 1.5990246534347534, "learning_rate": 4.235334133653461e-05, "loss": 0.6682, "mean_token_accuracy": 0.7860955119132995, "num_tokens": 198715763.0, "step": 19160 }, { "entropy": 0.725685316324234, "epoch": 0.15336, "grad_norm": 3.6420037746429443, "learning_rate": 4.234933973589436e-05, "loss": 0.7154, "mean_token_accuracy": 0.7995248973369599, "num_tokens": 198801137.0, "step": 19170 }, { "entropy": 0.712893283367157, "epoch": 0.15344, "grad_norm": 1.982788324356079, "learning_rate": 4.2345338135254106e-05, "loss": 0.724, "mean_token_accuracy": 0.7917198956012725, "num_tokens": 198895068.0, "step": 19180 }, { "entropy": 0.709039568901062, "epoch": 0.15352, "grad_norm": 2.320348024368286, "learning_rate": 4.234133653461384e-05, "loss": 0.7029, "mean_token_accuracy": 0.7877241373062134, "num_tokens": 199035362.0, "step": 19190 }, { "entropy": 0.7493283927440644, "epoch": 0.1536, "grad_norm": 5.125085353851318, "learning_rate": 4.233733493397359e-05, "loss": 0.7407, "mean_token_accuracy": 0.8060150146484375, "num_tokens": 199075987.0, "step": 19200 }, { "entropy": 0.6299128115177155, "epoch": 0.15368, "grad_norm": 1.9225174188613892, "learning_rate": 4.233333333333334e-05, "loss": 0.6336, "mean_token_accuracy": 0.7975024521350861, "num_tokens": 199239827.0, "step": 19210 }, { "entropy": 0.7166049897670745, "epoch": 0.15376, "grad_norm": 3.492375373840332, "learning_rate": 4.232933173269308e-05, "loss": 0.7043, "mean_token_accuracy": 0.7977482974529266, "num_tokens": 199328869.0, "step": 19220 }, { "entropy": 0.7100907593965531, "epoch": 0.15384, "grad_norm": 2.263200044631958, "learning_rate": 4.232533013205282e-05, "loss": 0.7228, "mean_token_accuracy": 0.7954891145229339, "num_tokens": 199422742.0, "step": 19230 }, { "entropy": 0.7438731074333191, "epoch": 0.15392, "grad_norm": 3.8734118938446045, "learning_rate": 4.232132853141257e-05, "loss": 0.7305, "mean_token_accuracy": 0.77861328125, "num_tokens": 199564390.0, "step": 19240 }, { "entropy": 0.6163742423057557, "epoch": 0.154, "grad_norm": 5.356444358825684, "learning_rate": 4.231732693077231e-05, "loss": 0.6088, "mean_token_accuracy": 0.8335274875164032, "num_tokens": 199604136.0, "step": 19250 }, { "entropy": 0.644869989156723, "epoch": 0.15408, "grad_norm": 2.1006970405578613, "learning_rate": 4.2313325330132055e-05, "loss": 0.6504, "mean_token_accuracy": 0.7927210628986359, "num_tokens": 199767976.0, "step": 19260 }, { "entropy": 0.6798153162002564, "epoch": 0.15416, "grad_norm": 3.876525640487671, "learning_rate": 4.23093237294918e-05, "loss": 0.6737, "mean_token_accuracy": 0.8067686200141907, "num_tokens": 199845667.0, "step": 19270 }, { "entropy": 0.7137039065361023, "epoch": 0.15424, "grad_norm": 1.6293325424194336, "learning_rate": 4.230532212885154e-05, "loss": 0.7141, "mean_token_accuracy": 0.7985394179821015, "num_tokens": 199939097.0, "step": 19280 }, { "entropy": 0.7467499852180481, "epoch": 0.15432, "grad_norm": 2.252523183822632, "learning_rate": 4.2301320528211287e-05, "loss": 0.7443, "mean_token_accuracy": 0.7782529950141907, "num_tokens": 200084052.0, "step": 19290 }, { "entropy": 0.7079242587089538, "epoch": 0.1544, "grad_norm": 5.446684837341309, "learning_rate": 4.229731892757103e-05, "loss": 0.7021, "mean_token_accuracy": 0.808152151107788, "num_tokens": 200122572.0, "step": 19300 }, { "entropy": 0.7082178056240082, "epoch": 0.15448, "grad_norm": 1.3922998905181885, "learning_rate": 4.2293317326930774e-05, "loss": 0.7151, "mean_token_accuracy": 0.7777723610401154, "num_tokens": 200286412.0, "step": 19310 }, { "entropy": 0.6543007403612137, "epoch": 0.15456, "grad_norm": 2.8440117835998535, "learning_rate": 4.228931572629052e-05, "loss": 0.6496, "mean_token_accuracy": 0.8097146630287171, "num_tokens": 200377411.0, "step": 19320 }, { "entropy": 0.7595893800258636, "epoch": 0.15464, "grad_norm": 2.9022114276885986, "learning_rate": 4.228531412565026e-05, "loss": 0.7669, "mean_token_accuracy": 0.7848247706890106, "num_tokens": 200470933.0, "step": 19330 }, { "entropy": 0.7211185038089752, "epoch": 0.15472, "grad_norm": 4.228939056396484, "learning_rate": 4.2281312525010005e-05, "loss": 0.7147, "mean_token_accuracy": 0.784790825843811, "num_tokens": 200607059.0, "step": 19340 }, { "entropy": 0.7344958037137985, "epoch": 0.1548, "grad_norm": 5.724233150482178, "learning_rate": 4.227731092436975e-05, "loss": 0.7253, "mean_token_accuracy": 0.8091524004936218, "num_tokens": 200644246.0, "step": 19350 }, { "entropy": 0.6697327792644501, "epoch": 0.15488, "grad_norm": 1.7182453870773315, "learning_rate": 4.227330932372949e-05, "loss": 0.67, "mean_token_accuracy": 0.786536306142807, "num_tokens": 200807268.0, "step": 19360 }, { "entropy": 0.7112932562828064, "epoch": 0.15496, "grad_norm": 2.863586902618408, "learning_rate": 4.2269307723089236e-05, "loss": 0.7036, "mean_token_accuracy": 0.7947294473648071, "num_tokens": 200890676.0, "step": 19370 }, { "entropy": 0.6879124045372009, "epoch": 0.15504, "grad_norm": 1.6815513372421265, "learning_rate": 4.226530612244898e-05, "loss": 0.6898, "mean_token_accuracy": 0.8014305233955383, "num_tokens": 200984817.0, "step": 19380 }, { "entropy": 0.6826489448547364, "epoch": 0.15512, "grad_norm": 1.8824827671051025, "learning_rate": 4.2261304521808724e-05, "loss": 0.6815, "mean_token_accuracy": 0.7894177079200745, "num_tokens": 201122996.0, "step": 19390 }, { "entropy": 0.6420043885707856, "epoch": 0.1552, "grad_norm": 5.111451148986816, "learning_rate": 4.225730292116847e-05, "loss": 0.6376, "mean_token_accuracy": 0.8209051549434662, "num_tokens": 201160880.0, "step": 19400 }, { "entropy": 0.6816224813461303, "epoch": 0.15528, "grad_norm": 1.92134690284729, "learning_rate": 4.225330132052822e-05, "loss": 0.684, "mean_token_accuracy": 0.7832132339477539, "num_tokens": 201324720.0, "step": 19410 }, { "entropy": 0.6431758165359497, "epoch": 0.15536, "grad_norm": 3.127551555633545, "learning_rate": 4.2249299719887955e-05, "loss": 0.639, "mean_token_accuracy": 0.80965576171875, "num_tokens": 201416537.0, "step": 19420 }, { "entropy": 0.7405179023742676, "epoch": 0.15544, "grad_norm": 2.292238473892212, "learning_rate": 4.22452981192477e-05, "loss": 0.7422, "mean_token_accuracy": 0.7869556427001954, "num_tokens": 201510432.0, "step": 19430 }, { "entropy": 0.7542095065116883, "epoch": 0.15552, "grad_norm": 3.312140464782715, "learning_rate": 4.224129651860744e-05, "loss": 0.7461, "mean_token_accuracy": 0.7786569416522979, "num_tokens": 201650876.0, "step": 19440 }, { "entropy": 0.7001890242099762, "epoch": 0.1556, "grad_norm": 4.935096263885498, "learning_rate": 4.223729491796719e-05, "loss": 0.6955, "mean_token_accuracy": 0.8097824454307556, "num_tokens": 201694272.0, "step": 19450 }, { "entropy": 0.7665621936321259, "epoch": 0.15568, "grad_norm": 2.610062599182129, "learning_rate": 4.223329331732693e-05, "loss": 0.766, "mean_token_accuracy": 0.7707682073116302, "num_tokens": 201858112.0, "step": 19460 }, { "entropy": 0.7182255983352661, "epoch": 0.15576, "grad_norm": 2.5512301921844482, "learning_rate": 4.2229291716686673e-05, "loss": 0.7085, "mean_token_accuracy": 0.796176016330719, "num_tokens": 201948089.0, "step": 19470 }, { "entropy": 0.7558573484420776, "epoch": 0.15584, "grad_norm": 1.8710399866104126, "learning_rate": 4.2225290116046424e-05, "loss": 0.7521, "mean_token_accuracy": 0.786397272348404, "num_tokens": 202043391.0, "step": 19480 }, { "entropy": 0.6509985029697418, "epoch": 0.15592, "grad_norm": 2.4351141452789307, "learning_rate": 4.222128851540617e-05, "loss": 0.6488, "mean_token_accuracy": 0.7967620253562927, "num_tokens": 202192743.0, "step": 19490 }, { "entropy": 0.6224234402179718, "epoch": 0.156, "grad_norm": 4.860520362854004, "learning_rate": 4.2217286914765905e-05, "loss": 0.6334, "mean_token_accuracy": 0.8236030638217926, "num_tokens": 202242339.0, "step": 19500 }, { "entropy": 0.698481160402298, "epoch": 0.15608, "grad_norm": 1.8703315258026123, "learning_rate": 4.221328531412565e-05, "loss": 0.6943, "mean_token_accuracy": 0.7797508656978607, "num_tokens": 202406179.0, "step": 19510 }, { "entropy": 0.6230564057826996, "epoch": 0.15616, "grad_norm": 3.967972755432129, "learning_rate": 4.22092837134854e-05, "loss": 0.6248, "mean_token_accuracy": 0.8202518820762634, "num_tokens": 202487908.0, "step": 19520 }, { "entropy": 0.6970887124538422, "epoch": 0.15624, "grad_norm": 1.5502827167510986, "learning_rate": 4.220528211284514e-05, "loss": 0.7105, "mean_token_accuracy": 0.7946359276771545, "num_tokens": 202581792.0, "step": 19530 }, { "entropy": 0.6995569050312043, "epoch": 0.15632, "grad_norm": 1.9446771144866943, "learning_rate": 4.220128051220488e-05, "loss": 0.6932, "mean_token_accuracy": 0.7842546761035919, "num_tokens": 202727464.0, "step": 19540 }, { "entropy": 0.7128632128238678, "epoch": 0.1564, "grad_norm": 4.232076644897461, "learning_rate": 4.219727891156463e-05, "loss": 0.693, "mean_token_accuracy": 0.8123300790786743, "num_tokens": 202764075.0, "step": 19550 }, { "entropy": 0.6697188079357147, "epoch": 0.15648, "grad_norm": 1.7868459224700928, "learning_rate": 4.2193277310924374e-05, "loss": 0.6731, "mean_token_accuracy": 0.78935027718544, "num_tokens": 202927915.0, "step": 19560 }, { "entropy": 0.7008773446083069, "epoch": 0.15656, "grad_norm": 2.614778757095337, "learning_rate": 4.218927571028412e-05, "loss": 0.6963, "mean_token_accuracy": 0.7972956120967865, "num_tokens": 203031032.0, "step": 19570 }, { "entropy": 0.7313461899757385, "epoch": 0.15664, "grad_norm": 1.9885919094085693, "learning_rate": 4.2185274109643854e-05, "loss": 0.7419, "mean_token_accuracy": 0.7931115984916687, "num_tokens": 203128160.0, "step": 19580 }, { "entropy": 0.6779391825199127, "epoch": 0.15672, "grad_norm": 2.3926239013671875, "learning_rate": 4.2181272509003605e-05, "loss": 0.6732, "mean_token_accuracy": 0.7938078105449676, "num_tokens": 203262609.0, "step": 19590 }, { "entropy": 0.7222388565540314, "epoch": 0.1568, "grad_norm": 5.933003902435303, "learning_rate": 4.217727090836335e-05, "loss": 0.7378, "mean_token_accuracy": 0.8096826434135437, "num_tokens": 203294897.0, "step": 19600 }, { "entropy": 0.652388846874237, "epoch": 0.15688, "grad_norm": 1.747644305229187, "learning_rate": 4.217326930772309e-05, "loss": 0.6483, "mean_token_accuracy": 0.7943209648132324, "num_tokens": 203458737.0, "step": 19610 }, { "entropy": 0.6800316214561463, "epoch": 0.15696, "grad_norm": 3.3127083778381348, "learning_rate": 4.2169267707082836e-05, "loss": 0.672, "mean_token_accuracy": 0.8078381657600403, "num_tokens": 203548943.0, "step": 19620 }, { "entropy": 0.6584042370319366, "epoch": 0.15704, "grad_norm": 1.8123185634613037, "learning_rate": 4.216526610644258e-05, "loss": 0.6724, "mean_token_accuracy": 0.8035368263721466, "num_tokens": 203644062.0, "step": 19630 }, { "entropy": 0.7530373930931091, "epoch": 0.15712, "grad_norm": 2.544311285018921, "learning_rate": 4.216126450580232e-05, "loss": 0.739, "mean_token_accuracy": 0.7743339478969574, "num_tokens": 203783670.0, "step": 19640 }, { "entropy": 0.7966648250818252, "epoch": 0.1572, "grad_norm": 6.299690246582031, "learning_rate": 4.215726290516207e-05, "loss": 0.7899, "mean_token_accuracy": 0.7938412070274353, "num_tokens": 203821346.0, "step": 19650 }, { "entropy": 0.6870499551296234, "epoch": 0.15728, "grad_norm": 1.97413170337677, "learning_rate": 4.215326130452181e-05, "loss": 0.6924, "mean_token_accuracy": 0.7842019677162171, "num_tokens": 203985176.0, "step": 19660 }, { "entropy": 0.7172618508338928, "epoch": 0.15736, "grad_norm": 3.5785481929779053, "learning_rate": 4.2149259703881554e-05, "loss": 0.7094, "mean_token_accuracy": 0.7972020983695984, "num_tokens": 204063579.0, "step": 19670 }, { "entropy": 0.7699088275432586, "epoch": 0.15744, "grad_norm": 2.793287754058838, "learning_rate": 4.21452581032413e-05, "loss": 0.7728, "mean_token_accuracy": 0.7856050133705139, "num_tokens": 204156745.0, "step": 19680 }, { "entropy": 0.6631784498691559, "epoch": 0.15752, "grad_norm": 2.4335780143737793, "learning_rate": 4.214125650260104e-05, "loss": 0.6609, "mean_token_accuracy": 0.794046014547348, "num_tokens": 204292303.0, "step": 19690 }, { "entropy": 0.693471497297287, "epoch": 0.1576, "grad_norm": 5.316162586212158, "learning_rate": 4.2137254901960786e-05, "loss": 0.6837, "mean_token_accuracy": 0.8156568884849549, "num_tokens": 204328539.0, "step": 19700 }, { "entropy": 0.6465562105178833, "epoch": 0.15768, "grad_norm": 1.4568153619766235, "learning_rate": 4.213325330132053e-05, "loss": 0.6482, "mean_token_accuracy": 0.7928203999996185, "num_tokens": 204492023.0, "step": 19710 }, { "entropy": 0.7241931259632111, "epoch": 0.15776, "grad_norm": 3.7554190158843994, "learning_rate": 4.212925170068027e-05, "loss": 0.7131, "mean_token_accuracy": 0.8023994624614715, "num_tokens": 204571405.0, "step": 19720 }, { "entropy": 0.7386159539222718, "epoch": 0.15784, "grad_norm": 2.023409366607666, "learning_rate": 4.2125250100040024e-05, "loss": 0.7629, "mean_token_accuracy": 0.7867858648300171, "num_tokens": 204664988.0, "step": 19730 }, { "entropy": 0.7463280439376831, "epoch": 0.15792, "grad_norm": 2.3529040813446045, "learning_rate": 4.212124849939976e-05, "loss": 0.7362, "mean_token_accuracy": 0.7781756043434143, "num_tokens": 204809629.0, "step": 19740 }, { "entropy": 0.6823469817638397, "epoch": 0.158, "grad_norm": 4.835700511932373, "learning_rate": 4.2117246898759504e-05, "loss": 0.6851, "mean_token_accuracy": 0.8094355642795563, "num_tokens": 204856403.0, "step": 19750 }, { "entropy": 0.6258834838867188, "epoch": 0.15808, "grad_norm": 1.9782274961471558, "learning_rate": 4.211324529811925e-05, "loss": 0.6277, "mean_token_accuracy": 0.8002338349819184, "num_tokens": 205019198.0, "step": 19760 }, { "entropy": 0.627811861038208, "epoch": 0.15816, "grad_norm": 4.075545310974121, "learning_rate": 4.2109243697479e-05, "loss": 0.6065, "mean_token_accuracy": 0.8245073854923248, "num_tokens": 205095203.0, "step": 19770 }, { "entropy": 0.7167203783988952, "epoch": 0.15824, "grad_norm": 2.2478179931640625, "learning_rate": 4.2105242096838735e-05, "loss": 0.7365, "mean_token_accuracy": 0.7935724973678588, "num_tokens": 205188189.0, "step": 19780 }, { "entropy": 0.7171307623386383, "epoch": 0.15832, "grad_norm": 2.568040609359741, "learning_rate": 4.210124049619848e-05, "loss": 0.7072, "mean_token_accuracy": 0.7877768337726593, "num_tokens": 205324055.0, "step": 19790 }, { "entropy": 0.6883418500423432, "epoch": 0.1584, "grad_norm": 5.136399745941162, "learning_rate": 4.209723889555823e-05, "loss": 0.6821, "mean_token_accuracy": 0.8200072169303894, "num_tokens": 205363016.0, "step": 19800 }, { "entropy": 0.6806117773056031, "epoch": 0.15848, "grad_norm": 1.7838847637176514, "learning_rate": 4.209323729491797e-05, "loss": 0.6815, "mean_token_accuracy": 0.7882977426052094, "num_tokens": 205523778.0, "step": 19810 }, { "entropy": 0.6919850051403046, "epoch": 0.15856, "grad_norm": 3.569631814956665, "learning_rate": 4.208923569427771e-05, "loss": 0.6866, "mean_token_accuracy": 0.8072648346424103, "num_tokens": 205591791.0, "step": 19820 }, { "entropy": 0.7123517572879792, "epoch": 0.15864, "grad_norm": 2.0135209560394287, "learning_rate": 4.2085234093637454e-05, "loss": 0.7191, "mean_token_accuracy": 0.7988763332366944, "num_tokens": 205683568.0, "step": 19830 }, { "entropy": 0.7435868084430695, "epoch": 0.15872, "grad_norm": 2.6366422176361084, "learning_rate": 4.2081232492997204e-05, "loss": 0.7447, "mean_token_accuracy": 0.7753384232521057, "num_tokens": 205830579.0, "step": 19840 }, { "entropy": 0.7018171131610871, "epoch": 0.1588, "grad_norm": 5.094723701477051, "learning_rate": 4.207723089235695e-05, "loss": 0.6812, "mean_token_accuracy": 0.8191541612148285, "num_tokens": 205874037.0, "step": 19850 }, { "entropy": 0.6592819809913635, "epoch": 0.15888, "grad_norm": 3.089578151702881, "learning_rate": 4.2073229291716685e-05, "loss": 0.6591, "mean_token_accuracy": 0.7914570152759552, "num_tokens": 206037877.0, "step": 19860 }, { "entropy": 0.6626061230897904, "epoch": 0.15896, "grad_norm": 3.9876773357391357, "learning_rate": 4.2069227691076435e-05, "loss": 0.6599, "mean_token_accuracy": 0.8088712930679322, "num_tokens": 206122487.0, "step": 19870 }, { "entropy": 0.6507070481777191, "epoch": 0.15904, "grad_norm": 2.580434799194336, "learning_rate": 4.206522609043618e-05, "loss": 0.6513, "mean_token_accuracy": 0.8118492126464844, "num_tokens": 206215581.0, "step": 19880 }, { "entropy": 0.7038115084171295, "epoch": 0.15912, "grad_norm": 3.0502352714538574, "learning_rate": 4.206122448979592e-05, "loss": 0.7058, "mean_token_accuracy": 0.7882644593715668, "num_tokens": 206343780.0, "step": 19890 }, { "entropy": 0.626172450184822, "epoch": 0.1592, "grad_norm": 5.992545127868652, "learning_rate": 4.205722288915566e-05, "loss": 0.6233, "mean_token_accuracy": 0.8255122363567352, "num_tokens": 206378875.0, "step": 19900 }, { "entropy": 0.6809434950351715, "epoch": 0.15928, "grad_norm": 2.478912115097046, "learning_rate": 4.205322128851541e-05, "loss": 0.672, "mean_token_accuracy": 0.7909763336181641, "num_tokens": 206541079.0, "step": 19910 }, { "entropy": 0.6646119356155396, "epoch": 0.15936, "grad_norm": 3.7207558155059814, "learning_rate": 4.2049219687875154e-05, "loss": 0.6715, "mean_token_accuracy": 0.8160078346729278, "num_tokens": 206612831.0, "step": 19920 }, { "entropy": 0.7476114094257355, "epoch": 0.15944, "grad_norm": 1.7163987159729004, "learning_rate": 4.20452180872349e-05, "loss": 0.7622, "mean_token_accuracy": 0.783966863155365, "num_tokens": 206705305.0, "step": 19930 }, { "entropy": 0.78771693110466, "epoch": 0.15952, "grad_norm": 3.4780056476593018, "learning_rate": 4.204121648659464e-05, "loss": 0.7824, "mean_token_accuracy": 0.7691136181354523, "num_tokens": 206834402.0, "step": 19940 }, { "entropy": 0.6984871745109558, "epoch": 0.1596, "grad_norm": 6.435704708099365, "learning_rate": 4.2037214885954385e-05, "loss": 0.6801, "mean_token_accuracy": 0.8201280415058136, "num_tokens": 206869466.0, "step": 19950 }, { "entropy": 0.691175889968872, "epoch": 0.15968, "grad_norm": 1.9330073595046997, "learning_rate": 4.203321328531413e-05, "loss": 0.6871, "mean_token_accuracy": 0.7856192111968994, "num_tokens": 207033306.0, "step": 19960 }, { "entropy": 0.6610776066780091, "epoch": 0.15976, "grad_norm": 2.8004467487335205, "learning_rate": 4.202921168467387e-05, "loss": 0.6577, "mean_token_accuracy": 0.8108467578887939, "num_tokens": 207116230.0, "step": 19970 }, { "entropy": 0.6125957489013671, "epoch": 0.15984, "grad_norm": 2.2842023372650146, "learning_rate": 4.2025210084033616e-05, "loss": 0.6243, "mean_token_accuracy": 0.8168668806552887, "num_tokens": 207210113.0, "step": 19980 }, { "entropy": 0.6672385573387146, "epoch": 0.15992, "grad_norm": 2.552464485168457, "learning_rate": 4.202120848339336e-05, "loss": 0.6511, "mean_token_accuracy": 0.7998051822185517, "num_tokens": 207345441.0, "step": 19990 }, { "entropy": 0.7184413880109787, "epoch": 0.16, "grad_norm": 5.157080173492432, "learning_rate": 4.2017206882753104e-05, "loss": 0.7399, "mean_token_accuracy": 0.8071281313896179, "num_tokens": 207379091.0, "step": 20000 }, { "entropy": 0.6370835542678833, "epoch": 0.16008, "grad_norm": 1.445781946182251, "learning_rate": 4.201320528211285e-05, "loss": 0.639, "mean_token_accuracy": 0.7963483154773712, "num_tokens": 207542931.0, "step": 20010 }, { "entropy": 0.7083247423171997, "epoch": 0.16016, "grad_norm": 3.146176338195801, "learning_rate": 4.200920368147259e-05, "loss": 0.688, "mean_token_accuracy": 0.802782016992569, "num_tokens": 207631060.0, "step": 20020 }, { "entropy": 0.7227232158184052, "epoch": 0.16024, "grad_norm": 1.732862114906311, "learning_rate": 4.2005202080832335e-05, "loss": 0.728, "mean_token_accuracy": 0.7953341662883758, "num_tokens": 207724087.0, "step": 20030 }, { "entropy": 0.6702997267246247, "epoch": 0.16032, "grad_norm": 3.1094632148742676, "learning_rate": 4.200120048019208e-05, "loss": 0.6686, "mean_token_accuracy": 0.795078432559967, "num_tokens": 207859496.0, "step": 20040 }, { "entropy": 0.673788595199585, "epoch": 0.1604, "grad_norm": 5.409927845001221, "learning_rate": 4.199719887955182e-05, "loss": 0.6743, "mean_token_accuracy": 0.8106744170188904, "num_tokens": 207899082.0, "step": 20050 }, { "entropy": 0.6774066984653473, "epoch": 0.16048, "grad_norm": 1.40484619140625, "learning_rate": 4.1993197278911566e-05, "loss": 0.6737, "mean_token_accuracy": 0.7877575278282165, "num_tokens": 208062911.0, "step": 20060 }, { "entropy": 0.6743424981832504, "epoch": 0.16056, "grad_norm": 4.083399772644043, "learning_rate": 4.198919567827131e-05, "loss": 0.6804, "mean_token_accuracy": 0.8052649378776551, "num_tokens": 208148965.0, "step": 20070 }, { "entropy": 0.6861073136329651, "epoch": 0.16064, "grad_norm": 2.0174033641815186, "learning_rate": 4.198519407763106e-05, "loss": 0.6874, "mean_token_accuracy": 0.8000091254711151, "num_tokens": 208243433.0, "step": 20080 }, { "entropy": 0.6905926406383515, "epoch": 0.16072, "grad_norm": 2.056384563446045, "learning_rate": 4.19811924769908e-05, "loss": 0.6851, "mean_token_accuracy": 0.7880434691905975, "num_tokens": 208386770.0, "step": 20090 }, { "entropy": 0.7201395630836487, "epoch": 0.1608, "grad_norm": 4.33507776260376, "learning_rate": 4.197719087635054e-05, "loss": 0.6991, "mean_token_accuracy": 0.8129392266273499, "num_tokens": 208427433.0, "step": 20100 }, { "entropy": 0.6205820292234421, "epoch": 0.16088, "grad_norm": 1.9771647453308105, "learning_rate": 4.1973189275710285e-05, "loss": 0.6238, "mean_token_accuracy": 0.7974352717399598, "num_tokens": 208591273.0, "step": 20110 }, { "entropy": 0.6592480629682541, "epoch": 0.16096, "grad_norm": 3.6091184616088867, "learning_rate": 4.1969187675070035e-05, "loss": 0.6591, "mean_token_accuracy": 0.812103658914566, "num_tokens": 208684604.0, "step": 20120 }, { "entropy": 0.6660028696060181, "epoch": 0.16104, "grad_norm": 2.23952054977417, "learning_rate": 4.196518607442977e-05, "loss": 0.6764, "mean_token_accuracy": 0.803732281923294, "num_tokens": 208778748.0, "step": 20130 }, { "entropy": 0.6424860179424285, "epoch": 0.16112, "grad_norm": 2.326355218887329, "learning_rate": 4.1961184473789516e-05, "loss": 0.6322, "mean_token_accuracy": 0.8021600425243378, "num_tokens": 208920840.0, "step": 20140 }, { "entropy": 0.6936331361532211, "epoch": 0.1612, "grad_norm": 5.136407852172852, "learning_rate": 4.1957182873149266e-05, "loss": 0.6901, "mean_token_accuracy": 0.8108067333698272, "num_tokens": 208960393.0, "step": 20150 }, { "entropy": 0.6360230892896652, "epoch": 0.16128, "grad_norm": 1.5032703876495361, "learning_rate": 4.195318127250901e-05, "loss": 0.6437, "mean_token_accuracy": 0.7949743568897247, "num_tokens": 209124233.0, "step": 20160 }, { "entropy": 0.6952445030212402, "epoch": 0.16136, "grad_norm": 3.4735777378082275, "learning_rate": 4.194917967186875e-05, "loss": 0.6878, "mean_token_accuracy": 0.8021711647510529, "num_tokens": 209210760.0, "step": 20170 }, { "entropy": 0.6908344864845276, "epoch": 0.16144, "grad_norm": 1.4610068798065186, "learning_rate": 4.194517807122849e-05, "loss": 0.689, "mean_token_accuracy": 0.8015122950077057, "num_tokens": 209304970.0, "step": 20180 }, { "entropy": 0.7062107801437378, "epoch": 0.16152, "grad_norm": 2.535301446914673, "learning_rate": 4.194117647058824e-05, "loss": 0.6967, "mean_token_accuracy": 0.7917699337005615, "num_tokens": 209443270.0, "step": 20190 }, { "entropy": 0.679357385635376, "epoch": 0.1616, "grad_norm": 4.6629557609558105, "learning_rate": 4.1937174869947985e-05, "loss": 0.6739, "mean_token_accuracy": 0.8183404922485351, "num_tokens": 209481026.0, "step": 20200 }, { "entropy": 0.6959427118301391, "epoch": 0.16168, "grad_norm": 1.7396736145019531, "learning_rate": 4.193317326930772e-05, "loss": 0.6959, "mean_token_accuracy": 0.7832403481006622, "num_tokens": 209644634.0, "step": 20210 }, { "entropy": 0.6768202781677246, "epoch": 0.16176, "grad_norm": 3.1166818141937256, "learning_rate": 4.1929171668667465e-05, "loss": 0.672, "mean_token_accuracy": 0.8047387897968292, "num_tokens": 209727652.0, "step": 20220 }, { "entropy": 0.7161984264850616, "epoch": 0.16184, "grad_norm": 1.8052020072937012, "learning_rate": 4.1925170068027216e-05, "loss": 0.728, "mean_token_accuracy": 0.7944311439990998, "num_tokens": 209821132.0, "step": 20230 }, { "entropy": 0.6829452335834503, "epoch": 0.16192, "grad_norm": 2.009112596511841, "learning_rate": 4.192116846738696e-05, "loss": 0.6721, "mean_token_accuracy": 0.7909090220928192, "num_tokens": 209966149.0, "step": 20240 }, { "entropy": 0.7182624518871308, "epoch": 0.162, "grad_norm": 5.373078346252441, "learning_rate": 4.1917166866746697e-05, "loss": 0.7249, "mean_token_accuracy": 0.8089606702327728, "num_tokens": 210007740.0, "step": 20250 }, { "entropy": 0.6506197392940521, "epoch": 0.16208, "grad_norm": 1.60433828830719, "learning_rate": 4.191316526610645e-05, "loss": 0.6545, "mean_token_accuracy": 0.7923104107379914, "num_tokens": 210171551.0, "step": 20260 }, { "entropy": 0.7206071615219116, "epoch": 0.16216, "grad_norm": 3.148545026779175, "learning_rate": 4.190916366546619e-05, "loss": 0.7095, "mean_token_accuracy": 0.7982850730419159, "num_tokens": 210248135.0, "step": 20270 }, { "entropy": 0.6882496118545532, "epoch": 0.16224, "grad_norm": 2.234180212020874, "learning_rate": 4.1905162064825935e-05, "loss": 0.7073, "mean_token_accuracy": 0.7991973102092743, "num_tokens": 210340216.0, "step": 20280 }, { "entropy": 0.7024528980255127, "epoch": 0.16232, "grad_norm": 3.229355573654175, "learning_rate": 4.190116046418567e-05, "loss": 0.6861, "mean_token_accuracy": 0.7913197100162506, "num_tokens": 210479037.0, "step": 20290 }, { "entropy": 0.6943091690540314, "epoch": 0.1624, "grad_norm": 5.08136510848999, "learning_rate": 4.189715886354542e-05, "loss": 0.6943, "mean_token_accuracy": 0.8154527604579925, "num_tokens": 210516517.0, "step": 20300 }, { "entropy": 0.6818770825862884, "epoch": 0.16248, "grad_norm": 1.5381221771240234, "learning_rate": 4.1893157262905166e-05, "loss": 0.6814, "mean_token_accuracy": 0.7870927035808564, "num_tokens": 210679704.0, "step": 20310 }, { "entropy": 0.6692463219165802, "epoch": 0.16256, "grad_norm": 3.9337501525878906, "learning_rate": 4.188915566226491e-05, "loss": 0.6616, "mean_token_accuracy": 0.8113745629787446, "num_tokens": 210754290.0, "step": 20320 }, { "entropy": 0.7130683839321137, "epoch": 0.16264, "grad_norm": 1.8732097148895264, "learning_rate": 4.188515406162465e-05, "loss": 0.7373, "mean_token_accuracy": 0.7964827477931976, "num_tokens": 210846691.0, "step": 20330 }, { "entropy": 0.7192935645580292, "epoch": 0.16272, "grad_norm": 2.455526828765869, "learning_rate": 4.18811524609844e-05, "loss": 0.6998, "mean_token_accuracy": 0.7894566237926484, "num_tokens": 210985111.0, "step": 20340 }, { "entropy": 0.6784464538097381, "epoch": 0.1628, "grad_norm": 4.975406646728516, "learning_rate": 4.187715086034414e-05, "loss": 0.6714, "mean_token_accuracy": 0.8159193694591522, "num_tokens": 211025617.0, "step": 20350 }, { "entropy": 0.637724506855011, "epoch": 0.16288, "grad_norm": 1.940281629562378, "learning_rate": 4.1873149259703884e-05, "loss": 0.648, "mean_token_accuracy": 0.792116516828537, "num_tokens": 211189457.0, "step": 20360 }, { "entropy": 0.6508484095335006, "epoch": 0.16296, "grad_norm": 3.644695520401001, "learning_rate": 4.186914765906363e-05, "loss": 0.636, "mean_token_accuracy": 0.8145866096019745, "num_tokens": 211278054.0, "step": 20370 }, { "entropy": 0.7036954641342164, "epoch": 0.16304, "grad_norm": 2.0119564533233643, "learning_rate": 4.186514605842337e-05, "loss": 0.7211, "mean_token_accuracy": 0.7970732390880585, "num_tokens": 211372932.0, "step": 20380 }, { "entropy": 0.6881662607192993, "epoch": 0.16312, "grad_norm": 3.1855857372283936, "learning_rate": 4.1861144457783115e-05, "loss": 0.6825, "mean_token_accuracy": 0.7947479307651519, "num_tokens": 211509895.0, "step": 20390 }, { "entropy": 0.6940005362033844, "epoch": 0.1632, "grad_norm": 5.898284912109375, "learning_rate": 4.185714285714286e-05, "loss": 0.6805, "mean_token_accuracy": 0.8214118421077728, "num_tokens": 211545524.0, "step": 20400 }, { "entropy": 0.6974820911884307, "epoch": 0.16328, "grad_norm": 2.1546592712402344, "learning_rate": 4.18531412565026e-05, "loss": 0.7019, "mean_token_accuracy": 0.7810698568820953, "num_tokens": 211709364.0, "step": 20410 }, { "entropy": 0.6268428295850754, "epoch": 0.16336, "grad_norm": 3.6039323806762695, "learning_rate": 4.1849139655862347e-05, "loss": 0.6214, "mean_token_accuracy": 0.8151175856590271, "num_tokens": 211798125.0, "step": 20420 }, { "entropy": 0.6759274482727051, "epoch": 0.16344, "grad_norm": 1.983149528503418, "learning_rate": 4.184513805522209e-05, "loss": 0.6662, "mean_token_accuracy": 0.8073321282863617, "num_tokens": 211893758.0, "step": 20430 }, { "entropy": 0.7790165603160858, "epoch": 0.16352, "grad_norm": 2.4336628913879395, "learning_rate": 4.1841136454581834e-05, "loss": 0.7789, "mean_token_accuracy": 0.7696840047836304, "num_tokens": 212032109.0, "step": 20440 }, { "entropy": 0.7399724245071411, "epoch": 0.1636, "grad_norm": 4.757774829864502, "learning_rate": 4.183713485394158e-05, "loss": 0.7333, "mean_token_accuracy": 0.8083510220050811, "num_tokens": 212069077.0, "step": 20450 }, { "entropy": 0.6635586500167847, "epoch": 0.16368, "grad_norm": 1.7657848596572876, "learning_rate": 4.183313325330132e-05, "loss": 0.6672, "mean_token_accuracy": 0.790443342924118, "num_tokens": 212232917.0, "step": 20460 }, { "entropy": 0.7407928466796875, "epoch": 0.16376, "grad_norm": 4.923123359680176, "learning_rate": 4.182913165266107e-05, "loss": 0.7372, "mean_token_accuracy": 0.7951801240444183, "num_tokens": 212317614.0, "step": 20470 }, { "entropy": 0.7633149683475494, "epoch": 0.16384, "grad_norm": 3.226896047592163, "learning_rate": 4.182513005202081e-05, "loss": 0.7593, "mean_token_accuracy": 0.7882200658321381, "num_tokens": 212410412.0, "step": 20480 }, { "entropy": 0.7439690828323364, "epoch": 0.16392, "grad_norm": 2.4646925926208496, "learning_rate": 4.182112845138055e-05, "loss": 0.7412, "mean_token_accuracy": 0.7784423828125, "num_tokens": 212539684.0, "step": 20490 }, { "entropy": 0.6788719058036804, "epoch": 0.164, "grad_norm": 5.5932536125183105, "learning_rate": 4.1817126850740296e-05, "loss": 0.6708, "mean_token_accuracy": 0.8200817286968232, "num_tokens": 212577037.0, "step": 20500 }, { "entropy": 0.6913000345230103, "epoch": 0.16408, "grad_norm": 1.5566015243530273, "learning_rate": 4.181312525010005e-05, "loss": 0.6966, "mean_token_accuracy": 0.7805324971675873, "num_tokens": 212740877.0, "step": 20510 }, { "entropy": 0.7061049968004227, "epoch": 0.16416, "grad_norm": 3.0054495334625244, "learning_rate": 4.1809123649459784e-05, "loss": 0.6989, "mean_token_accuracy": 0.7959836542606353, "num_tokens": 212828871.0, "step": 20520 }, { "entropy": 0.7781460523605347, "epoch": 0.16424, "grad_norm": 2.5270774364471436, "learning_rate": 4.180512204881953e-05, "loss": 0.784, "mean_token_accuracy": 0.7819414377212525, "num_tokens": 212923164.0, "step": 20530 }, { "entropy": 0.6899608135223388, "epoch": 0.16432, "grad_norm": 2.388516664505005, "learning_rate": 4.180112044817928e-05, "loss": 0.6975, "mean_token_accuracy": 0.7880273997783661, "num_tokens": 213068298.0, "step": 20540 }, { "entropy": 0.6900661051273346, "epoch": 0.1644, "grad_norm": 6.213988780975342, "learning_rate": 4.179711884753902e-05, "loss": 0.6737, "mean_token_accuracy": 0.8203777194023132, "num_tokens": 213105818.0, "step": 20550 }, { "entropy": 0.7371955752372742, "epoch": 0.16448, "grad_norm": 2.3266193866729736, "learning_rate": 4.179311724689876e-05, "loss": 0.7348, "mean_token_accuracy": 0.7769840776920318, "num_tokens": 213265611.0, "step": 20560 }, { "entropy": 0.6160520583391189, "epoch": 0.16456, "grad_norm": 3.119377374649048, "learning_rate": 4.17891156462585e-05, "loss": 0.6132, "mean_token_accuracy": 0.8214216113090516, "num_tokens": 213339607.0, "step": 20570 }, { "entropy": 0.7512570738792419, "epoch": 0.16464, "grad_norm": 1.7472864389419556, "learning_rate": 4.178511404561825e-05, "loss": 0.749, "mean_token_accuracy": 0.7906299293041229, "num_tokens": 213432509.0, "step": 20580 }, { "entropy": 0.7381763875484466, "epoch": 0.16472, "grad_norm": 3.125652313232422, "learning_rate": 4.1781112444977996e-05, "loss": 0.7358, "mean_token_accuracy": 0.7824206471443176, "num_tokens": 213565086.0, "step": 20590 }, { "entropy": 0.6892926633358002, "epoch": 0.1648, "grad_norm": 4.564051628112793, "learning_rate": 4.177711084433773e-05, "loss": 0.6851, "mean_token_accuracy": 0.815417867898941, "num_tokens": 213604466.0, "step": 20600 }, { "entropy": 0.6819506168365479, "epoch": 0.16488, "grad_norm": 2.0446810722351074, "learning_rate": 4.1773109243697484e-05, "loss": 0.6786, "mean_token_accuracy": 0.7888173878192901, "num_tokens": 213767867.0, "step": 20610 }, { "entropy": 0.6694641888141633, "epoch": 0.16496, "grad_norm": 4.091919898986816, "learning_rate": 4.176910764305723e-05, "loss": 0.6537, "mean_token_accuracy": 0.8109238743782043, "num_tokens": 213849199.0, "step": 20620 }, { "entropy": 0.7356207489967346, "epoch": 0.16504, "grad_norm": 1.7855883836746216, "learning_rate": 4.176510604241697e-05, "loss": 0.7433, "mean_token_accuracy": 0.7904536068439484, "num_tokens": 213942945.0, "step": 20630 }, { "entropy": 0.7168935000896454, "epoch": 0.16512, "grad_norm": 2.176952600479126, "learning_rate": 4.176110444177671e-05, "loss": 0.6989, "mean_token_accuracy": 0.7886417031288147, "num_tokens": 214080212.0, "step": 20640 }, { "entropy": 0.6301255345344543, "epoch": 0.1652, "grad_norm": 5.2596354484558105, "learning_rate": 4.175710284113646e-05, "loss": 0.6406, "mean_token_accuracy": 0.8228830397129059, "num_tokens": 214120866.0, "step": 20650 }, { "entropy": 0.6948278844356537, "epoch": 0.16528, "grad_norm": 1.741429090499878, "learning_rate": 4.17531012404962e-05, "loss": 0.6954, "mean_token_accuracy": 0.7889791429042816, "num_tokens": 214284496.0, "step": 20660 }, { "entropy": 0.7030101656913758, "epoch": 0.16536, "grad_norm": 3.468686580657959, "learning_rate": 4.1749099639855946e-05, "loss": 0.6922, "mean_token_accuracy": 0.8053121626377105, "num_tokens": 214361250.0, "step": 20670 }, { "entropy": 0.6868538141250611, "epoch": 0.16544, "grad_norm": 1.886333703994751, "learning_rate": 4.174509803921569e-05, "loss": 0.6885, "mean_token_accuracy": 0.8029013514518738, "num_tokens": 214455739.0, "step": 20680 }, { "entropy": 0.6698555827140809, "epoch": 0.16552, "grad_norm": 2.5704286098480225, "learning_rate": 4.1741096438575434e-05, "loss": 0.6592, "mean_token_accuracy": 0.7942327558994293, "num_tokens": 214597829.0, "step": 20690 }, { "entropy": 0.7223972886800766, "epoch": 0.1656, "grad_norm": 5.394332408905029, "learning_rate": 4.173709483793518e-05, "loss": 0.7232, "mean_token_accuracy": 0.8115558862686157, "num_tokens": 214633619.0, "step": 20700 }, { "entropy": 0.6760286092758179, "epoch": 0.16568, "grad_norm": 1.8238211870193481, "learning_rate": 4.173309323729492e-05, "loss": 0.6819, "mean_token_accuracy": 0.7866193175315856, "num_tokens": 214794905.0, "step": 20710 }, { "entropy": 0.6165511697530747, "epoch": 0.16576, "grad_norm": 3.4297943115234375, "learning_rate": 4.1729091636654665e-05, "loss": 0.5985, "mean_token_accuracy": 0.8290297031402588, "num_tokens": 214862578.0, "step": 20720 }, { "entropy": 0.7193939507007598, "epoch": 0.16584, "grad_norm": 2.491299867630005, "learning_rate": 4.172509003601441e-05, "loss": 0.7236, "mean_token_accuracy": 0.7928211867809296, "num_tokens": 214955656.0, "step": 20730 }, { "entropy": 0.7591664910316467, "epoch": 0.16592, "grad_norm": 2.5762670040130615, "learning_rate": 4.172108843537415e-05, "loss": 0.7561, "mean_token_accuracy": 0.7713869094848633, "num_tokens": 215114460.0, "step": 20740 }, { "entropy": 0.7119322061538697, "epoch": 0.166, "grad_norm": 4.996725559234619, "learning_rate": 4.1717086834733896e-05, "loss": 0.7184, "mean_token_accuracy": 0.8059933066368103, "num_tokens": 215163314.0, "step": 20750 }, { "entropy": 0.694132822751999, "epoch": 0.16608, "grad_norm": 2.2237372398376465, "learning_rate": 4.171308523409364e-05, "loss": 0.6924, "mean_token_accuracy": 0.7865226447582245, "num_tokens": 215325130.0, "step": 20760 }, { "entropy": 0.6791060656309128, "epoch": 0.16616, "grad_norm": 3.214550018310547, "learning_rate": 4.170908363345338e-05, "loss": 0.6805, "mean_token_accuracy": 0.809857577085495, "num_tokens": 215399290.0, "step": 20770 }, { "entropy": 0.6984959423542023, "epoch": 0.16624, "grad_norm": 1.4048120975494385, "learning_rate": 4.170508203281313e-05, "loss": 0.6928, "mean_token_accuracy": 0.8030032813549042, "num_tokens": 215493058.0, "step": 20780 }, { "entropy": 0.7529291272163391, "epoch": 0.16632, "grad_norm": 2.249191999435425, "learning_rate": 4.170108043217287e-05, "loss": 0.7483, "mean_token_accuracy": 0.7742200791835785, "num_tokens": 215634697.0, "step": 20790 }, { "entropy": 0.723644083738327, "epoch": 0.1664, "grad_norm": 4.216500282287598, "learning_rate": 4.1697078831532614e-05, "loss": 0.7048, "mean_token_accuracy": 0.8104886293411255, "num_tokens": 215676988.0, "step": 20800 }, { "entropy": 0.7293954253196716, "epoch": 0.16648, "grad_norm": 1.851946234703064, "learning_rate": 4.169307723089236e-05, "loss": 0.736, "mean_token_accuracy": 0.7729884266853333, "num_tokens": 215840066.0, "step": 20810 }, { "entropy": 0.7475016832351684, "epoch": 0.16656, "grad_norm": 3.475613832473755, "learning_rate": 4.16890756302521e-05, "loss": 0.7484, "mean_token_accuracy": 0.7894435048103332, "num_tokens": 215924655.0, "step": 20820 }, { "entropy": 0.7595025897026062, "epoch": 0.16664, "grad_norm": 2.3196232318878174, "learning_rate": 4.1685074029611846e-05, "loss": 0.7414, "mean_token_accuracy": 0.7920259475708008, "num_tokens": 216020473.0, "step": 20830 }, { "entropy": 0.7363230049610138, "epoch": 0.16672, "grad_norm": 2.4930503368377686, "learning_rate": 4.168107242897159e-05, "loss": 0.7393, "mean_token_accuracy": 0.7817542016506195, "num_tokens": 216152946.0, "step": 20840 }, { "entropy": 0.7216690659523011, "epoch": 0.1668, "grad_norm": 4.709262847900391, "learning_rate": 4.167707082833133e-05, "loss": 0.7314, "mean_token_accuracy": 0.8130571067333221, "num_tokens": 216189305.0, "step": 20850 }, { "entropy": 0.7221987307071686, "epoch": 0.16688, "grad_norm": 1.5344043970108032, "learning_rate": 4.1673069227691083e-05, "loss": 0.7137, "mean_token_accuracy": 0.7815889239311218, "num_tokens": 216353145.0, "step": 20860 }, { "entropy": 0.6057876408100128, "epoch": 0.16696, "grad_norm": 2.943037986755371, "learning_rate": 4.166906762705082e-05, "loss": 0.6029, "mean_token_accuracy": 0.8236111760139465, "num_tokens": 216448753.0, "step": 20870 }, { "entropy": 0.739649909734726, "epoch": 0.16704, "grad_norm": 2.0773003101348877, "learning_rate": 4.1665066026410564e-05, "loss": 0.7453, "mean_token_accuracy": 0.7906432271003723, "num_tokens": 216541816.0, "step": 20880 }, { "entropy": 0.7401181042194367, "epoch": 0.16712, "grad_norm": 2.6351282596588135, "learning_rate": 4.166106442577031e-05, "loss": 0.7366, "mean_token_accuracy": 0.7841116905212402, "num_tokens": 216666484.0, "step": 20890 }, { "entropy": 0.5958155065774917, "epoch": 0.1672, "grad_norm": 4.590105056762695, "learning_rate": 4.165706282513006e-05, "loss": 0.6077, "mean_token_accuracy": 0.8367473125457764, "num_tokens": 216698042.0, "step": 20900 }, { "entropy": 0.7234588980674743, "epoch": 0.16728, "grad_norm": 1.701235294342041, "learning_rate": 4.1653061224489795e-05, "loss": 0.7193, "mean_token_accuracy": 0.778810453414917, "num_tokens": 216861882.0, "step": 20910 }, { "entropy": 0.6590817868709564, "epoch": 0.16736, "grad_norm": 3.1751625537872314, "learning_rate": 4.164905962384954e-05, "loss": 0.6498, "mean_token_accuracy": 0.8125561535358429, "num_tokens": 216940500.0, "step": 20920 }, { "entropy": 0.7164944231510162, "epoch": 0.16744, "grad_norm": 2.870607852935791, "learning_rate": 4.164505802320929e-05, "loss": 0.7356, "mean_token_accuracy": 0.7937620103359222, "num_tokens": 217032545.0, "step": 20930 }, { "entropy": 0.7318663120269775, "epoch": 0.16752, "grad_norm": 2.4780869483947754, "learning_rate": 4.164105642256903e-05, "loss": 0.7204, "mean_token_accuracy": 0.7826789975166321, "num_tokens": 217166894.0, "step": 20940 }, { "entropy": 0.706986528635025, "epoch": 0.1676, "grad_norm": 5.098681449890137, "learning_rate": 4.163705482192877e-05, "loss": 0.7091, "mean_token_accuracy": 0.8142569422721863, "num_tokens": 217203193.0, "step": 20950 }, { "entropy": 0.681541359424591, "epoch": 0.16768, "grad_norm": 2.8921151161193848, "learning_rate": 4.1633053221288514e-05, "loss": 0.6899, "mean_token_accuracy": 0.788383024930954, "num_tokens": 217363667.0, "step": 20960 }, { "entropy": 0.709611302614212, "epoch": 0.16776, "grad_norm": 3.53324294090271, "learning_rate": 4.1629051620648264e-05, "loss": 0.6941, "mean_token_accuracy": 0.8073506534099579, "num_tokens": 217436042.0, "step": 20970 }, { "entropy": 0.7585168302059173, "epoch": 0.16784, "grad_norm": 1.5896376371383667, "learning_rate": 4.162505002000801e-05, "loss": 0.7473, "mean_token_accuracy": 0.7851435720920563, "num_tokens": 217530432.0, "step": 20980 }, { "entropy": 0.6298268795013428, "epoch": 0.16792, "grad_norm": 2.439342737197876, "learning_rate": 4.1621048419367745e-05, "loss": 0.6343, "mean_token_accuracy": 0.8031911611557007, "num_tokens": 217647547.0, "step": 20990 }, { "entropy": 0.701993191242218, "epoch": 0.168, "grad_norm": 4.411873817443848, "learning_rate": 4.1617046818727495e-05, "loss": 0.6848, "mean_token_accuracy": 0.8202066361904145, "num_tokens": 217680356.0, "step": 21000 }, { "entropy": 0.6535513579845429, "epoch": 0.16808, "grad_norm": 1.8569079637527466, "learning_rate": 4.161304521808724e-05, "loss": 0.6537, "mean_token_accuracy": 0.792104297876358, "num_tokens": 217844196.0, "step": 21010 }, { "entropy": 0.69967320561409, "epoch": 0.16816, "grad_norm": 2.9792075157165527, "learning_rate": 4.160904361744698e-05, "loss": 0.7009, "mean_token_accuracy": 0.7950195848941803, "num_tokens": 217943348.0, "step": 21020 }, { "entropy": 0.7005751252174377, "epoch": 0.16824, "grad_norm": 1.814939260482788, "learning_rate": 4.160504201680672e-05, "loss": 0.7137, "mean_token_accuracy": 0.7955400943756104, "num_tokens": 218036879.0, "step": 21030 }, { "entropy": 0.7304234087467194, "epoch": 0.16832, "grad_norm": 2.7358155250549316, "learning_rate": 4.160104041616647e-05, "loss": 0.7258, "mean_token_accuracy": 0.7779077112674713, "num_tokens": 218179962.0, "step": 21040 }, { "entropy": 0.6695861846208573, "epoch": 0.1684, "grad_norm": 5.344966411590576, "learning_rate": 4.1597038815526214e-05, "loss": 0.6524, "mean_token_accuracy": 0.8273319721221923, "num_tokens": 218223815.0, "step": 21050 }, { "entropy": 0.6858462393283844, "epoch": 0.16848, "grad_norm": 1.640073299407959, "learning_rate": 4.159303721488596e-05, "loss": 0.6875, "mean_token_accuracy": 0.7849626779556275, "num_tokens": 218386765.0, "step": 21060 }, { "entropy": 0.7344328910112381, "epoch": 0.16856, "grad_norm": 3.2082812786102295, "learning_rate": 4.15890356142457e-05, "loss": 0.7444, "mean_token_accuracy": 0.7953724980354309, "num_tokens": 218461361.0, "step": 21070 }, { "entropy": 0.7534828007221221, "epoch": 0.16864, "grad_norm": 2.696812391281128, "learning_rate": 4.1585034013605445e-05, "loss": 0.743, "mean_token_accuracy": 0.7960164070129394, "num_tokens": 218553100.0, "step": 21080 }, { "entropy": 0.7263747215270996, "epoch": 0.16872, "grad_norm": 2.109285354614258, "learning_rate": 4.158103241296519e-05, "loss": 0.7268, "mean_token_accuracy": 0.7816718101501465, "num_tokens": 218694306.0, "step": 21090 }, { "entropy": 0.6756551116704941, "epoch": 0.1688, "grad_norm": 4.709641933441162, "learning_rate": 4.157703081232493e-05, "loss": 0.6655, "mean_token_accuracy": 0.8181797683238983, "num_tokens": 218741262.0, "step": 21100 }, { "entropy": 0.6272196769714355, "epoch": 0.16888, "grad_norm": 1.4740694761276245, "learning_rate": 4.1573029211684676e-05, "loss": 0.6325, "mean_token_accuracy": 0.7981741666793823, "num_tokens": 218905102.0, "step": 21110 }, { "entropy": 0.6737342864274979, "epoch": 0.16896, "grad_norm": 2.9450523853302, "learning_rate": 4.156902761104442e-05, "loss": 0.6652, "mean_token_accuracy": 0.8060453414916993, "num_tokens": 218999410.0, "step": 21120 }, { "entropy": 0.7241481184959412, "epoch": 0.16904, "grad_norm": 2.009777069091797, "learning_rate": 4.1565026010404164e-05, "loss": 0.7065, "mean_token_accuracy": 0.7983157396316528, "num_tokens": 219093990.0, "step": 21130 }, { "entropy": 0.6811316609382629, "epoch": 0.16912, "grad_norm": 2.0118229389190674, "learning_rate": 4.156102440976391e-05, "loss": 0.689, "mean_token_accuracy": 0.7860229790210724, "num_tokens": 219242883.0, "step": 21140 }, { "entropy": 0.6192075431346893, "epoch": 0.1692, "grad_norm": 5.603626251220703, "learning_rate": 4.155702280912365e-05, "loss": 0.6245, "mean_token_accuracy": 0.8285487949848175, "num_tokens": 219288786.0, "step": 21150 }, { "entropy": 0.635514497756958, "epoch": 0.16928, "grad_norm": 1.691166877746582, "learning_rate": 4.1553021208483395e-05, "loss": 0.6294, "mean_token_accuracy": 0.8002992093563079, "num_tokens": 219452626.0, "step": 21160 }, { "entropy": 0.7143749237060547, "epoch": 0.16936, "grad_norm": 3.2354366779327393, "learning_rate": 4.154901960784314e-05, "loss": 0.704, "mean_token_accuracy": 0.8030412673950196, "num_tokens": 219529890.0, "step": 21170 }, { "entropy": 0.7082731425762177, "epoch": 0.16944, "grad_norm": 1.5900031328201294, "learning_rate": 4.154501800720288e-05, "loss": 0.7168, "mean_token_accuracy": 0.7947343409061431, "num_tokens": 219622868.0, "step": 21180 }, { "entropy": 0.6861757516860962, "epoch": 0.16952, "grad_norm": 2.4700558185577393, "learning_rate": 4.1541016406562626e-05, "loss": 0.6741, "mean_token_accuracy": 0.7945350587368012, "num_tokens": 219758368.0, "step": 21190 }, { "entropy": 0.7474832028150559, "epoch": 0.1696, "grad_norm": 7.566014289855957, "learning_rate": 4.153701480592237e-05, "loss": 0.7583, "mean_token_accuracy": 0.8054009318351746, "num_tokens": 219794812.0, "step": 21200 }, { "entropy": 0.6462403237819672, "epoch": 0.16968, "grad_norm": 1.7897974252700806, "learning_rate": 4.153301320528212e-05, "loss": 0.6483, "mean_token_accuracy": 0.7949088513851166, "num_tokens": 219958072.0, "step": 21210 }, { "entropy": 0.6970337122678757, "epoch": 0.16976, "grad_norm": 3.7530405521392822, "learning_rate": 4.152901160464186e-05, "loss": 0.6872, "mean_token_accuracy": 0.809572947025299, "num_tokens": 220030260.0, "step": 21220 }, { "entropy": 0.7753338932991027, "epoch": 0.16984, "grad_norm": 1.7814801931381226, "learning_rate": 4.15250100040016e-05, "loss": 0.7783, "mean_token_accuracy": 0.7811521649360657, "num_tokens": 220122557.0, "step": 21230 }, { "entropy": 0.7824186503887176, "epoch": 0.16992, "grad_norm": 3.7451610565185547, "learning_rate": 4.1521008403361345e-05, "loss": 0.7652, "mean_token_accuracy": 0.7703268468379975, "num_tokens": 220258423.0, "step": 21240 }, { "entropy": 0.6418113231658935, "epoch": 0.17, "grad_norm": 4.156778335571289, "learning_rate": 4.1517006802721095e-05, "loss": 0.638, "mean_token_accuracy": 0.8292930543422699, "num_tokens": 220298275.0, "step": 21250 }, { "entropy": 0.6947573244571685, "epoch": 0.17008, "grad_norm": 1.659529447555542, "learning_rate": 4.151300520208083e-05, "loss": 0.6932, "mean_token_accuracy": 0.7852222859859467, "num_tokens": 220462115.0, "step": 21260 }, { "entropy": 0.7113915592432022, "epoch": 0.17016, "grad_norm": 3.685838460922241, "learning_rate": 4.1509003601440576e-05, "loss": 0.7018, "mean_token_accuracy": 0.7973008096218109, "num_tokens": 220554533.0, "step": 21270 }, { "entropy": 0.7282500952482224, "epoch": 0.17024, "grad_norm": 1.7255548238754272, "learning_rate": 4.1505002000800326e-05, "loss": 0.7475, "mean_token_accuracy": 0.7901187062263488, "num_tokens": 220649451.0, "step": 21280 }, { "entropy": 0.6439059615135193, "epoch": 0.17032, "grad_norm": 2.5555262565612793, "learning_rate": 4.150100040016007e-05, "loss": 0.6416, "mean_token_accuracy": 0.800229150056839, "num_tokens": 220800580.0, "step": 21290 }, { "entropy": 0.654791846871376, "epoch": 0.1704, "grad_norm": 4.754857540130615, "learning_rate": 4.149699879951981e-05, "loss": 0.6619, "mean_token_accuracy": 0.8232996761798859, "num_tokens": 220844699.0, "step": 21300 }, { "entropy": 0.7413911104202271, "epoch": 0.17048, "grad_norm": 1.6597788333892822, "learning_rate": 4.149299719887955e-05, "loss": 0.7407, "mean_token_accuracy": 0.7777418315410614, "num_tokens": 221008539.0, "step": 21310 }, { "entropy": 0.6661987364292145, "epoch": 0.17056, "grad_norm": 3.097594738006592, "learning_rate": 4.14889955982393e-05, "loss": 0.6632, "mean_token_accuracy": 0.8034457504749298, "num_tokens": 221105340.0, "step": 21320 }, { "entropy": 0.7365366220474243, "epoch": 0.17064, "grad_norm": 2.8781237602233887, "learning_rate": 4.1484993997599045e-05, "loss": 0.7302, "mean_token_accuracy": 0.7913193285465241, "num_tokens": 221200379.0, "step": 21330 }, { "entropy": 0.7273813903331756, "epoch": 0.17072, "grad_norm": 3.690647840499878, "learning_rate": 4.148099239695878e-05, "loss": 0.7259, "mean_token_accuracy": 0.7849785506725311, "num_tokens": 221338012.0, "step": 21340 }, { "entropy": 0.6638818740844726, "epoch": 0.1708, "grad_norm": 5.475861549377441, "learning_rate": 4.1476990796318525e-05, "loss": 0.6557, "mean_token_accuracy": 0.8196492969989777, "num_tokens": 221379523.0, "step": 21350 }, { "entropy": 0.6827005445957184, "epoch": 0.17088, "grad_norm": 1.7157903909683228, "learning_rate": 4.1472989195678276e-05, "loss": 0.6834, "mean_token_accuracy": 0.7894442737102508, "num_tokens": 221542749.0, "step": 21360 }, { "entropy": 0.6929374396800995, "epoch": 0.17096, "grad_norm": 3.1529366970062256, "learning_rate": 4.146898759503802e-05, "loss": 0.6822, "mean_token_accuracy": 0.8053780198097229, "num_tokens": 221615571.0, "step": 21370 }, { "entropy": 0.6543614447116852, "epoch": 0.17104, "grad_norm": 1.807204246520996, "learning_rate": 4.1464985994397757e-05, "loss": 0.6576, "mean_token_accuracy": 0.8084519743919373, "num_tokens": 221709148.0, "step": 21380 }, { "entropy": 0.7372536838054657, "epoch": 0.17112, "grad_norm": 2.807587146759033, "learning_rate": 4.146098439375751e-05, "loss": 0.739, "mean_token_accuracy": 0.7750931501388549, "num_tokens": 221852854.0, "step": 21390 }, { "entropy": 0.7384696185588837, "epoch": 0.1712, "grad_norm": 4.914677619934082, "learning_rate": 4.145698279311725e-05, "loss": 0.7318, "mean_token_accuracy": 0.8069841206073761, "num_tokens": 221892256.0, "step": 21400 }, { "entropy": 0.6697758615016938, "epoch": 0.17128, "grad_norm": 1.8103251457214355, "learning_rate": 4.1452981192476994e-05, "loss": 0.6654, "mean_token_accuracy": 0.7924462735652924, "num_tokens": 222056096.0, "step": 21410 }, { "entropy": 0.6353877902030944, "epoch": 0.17136, "grad_norm": 3.4193317890167236, "learning_rate": 4.144897959183673e-05, "loss": 0.6338, "mean_token_accuracy": 0.8155735194683075, "num_tokens": 222143006.0, "step": 21420 }, { "entropy": 0.7604339420795441, "epoch": 0.17144, "grad_norm": 2.179713010787964, "learning_rate": 4.144497799119648e-05, "loss": 0.7548, "mean_token_accuracy": 0.7857522189617157, "num_tokens": 222237368.0, "step": 21430 }, { "entropy": 0.7102319538593292, "epoch": 0.17152, "grad_norm": 1.9851493835449219, "learning_rate": 4.1440976390556226e-05, "loss": 0.7049, "mean_token_accuracy": 0.7821685612201691, "num_tokens": 222386754.0, "step": 21440 }, { "entropy": 0.6266790330410004, "epoch": 0.1716, "grad_norm": 4.366530418395996, "learning_rate": 4.143697478991597e-05, "loss": 0.6256, "mean_token_accuracy": 0.8252945065498352, "num_tokens": 222430315.0, "step": 21450 }, { "entropy": 0.6584851026535035, "epoch": 0.17168, "grad_norm": 2.2755777835845947, "learning_rate": 4.143297318927571e-05, "loss": 0.6643, "mean_token_accuracy": 0.7913865089416504, "num_tokens": 222594104.0, "step": 21460 }, { "entropy": 0.6793955653905869, "epoch": 0.17176, "grad_norm": 3.3687286376953125, "learning_rate": 4.142897158863546e-05, "loss": 0.6688, "mean_token_accuracy": 0.811349892616272, "num_tokens": 222672782.0, "step": 21470 }, { "entropy": 0.6851794183254242, "epoch": 0.17184, "grad_norm": 1.3552196025848389, "learning_rate": 4.14249699879952e-05, "loss": 0.6746, "mean_token_accuracy": 0.8051718473434448, "num_tokens": 222766217.0, "step": 21480 }, { "entropy": 0.6693320333957672, "epoch": 0.17192, "grad_norm": 2.2155375480651855, "learning_rate": 4.1420968387354944e-05, "loss": 0.6789, "mean_token_accuracy": 0.7931659400463105, "num_tokens": 222904224.0, "step": 21490 }, { "entropy": 0.6042717874050141, "epoch": 0.172, "grad_norm": 3.5120654106140137, "learning_rate": 4.141696678671469e-05, "loss": 0.5967, "mean_token_accuracy": 0.8285130202770233, "num_tokens": 222946548.0, "step": 21500 }, { "entropy": 0.7152993202209472, "epoch": 0.17208, "grad_norm": 1.4529552459716797, "learning_rate": 4.141296518607443e-05, "loss": 0.7148, "mean_token_accuracy": 0.7780837953090668, "num_tokens": 223110388.0, "step": 21510 }, { "entropy": 0.6539487659931182, "epoch": 0.17216, "grad_norm": 5.484604358673096, "learning_rate": 4.1408963585434175e-05, "loss": 0.6507, "mean_token_accuracy": 0.8087252914905548, "num_tokens": 223196873.0, "step": 21520 }, { "entropy": 0.769355320930481, "epoch": 0.17224, "grad_norm": 1.795302152633667, "learning_rate": 4.140496198479392e-05, "loss": 0.7677, "mean_token_accuracy": 0.7826105833053589, "num_tokens": 223291293.0, "step": 21530 }, { "entropy": 0.650690907239914, "epoch": 0.17232, "grad_norm": 2.3638393878936768, "learning_rate": 4.140096038415366e-05, "loss": 0.6412, "mean_token_accuracy": 0.8012589573860168, "num_tokens": 223421164.0, "step": 21540 }, { "entropy": 0.7311586320400238, "epoch": 0.1724, "grad_norm": 4.546844959259033, "learning_rate": 4.1396958783513406e-05, "loss": 0.7377, "mean_token_accuracy": 0.8102015435695649, "num_tokens": 223458831.0, "step": 21550 }, { "entropy": 0.6865057408809662, "epoch": 0.17248, "grad_norm": 1.5157880783081055, "learning_rate": 4.139295718287315e-05, "loss": 0.6839, "mean_token_accuracy": 0.7883587837219238, "num_tokens": 223622562.0, "step": 21560 }, { "entropy": 0.6674075722694397, "epoch": 0.17256, "grad_norm": 4.5431976318359375, "learning_rate": 4.1388955582232894e-05, "loss": 0.6572, "mean_token_accuracy": 0.8098717272281647, "num_tokens": 223705385.0, "step": 21570 }, { "entropy": 0.7114940524101258, "epoch": 0.17264, "grad_norm": 1.903581976890564, "learning_rate": 4.138495398159264e-05, "loss": 0.7124, "mean_token_accuracy": 0.7987206101417541, "num_tokens": 223798961.0, "step": 21580 }, { "entropy": 0.7411758542060852, "epoch": 0.17272, "grad_norm": 2.6188759803771973, "learning_rate": 4.138095238095238e-05, "loss": 0.7369, "mean_token_accuracy": 0.776815265417099, "num_tokens": 223942626.0, "step": 21590 }, { "entropy": 0.6908529192209244, "epoch": 0.1728, "grad_norm": 4.727738857269287, "learning_rate": 4.137695078031213e-05, "loss": 0.6874, "mean_token_accuracy": 0.8086635887622833, "num_tokens": 223986498.0, "step": 21600 }, { "entropy": 0.7244656533002853, "epoch": 0.17288, "grad_norm": 2.079955816268921, "learning_rate": 4.137294917967187e-05, "loss": 0.7193, "mean_token_accuracy": 0.7784067153930664, "num_tokens": 224148119.0, "step": 21610 }, { "entropy": 0.690695172548294, "epoch": 0.17296, "grad_norm": 3.1834285259246826, "learning_rate": 4.136894757903161e-05, "loss": 0.675, "mean_token_accuracy": 0.8042260229587554, "num_tokens": 224231520.0, "step": 21620 }, { "entropy": 0.6468979179859161, "epoch": 0.17304, "grad_norm": 2.18747615814209, "learning_rate": 4.1364945978391356e-05, "loss": 0.6727, "mean_token_accuracy": 0.8056341648101807, "num_tokens": 224327049.0, "step": 21630 }, { "entropy": 0.6253764182329178, "epoch": 0.17312, "grad_norm": 2.5169992446899414, "learning_rate": 4.136094437775111e-05, "loss": 0.6215, "mean_token_accuracy": 0.8055383384227752, "num_tokens": 224460756.0, "step": 21640 }, { "entropy": 0.655018436908722, "epoch": 0.1732, "grad_norm": 5.912696361541748, "learning_rate": 4.1356942777110844e-05, "loss": 0.6563, "mean_token_accuracy": 0.8244928896427155, "num_tokens": 224499408.0, "step": 21650 }, { "entropy": 0.64217289686203, "epoch": 0.17328, "grad_norm": 1.3314592838287354, "learning_rate": 4.135294117647059e-05, "loss": 0.6458, "mean_token_accuracy": 0.794736635684967, "num_tokens": 224663149.0, "step": 21660 }, { "entropy": 0.8056072235107422, "epoch": 0.17336, "grad_norm": 3.21036434173584, "learning_rate": 4.134893957583034e-05, "loss": 0.7941, "mean_token_accuracy": 0.78132843375206, "num_tokens": 224734548.0, "step": 21670 }, { "entropy": 0.7330606162548066, "epoch": 0.17344, "grad_norm": 2.6954753398895264, "learning_rate": 4.134493797519008e-05, "loss": 0.7379, "mean_token_accuracy": 0.7921984434127808, "num_tokens": 224826942.0, "step": 21680 }, { "entropy": 0.6991007030010223, "epoch": 0.17352, "grad_norm": 2.4529788494110107, "learning_rate": 4.134093637454982e-05, "loss": 0.6981, "mean_token_accuracy": 0.7915015339851379, "num_tokens": 224960084.0, "step": 21690 }, { "entropy": 0.7588575005531311, "epoch": 0.1736, "grad_norm": 4.932362079620361, "learning_rate": 4.133693477390956e-05, "loss": 0.7464, "mean_token_accuracy": 0.7993552982807159, "num_tokens": 224996531.0, "step": 21700 }, { "entropy": 0.6724180519580841, "epoch": 0.17368, "grad_norm": 1.5211416482925415, "learning_rate": 4.133293317326931e-05, "loss": 0.6708, "mean_token_accuracy": 0.786938202381134, "num_tokens": 225160371.0, "step": 21710 }, { "entropy": 0.6598450124263764, "epoch": 0.17376, "grad_norm": 3.0549256801605225, "learning_rate": 4.1328931572629056e-05, "loss": 0.6659, "mean_token_accuracy": 0.8083136856555939, "num_tokens": 225245072.0, "step": 21720 }, { "entropy": 0.702957284450531, "epoch": 0.17384, "grad_norm": 1.9079642295837402, "learning_rate": 4.132492997198879e-05, "loss": 0.7028, "mean_token_accuracy": 0.7942018687725068, "num_tokens": 225340260.0, "step": 21730 }, { "entropy": 0.7461592495441437, "epoch": 0.17392, "grad_norm": 2.3257784843444824, "learning_rate": 4.1320928371348544e-05, "loss": 0.7395, "mean_token_accuracy": 0.7799888849258423, "num_tokens": 225470890.0, "step": 21740 }, { "entropy": 0.6680344492197037, "epoch": 0.174, "grad_norm": 5.442326068878174, "learning_rate": 4.131692677070829e-05, "loss": 0.6764, "mean_token_accuracy": 0.8149429500102997, "num_tokens": 225506140.0, "step": 21750 }, { "entropy": 0.7274433970451355, "epoch": 0.17408, "grad_norm": 1.6651952266693115, "learning_rate": 4.131292517006803e-05, "loss": 0.7267, "mean_token_accuracy": 0.7781998038291931, "num_tokens": 225669980.0, "step": 21760 }, { "entropy": 0.7327234596014023, "epoch": 0.17416, "grad_norm": 3.8702571392059326, "learning_rate": 4.130892356942777e-05, "loss": 0.7177, "mean_token_accuracy": 0.7941850185394287, "num_tokens": 225769730.0, "step": 21770 }, { "entropy": 0.6779056847095489, "epoch": 0.17424, "grad_norm": 2.271864175796509, "learning_rate": 4.130492196878752e-05, "loss": 0.6874, "mean_token_accuracy": 0.8052448511123658, "num_tokens": 225862105.0, "step": 21780 }, { "entropy": 0.7063582241535187, "epoch": 0.17432, "grad_norm": 3.301534652709961, "learning_rate": 4.130092036814726e-05, "loss": 0.7007, "mean_token_accuracy": 0.7860628008842468, "num_tokens": 226008238.0, "step": 21790 }, { "entropy": 0.7451909840106964, "epoch": 0.1744, "grad_norm": 4.399627208709717, "learning_rate": 4.1296918767507006e-05, "loss": 0.7328, "mean_token_accuracy": 0.8029196679592132, "num_tokens": 226048048.0, "step": 21800 }, { "entropy": 0.698210334777832, "epoch": 0.17448, "grad_norm": 1.676142692565918, "learning_rate": 4.129291716686675e-05, "loss": 0.7053, "mean_token_accuracy": 0.7808011770248413, "num_tokens": 226211888.0, "step": 21810 }, { "entropy": 0.6805242955684662, "epoch": 0.17456, "grad_norm": 3.1359000205993652, "learning_rate": 4.1288915566226494e-05, "loss": 0.6691, "mean_token_accuracy": 0.807530689239502, "num_tokens": 226300054.0, "step": 21820 }, { "entropy": 0.7943453133106232, "epoch": 0.17464, "grad_norm": 1.8557543754577637, "learning_rate": 4.128491396558624e-05, "loss": 0.803, "mean_token_accuracy": 0.7817293882369996, "num_tokens": 226393782.0, "step": 21830 }, { "entropy": 0.6915009379386902, "epoch": 0.17472, "grad_norm": 3.152682065963745, "learning_rate": 4.128091236494598e-05, "loss": 0.6935, "mean_token_accuracy": 0.7922289669513702, "num_tokens": 226517534.0, "step": 21840 }, { "entropy": 0.7170435190200806, "epoch": 0.1748, "grad_norm": 4.965356826782227, "learning_rate": 4.1276910764305725e-05, "loss": 0.7189, "mean_token_accuracy": 0.8090378522872925, "num_tokens": 226551486.0, "step": 21850 }, { "entropy": 0.6726264595985413, "epoch": 0.17488, "grad_norm": 1.691001534461975, "learning_rate": 4.127290916366547e-05, "loss": 0.6749, "mean_token_accuracy": 0.7867000639438629, "num_tokens": 226715326.0, "step": 21860 }, { "entropy": 0.6377951592206955, "epoch": 0.17496, "grad_norm": 4.087121486663818, "learning_rate": 4.126890756302521e-05, "loss": 0.629, "mean_token_accuracy": 0.817139995098114, "num_tokens": 226791785.0, "step": 21870 }, { "entropy": 0.7588944375514984, "epoch": 0.17504, "grad_norm": 2.0880672931671143, "learning_rate": 4.1264905962384956e-05, "loss": 0.7457, "mean_token_accuracy": 0.7890569925308227, "num_tokens": 226884845.0, "step": 21880 }, { "entropy": 0.7346546947956085, "epoch": 0.17512, "grad_norm": 2.192441463470459, "learning_rate": 4.12609043617447e-05, "loss": 0.7387, "mean_token_accuracy": 0.7769695699214936, "num_tokens": 227025084.0, "step": 21890 }, { "entropy": 0.6770976722240448, "epoch": 0.1752, "grad_norm": 6.233524322509766, "learning_rate": 4.125690276110444e-05, "loss": 0.6746, "mean_token_accuracy": 0.8129255771636963, "num_tokens": 227066240.0, "step": 21900 }, { "entropy": 0.6694867372512817, "epoch": 0.17528, "grad_norm": 1.9977047443389893, "learning_rate": 4.125290116046419e-05, "loss": 0.6687, "mean_token_accuracy": 0.7888452649116516, "num_tokens": 227229393.0, "step": 21910 }, { "entropy": 0.7677497506141663, "epoch": 0.17536, "grad_norm": 3.070770740509033, "learning_rate": 4.124889955982393e-05, "loss": 0.7567, "mean_token_accuracy": 0.7848235309123993, "num_tokens": 227312276.0, "step": 21920 }, { "entropy": 0.7043572247028351, "epoch": 0.17544, "grad_norm": 1.9565234184265137, "learning_rate": 4.1244897959183674e-05, "loss": 0.7234, "mean_token_accuracy": 0.7972940742969513, "num_tokens": 227407957.0, "step": 21930 }, { "entropy": 0.694657689332962, "epoch": 0.17552, "grad_norm": 2.6839046478271484, "learning_rate": 4.124089635854342e-05, "loss": 0.6847, "mean_token_accuracy": 0.7948987185955048, "num_tokens": 227539732.0, "step": 21940 }, { "entropy": 0.6592202544212341, "epoch": 0.1756, "grad_norm": 3.9668071269989014, "learning_rate": 4.123689475790316e-05, "loss": 0.647, "mean_token_accuracy": 0.8262664556503296, "num_tokens": 227578040.0, "step": 21950 }, { "entropy": 0.608012217283249, "epoch": 0.17568, "grad_norm": 1.6269338130950928, "learning_rate": 4.1232893157262906e-05, "loss": 0.609, "mean_token_accuracy": 0.8049645900726319, "num_tokens": 227741880.0, "step": 21960 }, { "entropy": 0.675517201423645, "epoch": 0.17576, "grad_norm": 2.9282007217407227, "learning_rate": 4.122889155662265e-05, "loss": 0.672, "mean_token_accuracy": 0.8034788429737091, "num_tokens": 227835728.0, "step": 21970 }, { "entropy": 0.7773352026939392, "epoch": 0.17584, "grad_norm": 1.4971996545791626, "learning_rate": 4.122488995598239e-05, "loss": 0.7729, "mean_token_accuracy": 0.7827446281909942, "num_tokens": 227930478.0, "step": 21980 }, { "entropy": 0.7316895365715027, "epoch": 0.17592, "grad_norm": 3.090388298034668, "learning_rate": 4.1220888355342143e-05, "loss": 0.7409, "mean_token_accuracy": 0.7827124238014221, "num_tokens": 228058942.0, "step": 21990 }, { "entropy": 0.6078915536403656, "epoch": 0.176, "grad_norm": 4.039350509643555, "learning_rate": 4.121688675470188e-05, "loss": 0.6158, "mean_token_accuracy": 0.8312233567237854, "num_tokens": 228100596.0, "step": 22000 }, { "entropy": 0.6629402130842209, "epoch": 0.17608, "grad_norm": 1.9874645471572876, "learning_rate": 4.1212885154061624e-05, "loss": 0.6619, "mean_token_accuracy": 0.7902662515640259, "num_tokens": 228264436.0, "step": 22010 }, { "entropy": 0.6495255798101425, "epoch": 0.17616, "grad_norm": 4.234248638153076, "learning_rate": 4.120888355342137e-05, "loss": 0.6411, "mean_token_accuracy": 0.8121263325214386, "num_tokens": 228357594.0, "step": 22020 }, { "entropy": 0.6677537381649017, "epoch": 0.17624, "grad_norm": 1.672018051147461, "learning_rate": 4.120488195278112e-05, "loss": 0.6788, "mean_token_accuracy": 0.8100315034389496, "num_tokens": 228451870.0, "step": 22030 }, { "entropy": 0.6997492253780365, "epoch": 0.17632, "grad_norm": 2.1376779079437256, "learning_rate": 4.1200880352140855e-05, "loss": 0.6912, "mean_token_accuracy": 0.7861933588981629, "num_tokens": 228590844.0, "step": 22040 }, { "entropy": 0.7040640652179718, "epoch": 0.1764, "grad_norm": 5.097400188446045, "learning_rate": 4.11968787515006e-05, "loss": 0.7002, "mean_token_accuracy": 0.8089961886405945, "num_tokens": 228630362.0, "step": 22050 }, { "entropy": 0.6370095133781433, "epoch": 0.17648, "grad_norm": 1.7693580389022827, "learning_rate": 4.119287715086035e-05, "loss": 0.6419, "mean_token_accuracy": 0.7965759813785553, "num_tokens": 228792529.0, "step": 22060 }, { "entropy": 0.6902148932218551, "epoch": 0.17656, "grad_norm": 3.135503053665161, "learning_rate": 4.118887555022009e-05, "loss": 0.6794, "mean_token_accuracy": 0.8050322890281677, "num_tokens": 228861998.0, "step": 22070 }, { "entropy": 0.694248378276825, "epoch": 0.17664, "grad_norm": 2.004669666290283, "learning_rate": 4.118487394957983e-05, "loss": 0.7042, "mean_token_accuracy": 0.7993439912796021, "num_tokens": 228954261.0, "step": 22080 }, { "entropy": 0.7226885735988617, "epoch": 0.17672, "grad_norm": 2.44466495513916, "learning_rate": 4.1180872348939574e-05, "loss": 0.7151, "mean_token_accuracy": 0.7818299293518066, "num_tokens": 229106401.0, "step": 22090 }, { "entropy": 0.7108057677745819, "epoch": 0.1768, "grad_norm": 4.785640239715576, "learning_rate": 4.1176870748299324e-05, "loss": 0.7125, "mean_token_accuracy": 0.8083360195159912, "num_tokens": 229145523.0, "step": 22100 }, { "entropy": 0.6842201709747314, "epoch": 0.17688, "grad_norm": 1.5659955739974976, "learning_rate": 4.117286914765907e-05, "loss": 0.6858, "mean_token_accuracy": 0.7850405752658844, "num_tokens": 229309354.0, "step": 22110 }, { "entropy": 0.72744100689888, "epoch": 0.17696, "grad_norm": 3.0461881160736084, "learning_rate": 4.1168867547018805e-05, "loss": 0.7126, "mean_token_accuracy": 0.7949418723583221, "num_tokens": 229403202.0, "step": 22120 }, { "entropy": 0.7135652363300323, "epoch": 0.17704, "grad_norm": 2.2358808517456055, "learning_rate": 4.1164865946378555e-05, "loss": 0.7211, "mean_token_accuracy": 0.7948421895503998, "num_tokens": 229496831.0, "step": 22130 }, { "entropy": 0.7340127408504487, "epoch": 0.17712, "grad_norm": 3.503202438354492, "learning_rate": 4.11608643457383e-05, "loss": 0.7206, "mean_token_accuracy": 0.7873267292976379, "num_tokens": 229626636.0, "step": 22140 }, { "entropy": 0.6837863326072693, "epoch": 0.1772, "grad_norm": 4.422046661376953, "learning_rate": 4.115686274509804e-05, "loss": 0.6847, "mean_token_accuracy": 0.8178483068943023, "num_tokens": 229663265.0, "step": 22150 }, { "entropy": 0.7225450873374939, "epoch": 0.17728, "grad_norm": 2.5962116718292236, "learning_rate": 4.115286114445778e-05, "loss": 0.7281, "mean_token_accuracy": 0.7768441617488862, "num_tokens": 229827105.0, "step": 22160 }, { "entropy": 0.6686119079589844, "epoch": 0.17736, "grad_norm": 3.1742024421691895, "learning_rate": 4.114885954381753e-05, "loss": 0.6633, "mean_token_accuracy": 0.8114422619342804, "num_tokens": 229904192.0, "step": 22170 }, { "entropy": 0.765252023935318, "epoch": 0.17744, "grad_norm": 2.871424674987793, "learning_rate": 4.1144857943177274e-05, "loss": 0.76, "mean_token_accuracy": 0.7896966755390167, "num_tokens": 229997229.0, "step": 22180 }, { "entropy": 0.6879453480243682, "epoch": 0.17752, "grad_norm": 2.161999225616455, "learning_rate": 4.114085634253702e-05, "loss": 0.6942, "mean_token_accuracy": 0.7908452272415161, "num_tokens": 230124423.0, "step": 22190 }, { "entropy": 0.8387950718402862, "epoch": 0.1776, "grad_norm": 5.975296497344971, "learning_rate": 4.113685474189676e-05, "loss": 0.8185, "mean_token_accuracy": 0.7907259345054627, "num_tokens": 230158666.0, "step": 22200 }, { "entropy": 0.6599321782588958, "epoch": 0.17768, "grad_norm": 1.5679117441177368, "learning_rate": 4.1132853141256505e-05, "loss": 0.6591, "mean_token_accuracy": 0.7907659888267518, "num_tokens": 230321740.0, "step": 22210 }, { "entropy": 0.6111919641494751, "epoch": 0.17776, "grad_norm": 3.7011795043945312, "learning_rate": 4.112885154061625e-05, "loss": 0.6033, "mean_token_accuracy": 0.8221196532249451, "num_tokens": 230400136.0, "step": 22220 }, { "entropy": 0.7493340313434601, "epoch": 0.17784, "grad_norm": 1.8225854635238647, "learning_rate": 4.112484993997599e-05, "loss": 0.755, "mean_token_accuracy": 0.7909559965133667, "num_tokens": 230492264.0, "step": 22230 }, { "entropy": 0.6682537019252777, "epoch": 0.17792, "grad_norm": 2.8044626712799072, "learning_rate": 4.1120848339335736e-05, "loss": 0.6645, "mean_token_accuracy": 0.7960254430770874, "num_tokens": 230624473.0, "step": 22240 }, { "entropy": 0.6912320256233215, "epoch": 0.178, "grad_norm": 6.188565731048584, "learning_rate": 4.111684673869548e-05, "loss": 0.6913, "mean_token_accuracy": 0.8157679378986359, "num_tokens": 230657689.0, "step": 22250 }, { "entropy": 0.6259787142276764, "epoch": 0.17808, "grad_norm": 2.2206966876983643, "learning_rate": 4.1112845138055224e-05, "loss": 0.633, "mean_token_accuracy": 0.8009281992912293, "num_tokens": 230821529.0, "step": 22260 }, { "entropy": 0.6570145130157471, "epoch": 0.17816, "grad_norm": 3.3296079635620117, "learning_rate": 4.110884353741497e-05, "loss": 0.6425, "mean_token_accuracy": 0.8134762704372406, "num_tokens": 230911300.0, "step": 22270 }, { "entropy": 0.7041824340820313, "epoch": 0.17824, "grad_norm": 2.486018419265747, "learning_rate": 4.110484193677471e-05, "loss": 0.7157, "mean_token_accuracy": 0.7978662610054016, "num_tokens": 231006009.0, "step": 22280 }, { "entropy": 0.6922010362148285, "epoch": 0.17832, "grad_norm": 2.9881131649017334, "learning_rate": 4.1100840336134455e-05, "loss": 0.6848, "mean_token_accuracy": 0.791857260465622, "num_tokens": 231147738.0, "step": 22290 }, { "entropy": 0.6858417332172394, "epoch": 0.1784, "grad_norm": 5.224206447601318, "learning_rate": 4.10968387354942e-05, "loss": 0.6871, "mean_token_accuracy": 0.815523874759674, "num_tokens": 231186068.0, "step": 22300 }, { "entropy": 0.6730184614658355, "epoch": 0.17848, "grad_norm": 2.231934070587158, "learning_rate": 4.109283713485394e-05, "loss": 0.6741, "mean_token_accuracy": 0.7884648323059082, "num_tokens": 231349908.0, "step": 22310 }, { "entropy": 0.6776305854320526, "epoch": 0.17856, "grad_norm": 2.5333681106567383, "learning_rate": 4.1088835534213686e-05, "loss": 0.6736, "mean_token_accuracy": 0.8037637770175934, "num_tokens": 231439948.0, "step": 22320 }, { "entropy": 0.6880438089370727, "epoch": 0.17864, "grad_norm": 1.4902644157409668, "learning_rate": 4.108483393357343e-05, "loss": 0.6867, "mean_token_accuracy": 0.8002367973327636, "num_tokens": 231533523.0, "step": 22330 }, { "entropy": 0.6561358273029327, "epoch": 0.17872, "grad_norm": 3.03657603263855, "learning_rate": 4.108083233293318e-05, "loss": 0.6463, "mean_token_accuracy": 0.8030239343643188, "num_tokens": 231660202.0, "step": 22340 }, { "entropy": 0.690906697511673, "epoch": 0.1788, "grad_norm": 4.046293258666992, "learning_rate": 4.107683073229292e-05, "loss": 0.7011, "mean_token_accuracy": 0.8124451756477356, "num_tokens": 231696561.0, "step": 22350 }, { "entropy": 0.6816493749618531, "epoch": 0.17888, "grad_norm": 2.076395273208618, "learning_rate": 4.107282913165266e-05, "loss": 0.6833, "mean_token_accuracy": 0.7864313662052155, "num_tokens": 231860401.0, "step": 22360 }, { "entropy": 0.6977500438690185, "epoch": 0.17896, "grad_norm": 3.7900514602661133, "learning_rate": 4.1068827531012405e-05, "loss": 0.6896, "mean_token_accuracy": 0.8019044697284698, "num_tokens": 231947831.0, "step": 22370 }, { "entropy": 0.7384151577949524, "epoch": 0.17904, "grad_norm": 1.7667738199234009, "learning_rate": 4.1064825930372155e-05, "loss": 0.7461, "mean_token_accuracy": 0.7877771437168122, "num_tokens": 232041219.0, "step": 22380 }, { "entropy": 0.6746794998645782, "epoch": 0.17912, "grad_norm": 2.3440845012664795, "learning_rate": 4.106082432973189e-05, "loss": 0.6639, "mean_token_accuracy": 0.7935473501682282, "num_tokens": 232178645.0, "step": 22390 }, { "entropy": 0.7753546059131622, "epoch": 0.1792, "grad_norm": 5.375569820404053, "learning_rate": 4.1056822729091636e-05, "loss": 0.7594, "mean_token_accuracy": 0.8009793937206269, "num_tokens": 232214369.0, "step": 22400 }, { "entropy": 0.6171093702316284, "epoch": 0.17928, "grad_norm": 1.8311314582824707, "learning_rate": 4.1052821128451386e-05, "loss": 0.6166, "mean_token_accuracy": 0.8033522486686706, "num_tokens": 232377896.0, "step": 22410 }, { "entropy": 0.6777725398540497, "epoch": 0.17936, "grad_norm": 3.3732213973999023, "learning_rate": 4.104881952781113e-05, "loss": 0.6749, "mean_token_accuracy": 0.8084651947021484, "num_tokens": 232451805.0, "step": 22420 }, { "entropy": 0.6575208485126496, "epoch": 0.17944, "grad_norm": 1.6885337829589844, "learning_rate": 4.104481792717087e-05, "loss": 0.6477, "mean_token_accuracy": 0.8115244388580323, "num_tokens": 232543796.0, "step": 22430 }, { "entropy": 0.7022213757038116, "epoch": 0.17952, "grad_norm": 3.9591124057769775, "learning_rate": 4.104081632653061e-05, "loss": 0.7052, "mean_token_accuracy": 0.7868866920471191, "num_tokens": 232667025.0, "step": 22440 }, { "entropy": 0.687937206029892, "epoch": 0.1796, "grad_norm": 4.770838737487793, "learning_rate": 4.103681472589036e-05, "loss": 0.703, "mean_token_accuracy": 0.812822288274765, "num_tokens": 232700653.0, "step": 22450 }, { "entropy": 0.6856361329555511, "epoch": 0.17968, "grad_norm": 2.4500579833984375, "learning_rate": 4.1032813125250105e-05, "loss": 0.6785, "mean_token_accuracy": 0.7875963509082794, "num_tokens": 232863299.0, "step": 22460 }, { "entropy": 0.7079109787940979, "epoch": 0.17976, "grad_norm": 3.2113983631134033, "learning_rate": 4.102881152460984e-05, "loss": 0.7039, "mean_token_accuracy": 0.8038083374500274, "num_tokens": 232941471.0, "step": 22470 }, { "entropy": 0.7412150621414184, "epoch": 0.17984, "grad_norm": 1.755474328994751, "learning_rate": 4.1024809923969585e-05, "loss": 0.74, "mean_token_accuracy": 0.7887933909893036, "num_tokens": 233034802.0, "step": 22480 }, { "entropy": 0.6894140928983689, "epoch": 0.17992, "grad_norm": 2.428290367126465, "learning_rate": 4.1020808323329336e-05, "loss": 0.6744, "mean_token_accuracy": 0.794824755191803, "num_tokens": 233166984.0, "step": 22490 }, { "entropy": 0.7014735162258148, "epoch": 0.18, "grad_norm": 4.872162818908691, "learning_rate": 4.101680672268908e-05, "loss": 0.7203, "mean_token_accuracy": 0.8086410641670227, "num_tokens": 233205382.0, "step": 22500 }, { "entropy": 0.6623899340629578, "epoch": 0.18008, "grad_norm": 1.7476637363433838, "learning_rate": 4.1012805122048817e-05, "loss": 0.6642, "mean_token_accuracy": 0.7886174976825714, "num_tokens": 233369222.0, "step": 22510 }, { "entropy": 0.6783383369445801, "epoch": 0.18016, "grad_norm": 3.1630024909973145, "learning_rate": 4.100880352140857e-05, "loss": 0.6648, "mean_token_accuracy": 0.8076937198638916, "num_tokens": 233465657.0, "step": 22520 }, { "entropy": 0.7393767058849334, "epoch": 0.18024, "grad_norm": 1.9686335325241089, "learning_rate": 4.100480192076831e-05, "loss": 0.7465, "mean_token_accuracy": 0.791801804304123, "num_tokens": 233559588.0, "step": 22530 }, { "entropy": 0.7019142806529999, "epoch": 0.18032, "grad_norm": 1.9074931144714355, "learning_rate": 4.1000800320128054e-05, "loss": 0.693, "mean_token_accuracy": 0.783707857131958, "num_tokens": 233705547.0, "step": 22540 }, { "entropy": 0.7738478004932403, "epoch": 0.1804, "grad_norm": 4.753734111785889, "learning_rate": 4.099679871948779e-05, "loss": 0.771, "mean_token_accuracy": 0.7955702245235443, "num_tokens": 233747842.0, "step": 22550 }, { "entropy": 0.685502964258194, "epoch": 0.18048, "grad_norm": 1.7794511318206787, "learning_rate": 4.099279711884754e-05, "loss": 0.686, "mean_token_accuracy": 0.7882511019706726, "num_tokens": 233911682.0, "step": 22560 }, { "entropy": 0.738067764043808, "epoch": 0.18056, "grad_norm": 3.4792134761810303, "learning_rate": 4.0988795518207286e-05, "loss": 0.7273, "mean_token_accuracy": 0.7961779654026031, "num_tokens": 233996107.0, "step": 22570 }, { "entropy": 0.7023865878582001, "epoch": 0.18064, "grad_norm": 2.3864283561706543, "learning_rate": 4.098479391756703e-05, "loss": 0.7103, "mean_token_accuracy": 0.7933776140213012, "num_tokens": 234090172.0, "step": 22580 }, { "entropy": 0.6880164206027984, "epoch": 0.18072, "grad_norm": 2.1327970027923584, "learning_rate": 4.098079231692677e-05, "loss": 0.6737, "mean_token_accuracy": 0.7972638726234436, "num_tokens": 234242886.0, "step": 22590 }, { "entropy": 0.7663327395915985, "epoch": 0.1808, "grad_norm": 5.037988185882568, "learning_rate": 4.097679071628652e-05, "loss": 0.7739, "mean_token_accuracy": 0.797235083580017, "num_tokens": 234283440.0, "step": 22600 }, { "entropy": 0.7087028741836547, "epoch": 0.18088, "grad_norm": 2.4398841857910156, "learning_rate": 4.097278911564626e-05, "loss": 0.7134, "mean_token_accuracy": 0.7815766990184784, "num_tokens": 234447280.0, "step": 22610 }, { "entropy": 0.7770077109336853, "epoch": 0.18096, "grad_norm": 4.043250560760498, "learning_rate": 4.0968787515006004e-05, "loss": 0.7734, "mean_token_accuracy": 0.7827921569347381, "num_tokens": 234549497.0, "step": 22620 }, { "entropy": 0.6755852341651917, "epoch": 0.18104, "grad_norm": 1.7763653993606567, "learning_rate": 4.096478591436575e-05, "loss": 0.6765, "mean_token_accuracy": 0.8006763577461242, "num_tokens": 234643779.0, "step": 22630 }, { "entropy": 0.7072835862636566, "epoch": 0.18112, "grad_norm": 2.2603533267974854, "learning_rate": 4.096078431372549e-05, "loss": 0.6965, "mean_token_accuracy": 0.7887476027011872, "num_tokens": 234795440.0, "step": 22640 }, { "entropy": 0.7025846660137176, "epoch": 0.1812, "grad_norm": 4.490751266479492, "learning_rate": 4.0956782713085235e-05, "loss": 0.701, "mean_token_accuracy": 0.8098995447158813, "num_tokens": 234841941.0, "step": 22650 }, { "entropy": 0.6586232006549835, "epoch": 0.18128, "grad_norm": 1.707593321800232, "learning_rate": 4.095278111244498e-05, "loss": 0.6655, "mean_token_accuracy": 0.7919272184371948, "num_tokens": 235005781.0, "step": 22660 }, { "entropy": 0.6657885879278183, "epoch": 0.18136, "grad_norm": 3.0378170013427734, "learning_rate": 4.094877951180472e-05, "loss": 0.664, "mean_token_accuracy": 0.805607271194458, "num_tokens": 235102333.0, "step": 22670 }, { "entropy": 0.7376222968101501, "epoch": 0.18144, "grad_norm": 1.8540033102035522, "learning_rate": 4.0944777911164466e-05, "loss": 0.7364, "mean_token_accuracy": 0.7935705780982971, "num_tokens": 235194966.0, "step": 22680 }, { "entropy": 0.7082687497138977, "epoch": 0.18152, "grad_norm": 2.553095579147339, "learning_rate": 4.094077631052421e-05, "loss": 0.6938, "mean_token_accuracy": 0.7889690816402435, "num_tokens": 235336021.0, "step": 22690 }, { "entropy": 0.6860388815402985, "epoch": 0.1816, "grad_norm": 5.560471057891846, "learning_rate": 4.0936774709883954e-05, "loss": 0.6868, "mean_token_accuracy": 0.8181432604789733, "num_tokens": 235376116.0, "step": 22700 }, { "entropy": 0.6534160375595093, "epoch": 0.18168, "grad_norm": 1.9580060243606567, "learning_rate": 4.09327731092437e-05, "loss": 0.6563, "mean_token_accuracy": 0.7962738990783691, "num_tokens": 235535923.0, "step": 22710 }, { "entropy": 0.5536795645952225, "epoch": 0.18176, "grad_norm": 2.9083597660064697, "learning_rate": 4.092877150860344e-05, "loss": 0.5392, "mean_token_accuracy": 0.839943391084671, "num_tokens": 235602863.0, "step": 22720 }, { "entropy": 0.7272048771381379, "epoch": 0.18184, "grad_norm": 1.6153689622879028, "learning_rate": 4.092476990796319e-05, "loss": 0.7269, "mean_token_accuracy": 0.7920450091361999, "num_tokens": 235696019.0, "step": 22730 }, { "entropy": 0.7169977486133575, "epoch": 0.18192, "grad_norm": 2.723802328109741, "learning_rate": 4.092076830732293e-05, "loss": 0.711, "mean_token_accuracy": 0.7854685604572296, "num_tokens": 235831280.0, "step": 22740 }, { "entropy": 0.7153978914022445, "epoch": 0.182, "grad_norm": 6.408761024475098, "learning_rate": 4.091676670668267e-05, "loss": 0.7113, "mean_token_accuracy": 0.8090779662132264, "num_tokens": 235873730.0, "step": 22750 }, { "entropy": 0.6206241726875306, "epoch": 0.18208, "grad_norm": 1.595786690711975, "learning_rate": 4.0912765106042416e-05, "loss": 0.6261, "mean_token_accuracy": 0.7984855949878693, "num_tokens": 236037570.0, "step": 22760 }, { "entropy": 0.6616337031126023, "epoch": 0.18216, "grad_norm": 3.1045305728912354, "learning_rate": 4.090876350540217e-05, "loss": 0.6506, "mean_token_accuracy": 0.8123730659484864, "num_tokens": 236120141.0, "step": 22770 }, { "entropy": 0.6576323449611664, "epoch": 0.18224, "grad_norm": 2.386737108230591, "learning_rate": 4.0904761904761904e-05, "loss": 0.6571, "mean_token_accuracy": 0.809656971693039, "num_tokens": 236214965.0, "step": 22780 }, { "entropy": 0.6706186532974243, "epoch": 0.18232, "grad_norm": 2.004483461380005, "learning_rate": 4.090076030412165e-05, "loss": 0.6674, "mean_token_accuracy": 0.7928433775901794, "num_tokens": 236364101.0, "step": 22790 }, { "entropy": 0.6837160617113114, "epoch": 0.1824, "grad_norm": 4.525196075439453, "learning_rate": 4.08967587034814e-05, "loss": 0.6757, "mean_token_accuracy": 0.8154143273830414, "num_tokens": 236408174.0, "step": 22800 }, { "entropy": 0.7127113163471221, "epoch": 0.18248, "grad_norm": 1.685930848121643, "learning_rate": 4.089275710284114e-05, "loss": 0.7205, "mean_token_accuracy": 0.7743588209152221, "num_tokens": 236572014.0, "step": 22810 }, { "entropy": 0.6670223623514175, "epoch": 0.18256, "grad_norm": 2.9866514205932617, "learning_rate": 4.088875550220088e-05, "loss": 0.6453, "mean_token_accuracy": 0.8097565233707428, "num_tokens": 236653783.0, "step": 22820 }, { "entropy": 0.7350206971168518, "epoch": 0.18264, "grad_norm": 2.1692159175872803, "learning_rate": 4.088475390156062e-05, "loss": 0.7369, "mean_token_accuracy": 0.7896705985069274, "num_tokens": 236747418.0, "step": 22830 }, { "entropy": 0.7213882625102996, "epoch": 0.18272, "grad_norm": 3.358750581741333, "learning_rate": 4.088075230092037e-05, "loss": 0.7307, "mean_token_accuracy": 0.7788874924182891, "num_tokens": 236889131.0, "step": 22840 }, { "entropy": 0.6972641915082931, "epoch": 0.1828, "grad_norm": 5.580048084259033, "learning_rate": 4.0876750700280116e-05, "loss": 0.6882, "mean_token_accuracy": 0.8173644185066223, "num_tokens": 236928507.0, "step": 22850 }, { "entropy": 0.6999973058700562, "epoch": 0.18288, "grad_norm": 1.6565598249435425, "learning_rate": 4.087274909963985e-05, "loss": 0.697, "mean_token_accuracy": 0.7802406072616577, "num_tokens": 237092184.0, "step": 22860 }, { "entropy": 0.71315096616745, "epoch": 0.18296, "grad_norm": 3.550138235092163, "learning_rate": 4.0868747498999604e-05, "loss": 0.7116, "mean_token_accuracy": 0.7998935401439666, "num_tokens": 237169180.0, "step": 22870 }, { "entropy": 0.7085930824279785, "epoch": 0.18304, "grad_norm": 1.9418388605117798, "learning_rate": 4.086474589835935e-05, "loss": 0.715, "mean_token_accuracy": 0.8002826869487762, "num_tokens": 237260825.0, "step": 22880 }, { "entropy": 0.7499661862850189, "epoch": 0.18312, "grad_norm": 2.5767462253570557, "learning_rate": 4.086074429771909e-05, "loss": 0.7401, "mean_token_accuracy": 0.7812958300113678, "num_tokens": 237390230.0, "step": 22890 }, { "entropy": 0.7398426592350006, "epoch": 0.1832, "grad_norm": 5.498938083648682, "learning_rate": 4.085674269707883e-05, "loss": 0.731, "mean_token_accuracy": 0.8051385402679443, "num_tokens": 237423326.0, "step": 22900 }, { "entropy": 0.6323539704084397, "epoch": 0.18328, "grad_norm": 1.6893141269683838, "learning_rate": 4.085274109643858e-05, "loss": 0.6399, "mean_token_accuracy": 0.7936370253562928, "num_tokens": 237587166.0, "step": 22910 }, { "entropy": 0.6041779547929764, "epoch": 0.18336, "grad_norm": 3.947617769241333, "learning_rate": 4.084873949579832e-05, "loss": 0.5918, "mean_token_accuracy": 0.8258014738559722, "num_tokens": 237671901.0, "step": 22920 }, { "entropy": 0.6817964673042297, "epoch": 0.18344, "grad_norm": 2.7452659606933594, "learning_rate": 4.0844737895158066e-05, "loss": 0.6802, "mean_token_accuracy": 0.8070767760276795, "num_tokens": 237765464.0, "step": 22930 }, { "entropy": 0.6938463926315308, "epoch": 0.18352, "grad_norm": 2.2022030353546143, "learning_rate": 4.084073629451781e-05, "loss": 0.6945, "mean_token_accuracy": 0.7893632769584655, "num_tokens": 237905160.0, "step": 22940 }, { "entropy": 0.6720123589038849, "epoch": 0.1836, "grad_norm": 6.030847549438477, "learning_rate": 4.0836734693877553e-05, "loss": 0.6716, "mean_token_accuracy": 0.818192583322525, "num_tokens": 237943230.0, "step": 22950 }, { "entropy": 0.698908394575119, "epoch": 0.18368, "grad_norm": 1.3169792890548706, "learning_rate": 4.08327330932373e-05, "loss": 0.6927, "mean_token_accuracy": 0.7856005907058716, "num_tokens": 238106979.0, "step": 22960 }, { "entropy": 0.7449725896120072, "epoch": 0.18376, "grad_norm": 3.8229682445526123, "learning_rate": 4.082873149259704e-05, "loss": 0.737, "mean_token_accuracy": 0.7921479165554046, "num_tokens": 238182426.0, "step": 22970 }, { "entropy": 0.703963702917099, "epoch": 0.18384, "grad_norm": 3.013493776321411, "learning_rate": 4.0824729891956785e-05, "loss": 0.6992, "mean_token_accuracy": 0.7992039382457733, "num_tokens": 238275165.0, "step": 22980 }, { "entropy": 0.7439287185668946, "epoch": 0.18392, "grad_norm": 2.7332043647766113, "learning_rate": 4.082072829131653e-05, "loss": 0.7488, "mean_token_accuracy": 0.7767341494560241, "num_tokens": 238409677.0, "step": 22990 }, { "entropy": 0.7112905740737915, "epoch": 0.184, "grad_norm": 5.121912479400635, "learning_rate": 4.081672669067627e-05, "loss": 0.7183, "mean_token_accuracy": 0.8083651840686799, "num_tokens": 238445209.0, "step": 23000 }, { "entropy": 0.737404716014862, "epoch": 0.18408, "grad_norm": 1.8371551036834717, "learning_rate": 4.0812725090036016e-05, "loss": 0.729, "mean_token_accuracy": 0.7773035407066345, "num_tokens": 238604449.0, "step": 23010 }, { "entropy": 0.7342022955417633, "epoch": 0.18416, "grad_norm": 3.8039000034332275, "learning_rate": 4.080872348939576e-05, "loss": 0.7347, "mean_token_accuracy": 0.7990869522094727, "num_tokens": 238669293.0, "step": 23020 }, { "entropy": 0.6961185097694397, "epoch": 0.18424, "grad_norm": 1.680775761604309, "learning_rate": 4.08047218887555e-05, "loss": 0.6841, "mean_token_accuracy": 0.7967734277248383, "num_tokens": 238761437.0, "step": 23030 }, { "entropy": 0.7254520893096924, "epoch": 0.18432, "grad_norm": 2.676375150680542, "learning_rate": 4.080072028811525e-05, "loss": 0.721, "mean_token_accuracy": 0.7814616322517395, "num_tokens": 238901978.0, "step": 23040 }, { "entropy": 0.7075047731399536, "epoch": 0.1844, "grad_norm": 4.9903974533081055, "learning_rate": 4.0796718687475e-05, "loss": 0.6979, "mean_token_accuracy": 0.812750232219696, "num_tokens": 238942031.0, "step": 23050 }, { "entropy": 0.6470142483711243, "epoch": 0.18448, "grad_norm": 1.7536909580230713, "learning_rate": 4.0792717086834734e-05, "loss": 0.649, "mean_token_accuracy": 0.7954873025417328, "num_tokens": 239105871.0, "step": 23060 }, { "entropy": 0.7237686276435852, "epoch": 0.18456, "grad_norm": 4.090611934661865, "learning_rate": 4.078871548619448e-05, "loss": 0.7199, "mean_token_accuracy": 0.7981289744377136, "num_tokens": 239189145.0, "step": 23070 }, { "entropy": 0.7045099139213562, "epoch": 0.18464, "grad_norm": 1.7494314908981323, "learning_rate": 4.078471388555422e-05, "loss": 0.7022, "mean_token_accuracy": 0.7950267732143402, "num_tokens": 239282590.0, "step": 23080 }, { "entropy": 0.6590442061424255, "epoch": 0.18472, "grad_norm": 2.879044771194458, "learning_rate": 4.078071228491397e-05, "loss": 0.66, "mean_token_accuracy": 0.7965273320674896, "num_tokens": 239419413.0, "step": 23090 }, { "entropy": 0.7852351605892182, "epoch": 0.1848, "grad_norm": 5.020484447479248, "learning_rate": 4.077671068427371e-05, "loss": 0.7657, "mean_token_accuracy": 0.8003006041049957, "num_tokens": 239458085.0, "step": 23100 }, { "entropy": 0.6903618156909943, "epoch": 0.18488, "grad_norm": 2.1599020957946777, "learning_rate": 4.077270908363345e-05, "loss": 0.6906, "mean_token_accuracy": 0.7839093923568725, "num_tokens": 239621925.0, "step": 23110 }, { "entropy": 0.7062003195285798, "epoch": 0.18496, "grad_norm": 3.6358935832977295, "learning_rate": 4.07687074829932e-05, "loss": 0.7002, "mean_token_accuracy": 0.8007135927677155, "num_tokens": 239714273.0, "step": 23120 }, { "entropy": 0.6915939331054688, "epoch": 0.18504, "grad_norm": 1.418570876121521, "learning_rate": 4.076470588235295e-05, "loss": 0.7062, "mean_token_accuracy": 0.7983096718788147, "num_tokens": 239809087.0, "step": 23130 }, { "entropy": 0.6411217272281646, "epoch": 0.18512, "grad_norm": 2.4757683277130127, "learning_rate": 4.0760704281712684e-05, "loss": 0.629, "mean_token_accuracy": 0.8059350073337554, "num_tokens": 239942136.0, "step": 23140 }, { "entropy": 0.7596830785274505, "epoch": 0.1852, "grad_norm": 4.1572723388671875, "learning_rate": 4.075670268107243e-05, "loss": 0.7709, "mean_token_accuracy": 0.8037079453468323, "num_tokens": 239973600.0, "step": 23150 }, { "entropy": 0.6986185014247894, "epoch": 0.18528, "grad_norm": 1.6085948944091797, "learning_rate": 4.075270108043218e-05, "loss": 0.6965, "mean_token_accuracy": 0.78153395652771, "num_tokens": 240137440.0, "step": 23160 }, { "entropy": 0.6564988523721695, "epoch": 0.18536, "grad_norm": 3.111870050430298, "learning_rate": 4.074869947979192e-05, "loss": 0.6419, "mean_token_accuracy": 0.812713348865509, "num_tokens": 240224619.0, "step": 23170 }, { "entropy": 0.714034104347229, "epoch": 0.18544, "grad_norm": 1.7734845876693726, "learning_rate": 4.074469787915166e-05, "loss": 0.7229, "mean_token_accuracy": 0.7944191813468933, "num_tokens": 240318002.0, "step": 23180 }, { "entropy": 0.6834815621376038, "epoch": 0.18552, "grad_norm": 2.9837565422058105, "learning_rate": 4.074069627851141e-05, "loss": 0.6761, "mean_token_accuracy": 0.7947249948978424, "num_tokens": 240451137.0, "step": 23190 }, { "entropy": 0.7815622806549072, "epoch": 0.1856, "grad_norm": 5.537600040435791, "learning_rate": 4.073669467787115e-05, "loss": 0.7727, "mean_token_accuracy": 0.8028430998325348, "num_tokens": 240490558.0, "step": 23200 }, { "entropy": 0.6336936950683594, "epoch": 0.18568, "grad_norm": 2.1095712184906006, "learning_rate": 4.07326930772309e-05, "loss": 0.633, "mean_token_accuracy": 0.7977714955806732, "num_tokens": 240654198.0, "step": 23210 }, { "entropy": 0.7356218814849853, "epoch": 0.18576, "grad_norm": 3.3239200115203857, "learning_rate": 4.0728691476590634e-05, "loss": 0.7284, "mean_token_accuracy": 0.7964202105998993, "num_tokens": 240731122.0, "step": 23220 }, { "entropy": 0.6933870255947113, "epoch": 0.18584, "grad_norm": 1.7075196504592896, "learning_rate": 4.0724689875950384e-05, "loss": 0.6922, "mean_token_accuracy": 0.798346883058548, "num_tokens": 240825898.0, "step": 23230 }, { "entropy": 0.7291960120201111, "epoch": 0.18592, "grad_norm": 2.084946393966675, "learning_rate": 4.072068827531013e-05, "loss": 0.7311, "mean_token_accuracy": 0.7839991509914398, "num_tokens": 240960819.0, "step": 23240 }, { "entropy": 0.6742640554904937, "epoch": 0.186, "grad_norm": 6.230724334716797, "learning_rate": 4.071668667466987e-05, "loss": 0.6688, "mean_token_accuracy": 0.8181580185890198, "num_tokens": 241000251.0, "step": 23250 }, { "entropy": 0.6726976990699768, "epoch": 0.18608, "grad_norm": 1.6182925701141357, "learning_rate": 4.0712685074029615e-05, "loss": 0.6729, "mean_token_accuracy": 0.7884404122829437, "num_tokens": 241164091.0, "step": 23260 }, { "entropy": 0.6187936514616013, "epoch": 0.18616, "grad_norm": 3.622978448867798, "learning_rate": 4.070868347338936e-05, "loss": 0.6057, "mean_token_accuracy": 0.8214320242404938, "num_tokens": 241245201.0, "step": 23270 }, { "entropy": 0.7004518926143646, "epoch": 0.18624, "grad_norm": 1.6769864559173584, "learning_rate": 4.07046818727491e-05, "loss": 0.7146, "mean_token_accuracy": 0.7962665677070617, "num_tokens": 241340434.0, "step": 23280 }, { "entropy": 0.7146873712539673, "epoch": 0.18632, "grad_norm": 2.0354185104370117, "learning_rate": 4.0700680272108847e-05, "loss": 0.7117, "mean_token_accuracy": 0.7833512246608734, "num_tokens": 241484167.0, "step": 23290 }, { "entropy": 0.6193594574928284, "epoch": 0.1864, "grad_norm": 5.929380416870117, "learning_rate": 4.069667867146859e-05, "loss": 0.6159, "mean_token_accuracy": 0.831468266248703, "num_tokens": 241524489.0, "step": 23300 }, { "entropy": 0.675643464922905, "epoch": 0.18648, "grad_norm": 1.5811452865600586, "learning_rate": 4.0692677070828334e-05, "loss": 0.678, "mean_token_accuracy": 0.7860283434391022, "num_tokens": 241688329.0, "step": 23310 }, { "entropy": 0.6724188923835754, "epoch": 0.18656, "grad_norm": 3.4265196323394775, "learning_rate": 4.068867547018808e-05, "loss": 0.6592, "mean_token_accuracy": 0.8088804900646209, "num_tokens": 241785185.0, "step": 23320 }, { "entropy": 0.773957222700119, "epoch": 0.18664, "grad_norm": 1.5438995361328125, "learning_rate": 4.068467386954782e-05, "loss": 0.7872, "mean_token_accuracy": 0.7827115535736084, "num_tokens": 241882514.0, "step": 23330 }, { "entropy": 0.6735726952552795, "epoch": 0.18672, "grad_norm": 2.7761504650115967, "learning_rate": 4.0680672268907565e-05, "loss": 0.6721, "mean_token_accuracy": 0.7928688108921051, "num_tokens": 242014267.0, "step": 23340 }, { "entropy": 0.7383569836616516, "epoch": 0.1868, "grad_norm": 5.655780792236328, "learning_rate": 4.067667066826731e-05, "loss": 0.7311, "mean_token_accuracy": 0.8047410368919372, "num_tokens": 242047497.0, "step": 23350 }, { "entropy": 0.6075785636901856, "epoch": 0.18688, "grad_norm": 1.9570412635803223, "learning_rate": 4.067266906762705e-05, "loss": 0.6074, "mean_token_accuracy": 0.8028578460216522, "num_tokens": 242211337.0, "step": 23360 }, { "entropy": 0.7283272296190262, "epoch": 0.18696, "grad_norm": 3.6301255226135254, "learning_rate": 4.0668667466986796e-05, "loss": 0.7192, "mean_token_accuracy": 0.7972404658794403, "num_tokens": 242294936.0, "step": 23370 }, { "entropy": 0.6963740408420562, "epoch": 0.18704, "grad_norm": 2.4476983547210693, "learning_rate": 4.066466586634654e-05, "loss": 0.6999, "mean_token_accuracy": 0.8004231154918671, "num_tokens": 242389184.0, "step": 23380 }, { "entropy": 0.7017114758491516, "epoch": 0.18712, "grad_norm": 3.4295778274536133, "learning_rate": 4.0660664265706284e-05, "loss": 0.7032, "mean_token_accuracy": 0.7883487701416015, "num_tokens": 242513605.0, "step": 23390 }, { "entropy": 0.7547553598880767, "epoch": 0.1872, "grad_norm": 5.266709804534912, "learning_rate": 4.0656662665066034e-05, "loss": 0.763, "mean_token_accuracy": 0.8083842873573304, "num_tokens": 242550721.0, "step": 23400 }, { "entropy": 0.6829511284828186, "epoch": 0.18728, "grad_norm": 2.2195239067077637, "learning_rate": 4.065266106442577e-05, "loss": 0.6787, "mean_token_accuracy": 0.7867244899272918, "num_tokens": 242714561.0, "step": 23410 }, { "entropy": 0.7344180285930634, "epoch": 0.18736, "grad_norm": 3.335066556930542, "learning_rate": 4.0648659463785515e-05, "loss": 0.7212, "mean_token_accuracy": 0.7970059156417847, "num_tokens": 242800824.0, "step": 23420 }, { "entropy": 0.7014655411243439, "epoch": 0.18744, "grad_norm": 2.0373435020446777, "learning_rate": 4.064465786314526e-05, "loss": 0.7076, "mean_token_accuracy": 0.8000874936580658, "num_tokens": 242894991.0, "step": 23430 }, { "entropy": 0.7208017408847809, "epoch": 0.18752, "grad_norm": 3.0744619369506836, "learning_rate": 4.064065626250501e-05, "loss": 0.7133, "mean_token_accuracy": 0.7837130010128022, "num_tokens": 243023214.0, "step": 23440 }, { "entropy": 0.778817567229271, "epoch": 0.1876, "grad_norm": 4.514371395111084, "learning_rate": 4.0636654661864746e-05, "loss": 0.7662, "mean_token_accuracy": 0.8021717727184295, "num_tokens": 243058312.0, "step": 23450 }, { "entropy": 0.7011812031269073, "epoch": 0.18768, "grad_norm": 1.8214662075042725, "learning_rate": 4.063265306122449e-05, "loss": 0.7, "mean_token_accuracy": 0.7832193434238434, "num_tokens": 243222152.0, "step": 23460 }, { "entropy": 0.6760211527347565, "epoch": 0.18776, "grad_norm": 3.2756102085113525, "learning_rate": 4.062865146058424e-05, "loss": 0.6712, "mean_token_accuracy": 0.8037131428718567, "num_tokens": 243313649.0, "step": 23470 }, { "entropy": 0.7203639864921569, "epoch": 0.18784, "grad_norm": 2.1379547119140625, "learning_rate": 4.0624649859943984e-05, "loss": 0.7209, "mean_token_accuracy": 0.7949851036071778, "num_tokens": 243408403.0, "step": 23480 }, { "entropy": 0.7385113298892975, "epoch": 0.18792, "grad_norm": 2.7224678993225098, "learning_rate": 4.062064825930372e-05, "loss": 0.7327, "mean_token_accuracy": 0.7825685918331147, "num_tokens": 243542170.0, "step": 23490 }, { "entropy": 0.7161407202482224, "epoch": 0.188, "grad_norm": 5.280667304992676, "learning_rate": 4.0616646658663464e-05, "loss": 0.7177, "mean_token_accuracy": 0.8075015366077423, "num_tokens": 243577794.0, "step": 23500 }, { "entropy": 0.7406507194042206, "epoch": 0.18808, "grad_norm": 2.4016976356506348, "learning_rate": 4.0612645058023215e-05, "loss": 0.7392, "mean_token_accuracy": 0.7721196353435517, "num_tokens": 243741250.0, "step": 23510 }, { "entropy": 0.6632969737052917, "epoch": 0.18816, "grad_norm": 2.9862473011016846, "learning_rate": 4.060864345738296e-05, "loss": 0.6631, "mean_token_accuracy": 0.8102523505687713, "num_tokens": 243824046.0, "step": 23520 }, { "entropy": 0.715105164051056, "epoch": 0.18824, "grad_norm": 2.3614859580993652, "learning_rate": 4.0604641856742696e-05, "loss": 0.7097, "mean_token_accuracy": 0.8004243433475494, "num_tokens": 243916346.0, "step": 23530 }, { "entropy": 0.6789016664028168, "epoch": 0.18832, "grad_norm": 1.9568686485290527, "learning_rate": 4.060064025610244e-05, "loss": 0.6806, "mean_token_accuracy": 0.7918351233005524, "num_tokens": 244050899.0, "step": 23540 }, { "entropy": 0.7438103556632996, "epoch": 0.1884, "grad_norm": 4.899401664733887, "learning_rate": 4.059663865546219e-05, "loss": 0.7542, "mean_token_accuracy": 0.8013890206813812, "num_tokens": 244087145.0, "step": 23550 }, { "entropy": 0.7042156159877777, "epoch": 0.18848, "grad_norm": 1.6921488046646118, "learning_rate": 4.0592637054821934e-05, "loss": 0.6981, "mean_token_accuracy": 0.781204205751419, "num_tokens": 244250985.0, "step": 23560 }, { "entropy": 0.7604289174079895, "epoch": 0.18856, "grad_norm": 3.1893088817596436, "learning_rate": 4.058863545418167e-05, "loss": 0.7488, "mean_token_accuracy": 0.7836827397346496, "num_tokens": 244357928.0, "step": 23570 }, { "entropy": 0.7145902633666992, "epoch": 0.18864, "grad_norm": 2.87324857711792, "learning_rate": 4.058463385354142e-05, "loss": 0.7057, "mean_token_accuracy": 0.7993674576282501, "num_tokens": 244452126.0, "step": 23580 }, { "entropy": 0.7044288873672485, "epoch": 0.18872, "grad_norm": 3.3054404258728027, "learning_rate": 4.0580632252901165e-05, "loss": 0.7095, "mean_token_accuracy": 0.783352255821228, "num_tokens": 244590099.0, "step": 23590 }, { "entropy": 0.7426506251096725, "epoch": 0.1888, "grad_norm": 4.550796985626221, "learning_rate": 4.057663065226091e-05, "loss": 0.7264, "mean_token_accuracy": 0.8110343754291535, "num_tokens": 244625858.0, "step": 23600 }, { "entropy": 0.6346801042556762, "epoch": 0.18888, "grad_norm": 1.6580979824066162, "learning_rate": 4.0572629051620645e-05, "loss": 0.6309, "mean_token_accuracy": 0.799280321598053, "num_tokens": 244789672.0, "step": 23610 }, { "entropy": 0.6992064356803894, "epoch": 0.18896, "grad_norm": 3.0256283283233643, "learning_rate": 4.0568627450980396e-05, "loss": 0.6922, "mean_token_accuracy": 0.7999421954154968, "num_tokens": 244877824.0, "step": 23620 }, { "entropy": 0.7508036911487579, "epoch": 0.18904, "grad_norm": 2.2898194789886475, "learning_rate": 4.056462585034014e-05, "loss": 0.7544, "mean_token_accuracy": 0.7891430377960205, "num_tokens": 244972449.0, "step": 23630 }, { "entropy": 0.6849518775939941, "epoch": 0.18912, "grad_norm": 3.643214464187622, "learning_rate": 4.056062424969988e-05, "loss": 0.6883, "mean_token_accuracy": 0.7890375673770904, "num_tokens": 245111710.0, "step": 23640 }, { "entropy": 0.6199118971824646, "epoch": 0.1892, "grad_norm": 4.16788387298584, "learning_rate": 4.055662264905963e-05, "loss": 0.6244, "mean_token_accuracy": 0.8264638841152191, "num_tokens": 245153505.0, "step": 23650 }, { "entropy": 0.6941121220588684, "epoch": 0.18928, "grad_norm": 1.9814029932022095, "learning_rate": 4.055262104841937e-05, "loss": 0.694, "mean_token_accuracy": 0.7843429446220398, "num_tokens": 245317345.0, "step": 23660 }, { "entropy": 0.6893690675497055, "epoch": 0.18936, "grad_norm": 3.786000967025757, "learning_rate": 4.0548619447779114e-05, "loss": 0.6846, "mean_token_accuracy": 0.8063007175922394, "num_tokens": 245399634.0, "step": 23670 }, { "entropy": 0.7216401517391204, "epoch": 0.18944, "grad_norm": 2.0490384101867676, "learning_rate": 4.054461784713886e-05, "loss": 0.7224, "mean_token_accuracy": 0.7950467586517334, "num_tokens": 245492478.0, "step": 23680 }, { "entropy": 0.6629175901412964, "epoch": 0.18952, "grad_norm": 1.902233362197876, "learning_rate": 4.05406162464986e-05, "loss": 0.6615, "mean_token_accuracy": 0.7960727572441101, "num_tokens": 245642574.0, "step": 23690 }, { "entropy": 0.6731798231601716, "epoch": 0.1896, "grad_norm": 7.961674690246582, "learning_rate": 4.0536614645858346e-05, "loss": 0.6599, "mean_token_accuracy": 0.8155706584453583, "num_tokens": 245690239.0, "step": 23700 }, { "entropy": 0.6742336750030518, "epoch": 0.18968, "grad_norm": 1.9486973285675049, "learning_rate": 4.053261304521809e-05, "loss": 0.6802, "mean_token_accuracy": 0.7872435271739959, "num_tokens": 245854079.0, "step": 23710 }, { "entropy": 0.6875517427921295, "epoch": 0.18976, "grad_norm": 3.5190846920013428, "learning_rate": 4.052861144457783e-05, "loss": 0.6816, "mean_token_accuracy": 0.8052462100982666, "num_tokens": 245940791.0, "step": 23720 }, { "entropy": 0.68410884141922, "epoch": 0.18984, "grad_norm": 1.541609525680542, "learning_rate": 4.052460984393758e-05, "loss": 0.6961, "mean_token_accuracy": 0.8020910739898681, "num_tokens": 246034070.0, "step": 23730 }, { "entropy": 0.7322937667369842, "epoch": 0.18992, "grad_norm": 2.565082550048828, "learning_rate": 4.052060824329732e-05, "loss": 0.7243, "mean_token_accuracy": 0.7848851263523102, "num_tokens": 246174796.0, "step": 23740 }, { "entropy": 0.7067143440246582, "epoch": 0.19, "grad_norm": 4.914010047912598, "learning_rate": 4.0516606642657064e-05, "loss": 0.6944, "mean_token_accuracy": 0.8145142078399659, "num_tokens": 246217383.0, "step": 23750 }, { "entropy": 0.6894832849502563, "epoch": 0.19008, "grad_norm": 2.121587038040161, "learning_rate": 4.051260504201681e-05, "loss": 0.6893, "mean_token_accuracy": 0.7860059440135956, "num_tokens": 246380906.0, "step": 23760 }, { "entropy": 0.7495656073093414, "epoch": 0.19016, "grad_norm": 4.800922870635986, "learning_rate": 4.050860344137655e-05, "loss": 0.7423, "mean_token_accuracy": 0.7889203488826751, "num_tokens": 246461322.0, "step": 23770 }, { "entropy": 0.7033386051654815, "epoch": 0.19024, "grad_norm": 1.866610050201416, "learning_rate": 4.0504601840736295e-05, "loss": 0.7083, "mean_token_accuracy": 0.800173157453537, "num_tokens": 246554198.0, "step": 23780 }, { "entropy": 0.7069668114185333, "epoch": 0.19032, "grad_norm": 2.1559367179870605, "learning_rate": 4.0500600240096046e-05, "loss": 0.7015, "mean_token_accuracy": 0.785826462507248, "num_tokens": 246695079.0, "step": 23790 }, { "entropy": 0.6566290974617004, "epoch": 0.1904, "grad_norm": 4.0436577796936035, "learning_rate": 4.049659863945578e-05, "loss": 0.6438, "mean_token_accuracy": 0.8238438963890076, "num_tokens": 246738380.0, "step": 23800 }, { "entropy": 0.6181396901607513, "epoch": 0.19048, "grad_norm": 2.6454923152923584, "learning_rate": 4.0492597038815526e-05, "loss": 0.6189, "mean_token_accuracy": 0.8006289660930633, "num_tokens": 246902220.0, "step": 23810 }, { "entropy": 0.6786719352006912, "epoch": 0.19056, "grad_norm": 3.8111255168914795, "learning_rate": 4.048859543817527e-05, "loss": 0.6771, "mean_token_accuracy": 0.8065763115882874, "num_tokens": 246993312.0, "step": 23820 }, { "entropy": 0.7476238548755646, "epoch": 0.19064, "grad_norm": 1.5590484142303467, "learning_rate": 4.048459383753502e-05, "loss": 0.7469, "mean_token_accuracy": 0.7882187008857727, "num_tokens": 247086145.0, "step": 23830 }, { "entropy": 0.7210447430610657, "epoch": 0.19072, "grad_norm": 3.7065300941467285, "learning_rate": 4.048059223689476e-05, "loss": 0.7118, "mean_token_accuracy": 0.7877130091190339, "num_tokens": 247223231.0, "step": 23840 }, { "entropy": 0.7216628611087799, "epoch": 0.1908, "grad_norm": 4.7653093338012695, "learning_rate": 4.04765906362545e-05, "loss": 0.7244, "mean_token_accuracy": 0.8058358013629914, "num_tokens": 247264355.0, "step": 23850 }, { "entropy": 0.6460838198661805, "epoch": 0.19088, "grad_norm": 2.2912466526031494, "learning_rate": 4.047258903561425e-05, "loss": 0.6485, "mean_token_accuracy": 0.7927816152572632, "num_tokens": 247427928.0, "step": 23860 }, { "entropy": 0.6873735845088959, "epoch": 0.19096, "grad_norm": 5.0180583000183105, "learning_rate": 4.0468587434973995e-05, "loss": 0.6852, "mean_token_accuracy": 0.8030618667602539, "num_tokens": 247504264.0, "step": 23870 }, { "entropy": 0.765934145450592, "epoch": 0.19104, "grad_norm": 1.9988212585449219, "learning_rate": 4.046458583433373e-05, "loss": 0.7607, "mean_token_accuracy": 0.7834839701652527, "num_tokens": 247596628.0, "step": 23880 }, { "entropy": 0.6748035281896592, "epoch": 0.19112, "grad_norm": 2.5583982467651367, "learning_rate": 4.0460584233693476e-05, "loss": 0.6753, "mean_token_accuracy": 0.7952576756477356, "num_tokens": 247731038.0, "step": 23890 }, { "entropy": 0.757808718085289, "epoch": 0.1912, "grad_norm": 4.247909069061279, "learning_rate": 4.0456582633053227e-05, "loss": 0.7449, "mean_token_accuracy": 0.8051135420799256, "num_tokens": 247767848.0, "step": 23900 }, { "entropy": 0.7169110357761384, "epoch": 0.19128, "grad_norm": 2.036388397216797, "learning_rate": 4.045258103241297e-05, "loss": 0.7279, "mean_token_accuracy": 0.7836463928222657, "num_tokens": 247931048.0, "step": 23910 }, { "entropy": 0.7010546207427979, "epoch": 0.19136, "grad_norm": 3.7891905307769775, "learning_rate": 4.044857943177271e-05, "loss": 0.6832, "mean_token_accuracy": 0.8045948803424835, "num_tokens": 248011745.0, "step": 23920 }, { "entropy": 0.7237134516239166, "epoch": 0.19144, "grad_norm": 1.6379890441894531, "learning_rate": 4.044457783113246e-05, "loss": 0.7284, "mean_token_accuracy": 0.7938419103622436, "num_tokens": 248106581.0, "step": 23930 }, { "entropy": 0.6728738844394684, "epoch": 0.19152, "grad_norm": 2.617946147918701, "learning_rate": 4.04405762304922e-05, "loss": 0.6656, "mean_token_accuracy": 0.7945909559726715, "num_tokens": 248249813.0, "step": 23940 }, { "entropy": 0.7498361706733704, "epoch": 0.1916, "grad_norm": 5.452594757080078, "learning_rate": 4.0436574629851945e-05, "loss": 0.7491, "mean_token_accuracy": 0.8034337937831879, "num_tokens": 248289751.0, "step": 23950 }, { "entropy": 0.6612809717655181, "epoch": 0.19168, "grad_norm": 1.376196026802063, "learning_rate": 4.043257302921168e-05, "loss": 0.662, "mean_token_accuracy": 0.7884159803390502, "num_tokens": 248453591.0, "step": 23960 }, { "entropy": 0.6146257519721985, "epoch": 0.19176, "grad_norm": 3.0482444763183594, "learning_rate": 4.042857142857143e-05, "loss": 0.6067, "mean_token_accuracy": 0.8250286996364593, "num_tokens": 248533151.0, "step": 23970 }, { "entropy": 0.6832959771156311, "epoch": 0.19184, "grad_norm": 1.4806095361709595, "learning_rate": 4.0424569827931176e-05, "loss": 0.6789, "mean_token_accuracy": 0.8093526661396027, "num_tokens": 248625549.0, "step": 23980 }, { "entropy": 0.7358441054821014, "epoch": 0.19192, "grad_norm": 2.8677594661712646, "learning_rate": 4.042056822729092e-05, "loss": 0.7239, "mean_token_accuracy": 0.7830167055130005, "num_tokens": 248754425.0, "step": 23990 }, { "entropy": 0.7583747982978821, "epoch": 0.192, "grad_norm": 5.400460243225098, "learning_rate": 4.0416566626650664e-05, "loss": 0.757, "mean_token_accuracy": 0.8032839119434356, "num_tokens": 248788691.0, "step": 24000 }, { "entropy": 0.620217365026474, "epoch": 0.19208, "grad_norm": 1.7461082935333252, "learning_rate": 4.041256502601041e-05, "loss": 0.6287, "mean_token_accuracy": 0.7991050064563752, "num_tokens": 248952524.0, "step": 24010 }, { "entropy": 0.7205950915813446, "epoch": 0.19216, "grad_norm": 3.6516406536102295, "learning_rate": 4.040856342537015e-05, "loss": 0.7135, "mean_token_accuracy": 0.7975282073020935, "num_tokens": 249037706.0, "step": 24020 }, { "entropy": 0.7252453804016114, "epoch": 0.19224, "grad_norm": 1.6738641262054443, "learning_rate": 4.0404561824729895e-05, "loss": 0.7192, "mean_token_accuracy": 0.7948180794715881, "num_tokens": 249132735.0, "step": 24030 }, { "entropy": 0.7183156967163086, "epoch": 0.19232, "grad_norm": 1.9783295392990112, "learning_rate": 4.040056022408964e-05, "loss": 0.7195, "mean_token_accuracy": 0.7804535150527954, "num_tokens": 249282287.0, "step": 24040 }, { "entropy": 0.634413093328476, "epoch": 0.1924, "grad_norm": 4.7396650314331055, "learning_rate": 4.039655862344938e-05, "loss": 0.6161, "mean_token_accuracy": 0.8278027415275574, "num_tokens": 249329680.0, "step": 24050 }, { "entropy": 0.6365076065063476, "epoch": 0.19248, "grad_norm": 1.3852134943008423, "learning_rate": 4.0392557022809126e-05, "loss": 0.64, "mean_token_accuracy": 0.7938019096851349, "num_tokens": 249493520.0, "step": 24060 }, { "entropy": 0.6525556087493897, "epoch": 0.19256, "grad_norm": 3.4508018493652344, "learning_rate": 4.038855542216887e-05, "loss": 0.642, "mean_token_accuracy": 0.8103876411914825, "num_tokens": 249588558.0, "step": 24070 }, { "entropy": 0.7133795559406281, "epoch": 0.19264, "grad_norm": 2.4553818702697754, "learning_rate": 4.0384553821528613e-05, "loss": 0.7264, "mean_token_accuracy": 0.7953554272651673, "num_tokens": 249683354.0, "step": 24080 }, { "entropy": 0.6993209779262543, "epoch": 0.19272, "grad_norm": 3.2621383666992188, "learning_rate": 4.038055222088836e-05, "loss": 0.6855, "mean_token_accuracy": 0.7913707196712494, "num_tokens": 249816001.0, "step": 24090 }, { "entropy": 0.6940908849239349, "epoch": 0.1928, "grad_norm": 4.43383264541626, "learning_rate": 4.03765506202481e-05, "loss": 0.7004, "mean_token_accuracy": 0.8116228520870209, "num_tokens": 249848386.0, "step": 24100 }, { "entropy": 0.6970305144786835, "epoch": 0.19288, "grad_norm": 1.5190398693084717, "learning_rate": 4.0372549019607845e-05, "loss": 0.702, "mean_token_accuracy": 0.7849169552326203, "num_tokens": 250012226.0, "step": 24110 }, { "entropy": 0.6680907309055328, "epoch": 0.19296, "grad_norm": 3.734954833984375, "learning_rate": 4.036854741896759e-05, "loss": 0.6552, "mean_token_accuracy": 0.8079137086868287, "num_tokens": 250108589.0, "step": 24120 }, { "entropy": 0.6701581537723541, "epoch": 0.19304, "grad_norm": 1.6263048648834229, "learning_rate": 4.036454581832733e-05, "loss": 0.6767, "mean_token_accuracy": 0.8032383263111115, "num_tokens": 250204097.0, "step": 24130 }, { "entropy": 0.6818401098251343, "epoch": 0.19312, "grad_norm": 4.61167573928833, "learning_rate": 4.0360544217687076e-05, "loss": 0.6818, "mean_token_accuracy": 0.7936575770378113, "num_tokens": 250328795.0, "step": 24140 }, { "entropy": 0.7431852698326111, "epoch": 0.1932, "grad_norm": 4.089645862579346, "learning_rate": 4.035654261704682e-05, "loss": 0.7207, "mean_token_accuracy": 0.8042912006378173, "num_tokens": 250363263.0, "step": 24150 }, { "entropy": 0.6993580758571625, "epoch": 0.19328, "grad_norm": 1.891648530960083, "learning_rate": 4.035254101640656e-05, "loss": 0.7039, "mean_token_accuracy": 0.7829872965812683, "num_tokens": 250527103.0, "step": 24160 }, { "entropy": 0.7429773867130279, "epoch": 0.19336, "grad_norm": 4.27223014831543, "learning_rate": 4.034853941576631e-05, "loss": 0.731, "mean_token_accuracy": 0.7923098862171173, "num_tokens": 250608263.0, "step": 24170 }, { "entropy": 0.6660778880119324, "epoch": 0.19344, "grad_norm": 1.6672859191894531, "learning_rate": 4.034453781512606e-05, "loss": 0.6703, "mean_token_accuracy": 0.8032581090927124, "num_tokens": 250701341.0, "step": 24180 }, { "entropy": 0.6422890186309814, "epoch": 0.19352, "grad_norm": 2.6584904193878174, "learning_rate": 4.0340536214485794e-05, "loss": 0.6316, "mean_token_accuracy": 0.8031275928020477, "num_tokens": 250832628.0, "step": 24190 }, { "entropy": 0.60674988925457, "epoch": 0.1936, "grad_norm": 4.732316970825195, "learning_rate": 4.033653461384554e-05, "loss": 0.6173, "mean_token_accuracy": 0.8302529215812683, "num_tokens": 250871240.0, "step": 24200 }, { "entropy": 0.7076359033584595, "epoch": 0.19368, "grad_norm": 1.7757503986358643, "learning_rate": 4.033253301320528e-05, "loss": 0.7059, "mean_token_accuracy": 0.7797055125236512, "num_tokens": 251034854.0, "step": 24210 }, { "entropy": 0.6900137096643448, "epoch": 0.19376, "grad_norm": 3.547024726867676, "learning_rate": 4.032853141256503e-05, "loss": 0.6834, "mean_token_accuracy": 0.8079200923442841, "num_tokens": 251108220.0, "step": 24220 }, { "entropy": 0.6752313256263733, "epoch": 0.19384, "grad_norm": 1.4416236877441406, "learning_rate": 4.032452981192477e-05, "loss": 0.685, "mean_token_accuracy": 0.8055802404880523, "num_tokens": 251200621.0, "step": 24230 }, { "entropy": 0.7159296452999115, "epoch": 0.19392, "grad_norm": 3.205989360809326, "learning_rate": 4.032052821128451e-05, "loss": 0.697, "mean_token_accuracy": 0.7844821214675903, "num_tokens": 251347015.0, "step": 24240 }, { "entropy": 0.6567562818527222, "epoch": 0.194, "grad_norm": 4.592838764190674, "learning_rate": 4.031652661064426e-05, "loss": 0.6655, "mean_token_accuracy": 0.8181621015071869, "num_tokens": 251389150.0, "step": 24250 }, { "entropy": 0.6863506078720093, "epoch": 0.19408, "grad_norm": 2.866610288619995, "learning_rate": 4.031252501000401e-05, "loss": 0.6893, "mean_token_accuracy": 0.7837872505187988, "num_tokens": 251552990.0, "step": 24260 }, { "entropy": 0.6755431324243546, "epoch": 0.19416, "grad_norm": 3.521580457687378, "learning_rate": 4.0308523409363744e-05, "loss": 0.6607, "mean_token_accuracy": 0.809596836566925, "num_tokens": 251642152.0, "step": 24270 }, { "entropy": 0.7624621391296387, "epoch": 0.19424, "grad_norm": 1.5811755657196045, "learning_rate": 4.030452180872349e-05, "loss": 0.7718, "mean_token_accuracy": 0.7848680853843689, "num_tokens": 251736746.0, "step": 24280 }, { "entropy": 0.7290179610252381, "epoch": 0.19432, "grad_norm": 2.0332067012786865, "learning_rate": 4.030052020808324e-05, "loss": 0.7175, "mean_token_accuracy": 0.7842486441135407, "num_tokens": 251891726.0, "step": 24290 }, { "entropy": 0.6919755429029465, "epoch": 0.1944, "grad_norm": 5.034499168395996, "learning_rate": 4.029651860744298e-05, "loss": 0.6764, "mean_token_accuracy": 0.8139496922492981, "num_tokens": 251939932.0, "step": 24300 }, { "entropy": 0.6715460836887359, "epoch": 0.19448, "grad_norm": 1.8596314191818237, "learning_rate": 4.029251700680272e-05, "loss": 0.684, "mean_token_accuracy": 0.7850463449954986, "num_tokens": 252100193.0, "step": 24310 }, { "entropy": 0.7186572253704071, "epoch": 0.19456, "grad_norm": 3.3598926067352295, "learning_rate": 4.028851540616247e-05, "loss": 0.7061, "mean_token_accuracy": 0.8029260694980621, "num_tokens": 252163144.0, "step": 24320 }, { "entropy": 0.715075820684433, "epoch": 0.19464, "grad_norm": 1.4505176544189453, "learning_rate": 4.028451380552221e-05, "loss": 0.7179, "mean_token_accuracy": 0.8027754366397858, "num_tokens": 252253817.0, "step": 24330 }, { "entropy": 0.6274616479873657, "epoch": 0.19472, "grad_norm": 2.2225821018218994, "learning_rate": 4.028051220488196e-05, "loss": 0.6374, "mean_token_accuracy": 0.7993969678878784, "num_tokens": 252394826.0, "step": 24340 }, { "entropy": 0.6748224496841431, "epoch": 0.1948, "grad_norm": 4.304285526275635, "learning_rate": 4.0276510604241694e-05, "loss": 0.6662, "mean_token_accuracy": 0.8141491293907166, "num_tokens": 252437788.0, "step": 24350 }, { "entropy": 0.6950388133525849, "epoch": 0.19488, "grad_norm": 1.7125478982925415, "learning_rate": 4.0272509003601444e-05, "loss": 0.6901, "mean_token_accuracy": 0.784220939874649, "num_tokens": 252601606.0, "step": 24360 }, { "entropy": 0.661337473988533, "epoch": 0.19496, "grad_norm": 3.4437878131866455, "learning_rate": 4.026850740296119e-05, "loss": 0.662, "mean_token_accuracy": 0.8120720207691192, "num_tokens": 252675343.0, "step": 24370 }, { "entropy": 0.7222983360290527, "epoch": 0.19504, "grad_norm": 2.270872116088867, "learning_rate": 4.026450580232093e-05, "loss": 0.7294, "mean_token_accuracy": 0.7959828674793243, "num_tokens": 252767619.0, "step": 24380 }, { "entropy": 0.7661113917827607, "epoch": 0.19512, "grad_norm": 2.575878381729126, "learning_rate": 4.0260504201680675e-05, "loss": 0.7601, "mean_token_accuracy": 0.7761373698711396, "num_tokens": 252902490.0, "step": 24390 }, { "entropy": 0.7226547956466675, "epoch": 0.1952, "grad_norm": 5.329259395599365, "learning_rate": 4.025650260104042e-05, "loss": 0.7159, "mean_token_accuracy": 0.8055723905563354, "num_tokens": 252941098.0, "step": 24400 }, { "entropy": 0.6913705229759216, "epoch": 0.19528, "grad_norm": 1.9822883605957031, "learning_rate": 4.025250100040016e-05, "loss": 0.6942, "mean_token_accuracy": 0.7839093863964081, "num_tokens": 253104938.0, "step": 24410 }, { "entropy": 0.6406028568744659, "epoch": 0.19536, "grad_norm": 3.3704025745391846, "learning_rate": 4.0248499399759906e-05, "loss": 0.6324, "mean_token_accuracy": 0.8175829946994781, "num_tokens": 253194177.0, "step": 24420 }, { "entropy": 0.663730138540268, "epoch": 0.19544, "grad_norm": 2.155622959136963, "learning_rate": 4.024449779911965e-05, "loss": 0.6675, "mean_token_accuracy": 0.8103002667427063, "num_tokens": 253288901.0, "step": 24430 }, { "entropy": 0.7205103874206543, "epoch": 0.19552, "grad_norm": 3.6967129707336426, "learning_rate": 4.0240496198479394e-05, "loss": 0.7113, "mean_token_accuracy": 0.7818262636661529, "num_tokens": 253431625.0, "step": 24440 }, { "entropy": 0.7026829957962036, "epoch": 0.1956, "grad_norm": 4.525763511657715, "learning_rate": 4.023649459783914e-05, "loss": 0.6882, "mean_token_accuracy": 0.813293045759201, "num_tokens": 253469035.0, "step": 24450 }, { "entropy": 0.6524636566638946, "epoch": 0.19568, "grad_norm": 1.944875955581665, "learning_rate": 4.023249299719888e-05, "loss": 0.6571, "mean_token_accuracy": 0.7929897427558898, "num_tokens": 253632875.0, "step": 24460 }, { "entropy": 0.6347760319709778, "epoch": 0.19576, "grad_norm": 2.9883761405944824, "learning_rate": 4.0228491396558625e-05, "loss": 0.6241, "mean_token_accuracy": 0.8173600256443023, "num_tokens": 253718490.0, "step": 24470 }, { "entropy": 0.7429231584072113, "epoch": 0.19584, "grad_norm": 3.2480597496032715, "learning_rate": 4.022448979591837e-05, "loss": 0.7447, "mean_token_accuracy": 0.7903252184391022, "num_tokens": 253813718.0, "step": 24480 }, { "entropy": 0.7334966659545898, "epoch": 0.19592, "grad_norm": 2.2094054222106934, "learning_rate": 4.022048819527811e-05, "loss": 0.7367, "mean_token_accuracy": 0.7808286666870117, "num_tokens": 253957118.0, "step": 24490 }, { "entropy": 0.7952514231204987, "epoch": 0.196, "grad_norm": 5.552675724029541, "learning_rate": 4.0216486594637856e-05, "loss": 0.7967, "mean_token_accuracy": 0.7910906374454498, "num_tokens": 253997244.0, "step": 24500 }, { "entropy": 0.6379989087581635, "epoch": 0.19608, "grad_norm": 1.6268792152404785, "learning_rate": 4.02124849939976e-05, "loss": 0.6312, "mean_token_accuracy": 0.8011739909648895, "num_tokens": 254158666.0, "step": 24510 }, { "entropy": 0.6710532307624817, "epoch": 0.19616, "grad_norm": 4.01578426361084, "learning_rate": 4.0208483393357344e-05, "loss": 0.6517, "mean_token_accuracy": 0.8082307398319244, "num_tokens": 254233415.0, "step": 24520 }, { "entropy": 0.7058187246322631, "epoch": 0.19624, "grad_norm": 1.419256329536438, "learning_rate": 4.0204481792717094e-05, "loss": 0.7242, "mean_token_accuracy": 0.7949883222579956, "num_tokens": 254327140.0, "step": 24530 }, { "entropy": 0.732913589477539, "epoch": 0.19632, "grad_norm": 3.6477456092834473, "learning_rate": 4.020048019207683e-05, "loss": 0.7163, "mean_token_accuracy": 0.7844912946224213, "num_tokens": 254473194.0, "step": 24540 }, { "entropy": 0.6010377883911133, "epoch": 0.1964, "grad_norm": 3.988865852355957, "learning_rate": 4.0196478591436575e-05, "loss": 0.5889, "mean_token_accuracy": 0.8313175082206726, "num_tokens": 254517241.0, "step": 24550 }, { "entropy": 0.663917601108551, "epoch": 0.19648, "grad_norm": 1.8838955163955688, "learning_rate": 4.019247699079632e-05, "loss": 0.673, "mean_token_accuracy": 0.7884220898151397, "num_tokens": 254681081.0, "step": 24560 }, { "entropy": 0.6348476946353913, "epoch": 0.19656, "grad_norm": 4.197878360748291, "learning_rate": 4.018847539015607e-05, "loss": 0.6344, "mean_token_accuracy": 0.8172521114349365, "num_tokens": 254760826.0, "step": 24570 }, { "entropy": 0.8102673530578614, "epoch": 0.19664, "grad_norm": 1.762830138206482, "learning_rate": 4.0184473789515806e-05, "loss": 0.7971, "mean_token_accuracy": 0.7741909801959992, "num_tokens": 254855584.0, "step": 24580 }, { "entropy": 0.6010179907083512, "epoch": 0.19672, "grad_norm": 4.041004180908203, "learning_rate": 4.018047218887555e-05, "loss": 0.6027, "mean_token_accuracy": 0.8132854044437409, "num_tokens": 254991736.0, "step": 24590 }, { "entropy": 0.6757224798202515, "epoch": 0.1968, "grad_norm": 4.528892517089844, "learning_rate": 4.01764705882353e-05, "loss": 0.6761, "mean_token_accuracy": 0.8216265797615051, "num_tokens": 255028178.0, "step": 24600 }, { "entropy": 0.7187901973724365, "epoch": 0.19688, "grad_norm": 1.9317710399627686, "learning_rate": 4.0172468987595044e-05, "loss": 0.7178, "mean_token_accuracy": 0.7774042427539826, "num_tokens": 255191382.0, "step": 24610 }, { "entropy": 0.6769185721874237, "epoch": 0.19696, "grad_norm": 3.7438535690307617, "learning_rate": 4.016846738695478e-05, "loss": 0.6664, "mean_token_accuracy": 0.8068778812885284, "num_tokens": 255267019.0, "step": 24620 }, { "entropy": 0.7242698788642883, "epoch": 0.19704, "grad_norm": 1.3642483949661255, "learning_rate": 4.0164465786314524e-05, "loss": 0.7226, "mean_token_accuracy": 0.7934981703758239, "num_tokens": 255361117.0, "step": 24630 }, { "entropy": 0.7049399137496948, "epoch": 0.19712, "grad_norm": 2.72698974609375, "learning_rate": 4.0160464185674275e-05, "loss": 0.7063, "mean_token_accuracy": 0.7832386434078217, "num_tokens": 255506954.0, "step": 24640 }, { "entropy": 0.8015026271343231, "epoch": 0.1972, "grad_norm": 5.624351501464844, "learning_rate": 4.015646258503402e-05, "loss": 0.7848, "mean_token_accuracy": 0.7950710356235504, "num_tokens": 255545987.0, "step": 24650 }, { "entropy": 0.6230179309844971, "epoch": 0.19728, "grad_norm": 2.0398271083831787, "learning_rate": 4.0152460984393756e-05, "loss": 0.6263, "mean_token_accuracy": 0.8011766016483307, "num_tokens": 255708632.0, "step": 24660 }, { "entropy": 0.717952823638916, "epoch": 0.19736, "grad_norm": 3.179135322570801, "learning_rate": 4.01484593837535e-05, "loss": 0.7194, "mean_token_accuracy": 0.7914114415645599, "num_tokens": 255791632.0, "step": 24670 }, { "entropy": 0.6741098582744598, "epoch": 0.19744, "grad_norm": 1.9345171451568604, "learning_rate": 4.014445778311325e-05, "loss": 0.673, "mean_token_accuracy": 0.8068709135055542, "num_tokens": 255886948.0, "step": 24680 }, { "entropy": 0.7364036798477173, "epoch": 0.19752, "grad_norm": 2.748660087585449, "learning_rate": 4.0140456182472994e-05, "loss": 0.7348, "mean_token_accuracy": 0.7797862946987152, "num_tokens": 256025319.0, "step": 24690 }, { "entropy": 0.7276161253452301, "epoch": 0.1976, "grad_norm": 5.736556053161621, "learning_rate": 4.013645458183273e-05, "loss": 0.715, "mean_token_accuracy": 0.8122204065322876, "num_tokens": 256064927.0, "step": 24700 }, { "entropy": 0.6125629305839538, "epoch": 0.19768, "grad_norm": 1.4397363662719727, "learning_rate": 4.013245298119248e-05, "loss": 0.6159, "mean_token_accuracy": 0.8013190031051636, "num_tokens": 256228767.0, "step": 24710 }, { "entropy": 0.753754872083664, "epoch": 0.19776, "grad_norm": 3.439856767654419, "learning_rate": 4.0128451380552225e-05, "loss": 0.7521, "mean_token_accuracy": 0.7885772228240967, "num_tokens": 256320728.0, "step": 24720 }, { "entropy": 0.7797607600688934, "epoch": 0.19784, "grad_norm": 1.8975708484649658, "learning_rate": 4.012444977991197e-05, "loss": 0.7713, "mean_token_accuracy": 0.7848673582077026, "num_tokens": 256413913.0, "step": 24730 }, { "entropy": 0.6441531002521514, "epoch": 0.19792, "grad_norm": 2.7205536365509033, "learning_rate": 4.0120448179271705e-05, "loss": 0.6421, "mean_token_accuracy": 0.808480703830719, "num_tokens": 256531531.0, "step": 24740 }, { "entropy": 0.7136757671833038, "epoch": 0.198, "grad_norm": 4.6822686195373535, "learning_rate": 4.0116446578631456e-05, "loss": 0.7086, "mean_token_accuracy": 0.8157759189605713, "num_tokens": 256561932.0, "step": 24750 }, { "entropy": 0.7201740086078644, "epoch": 0.19808, "grad_norm": 2.2835910320281982, "learning_rate": 4.01124449779912e-05, "loss": 0.7157, "mean_token_accuracy": 0.781583833694458, "num_tokens": 256725566.0, "step": 24760 }, { "entropy": 0.6683447569608688, "epoch": 0.19816, "grad_norm": 3.162980794906616, "learning_rate": 4.010844337735094e-05, "loss": 0.6605, "mean_token_accuracy": 0.8118211030960083, "num_tokens": 256805849.0, "step": 24770 }, { "entropy": 0.7133530139923095, "epoch": 0.19824, "grad_norm": 1.416582703590393, "learning_rate": 4.010444177671069e-05, "loss": 0.7256, "mean_token_accuracy": 0.7918986201286315, "num_tokens": 256901411.0, "step": 24780 }, { "entropy": 0.7145650267601014, "epoch": 0.19832, "grad_norm": 2.3120083808898926, "learning_rate": 4.010044017607043e-05, "loss": 0.7087, "mean_token_accuracy": 0.7914305925369263, "num_tokens": 257025827.0, "step": 24790 }, { "entropy": 0.7134221315383911, "epoch": 0.1984, "grad_norm": 5.606067657470703, "learning_rate": 4.0096438575430174e-05, "loss": 0.7103, "mean_token_accuracy": 0.8126696467399597, "num_tokens": 257058789.0, "step": 24800 }, { "entropy": 0.6442330539226532, "epoch": 0.19848, "grad_norm": 2.115781307220459, "learning_rate": 4.009243697478992e-05, "loss": 0.6433, "mean_token_accuracy": 0.79511878490448, "num_tokens": 257221913.0, "step": 24810 }, { "entropy": 0.6926358938217163, "epoch": 0.19856, "grad_norm": 3.5341598987579346, "learning_rate": 4.008843537414966e-05, "loss": 0.6779, "mean_token_accuracy": 0.8110290884971618, "num_tokens": 257296024.0, "step": 24820 }, { "entropy": 0.6478572428226471, "epoch": 0.19864, "grad_norm": 2.59367299079895, "learning_rate": 4.0084433773509405e-05, "loss": 0.6509, "mean_token_accuracy": 0.8089015543460846, "num_tokens": 257388663.0, "step": 24830 }, { "entropy": 0.7703151881694794, "epoch": 0.19872, "grad_norm": 3.325883388519287, "learning_rate": 4.008043217286915e-05, "loss": 0.763, "mean_token_accuracy": 0.7731131196022034, "num_tokens": 257514457.0, "step": 24840 }, { "entropy": 0.6413863480091095, "epoch": 0.1988, "grad_norm": 5.710382461547852, "learning_rate": 4.007643057222889e-05, "loss": 0.6597, "mean_token_accuracy": 0.8177377581596375, "num_tokens": 257550543.0, "step": 24850 }, { "entropy": 0.634521746635437, "epoch": 0.19888, "grad_norm": 1.5227776765823364, "learning_rate": 4.007242897158864e-05, "loss": 0.6293, "mean_token_accuracy": 0.7995908677577972, "num_tokens": 257714383.0, "step": 24860 }, { "entropy": 0.6993597209453583, "epoch": 0.19896, "grad_norm": 3.3141560554504395, "learning_rate": 4.006842737094838e-05, "loss": 0.694, "mean_token_accuracy": 0.805402410030365, "num_tokens": 257789716.0, "step": 24870 }, { "entropy": 0.7171655535697937, "epoch": 0.19904, "grad_norm": 1.711638331413269, "learning_rate": 4.0064425770308124e-05, "loss": 0.7251, "mean_token_accuracy": 0.7963369846343994, "num_tokens": 257880957.0, "step": 24880 }, { "entropy": 0.7002371907234192, "epoch": 0.19912, "grad_norm": 3.2591922283172607, "learning_rate": 4.006042416966787e-05, "loss": 0.6882, "mean_token_accuracy": 0.7878976583480835, "num_tokens": 258020642.0, "step": 24890 }, { "entropy": 0.7279939472675323, "epoch": 0.1992, "grad_norm": 4.484709739685059, "learning_rate": 4.005642256902761e-05, "loss": 0.7179, "mean_token_accuracy": 0.8097670316696167, "num_tokens": 258060428.0, "step": 24900 }, { "entropy": 0.6189093708992004, "epoch": 0.19928, "grad_norm": 1.8646475076675415, "learning_rate": 4.0052420968387355e-05, "loss": 0.6194, "mean_token_accuracy": 0.8009886980056763, "num_tokens": 258224165.0, "step": 24910 }, { "entropy": 0.6188645541667939, "epoch": 0.19936, "grad_norm": 3.421945810317993, "learning_rate": 4.0048419367747106e-05, "loss": 0.6187, "mean_token_accuracy": 0.8225226879119873, "num_tokens": 258301182.0, "step": 24920 }, { "entropy": 0.7107336342334747, "epoch": 0.19944, "grad_norm": 2.7477495670318604, "learning_rate": 4.004441776710684e-05, "loss": 0.7099, "mean_token_accuracy": 0.7998300135135651, "num_tokens": 258393413.0, "step": 24930 }, { "entropy": 0.6579534471035003, "epoch": 0.19952, "grad_norm": 2.1587331295013428, "learning_rate": 4.0040416166466586e-05, "loss": 0.6494, "mean_token_accuracy": 0.7969311833381653, "num_tokens": 258541679.0, "step": 24940 }, { "entropy": 0.6871620565652847, "epoch": 0.1996, "grad_norm": 6.012516498565674, "learning_rate": 4.003641456582633e-05, "loss": 0.6939, "mean_token_accuracy": 0.8171104729175568, "num_tokens": 258586866.0, "step": 24950 }, { "entropy": 0.6963134706020355, "epoch": 0.19968, "grad_norm": 2.2377233505249023, "learning_rate": 4.003241296518608e-05, "loss": 0.6959, "mean_token_accuracy": 0.7846238434314727, "num_tokens": 258750706.0, "step": 24960 }, { "entropy": 0.6618042767047883, "epoch": 0.19976, "grad_norm": 3.1241471767425537, "learning_rate": 4.002841136454582e-05, "loss": 0.6503, "mean_token_accuracy": 0.8077340245246887, "num_tokens": 258862736.0, "step": 24970 }, { "entropy": 0.6295656561851501, "epoch": 0.19984, "grad_norm": 1.9967572689056396, "learning_rate": 4.002440976390556e-05, "loss": 0.6411, "mean_token_accuracy": 0.8124807834625244, "num_tokens": 258957923.0, "step": 24980 }, { "entropy": 0.7085526764392853, "epoch": 0.19992, "grad_norm": 3.153262138366699, "learning_rate": 4.002040816326531e-05, "loss": 0.7037, "mean_token_accuracy": 0.7865424215793609, "num_tokens": 259100229.0, "step": 24990 }, { "entropy": 0.7005757033824921, "epoch": 0.2, "grad_norm": 4.637353420257568, "learning_rate": 4.0016406562625055e-05, "loss": 0.6976, "mean_token_accuracy": 0.8112364709377289, "num_tokens": 259136611.0, "step": 25000 }, { "entropy": 0.6884506523609162, "epoch": 0.20008, "grad_norm": 1.7353684902191162, "learning_rate": 4.001240496198479e-05, "loss": 0.6864, "mean_token_accuracy": 0.7841363489627838, "num_tokens": 259300104.0, "step": 25010 }, { "entropy": 0.6631472945213318, "epoch": 0.20016, "grad_norm": 4.056802272796631, "learning_rate": 4.0008403361344536e-05, "loss": 0.6554, "mean_token_accuracy": 0.8118070244789124, "num_tokens": 259376054.0, "step": 25020 }, { "entropy": 0.6654484689235687, "epoch": 0.20024, "grad_norm": 2.462924003601074, "learning_rate": 4.0004401760704287e-05, "loss": 0.6792, "mean_token_accuracy": 0.8059291243553162, "num_tokens": 259467921.0, "step": 25030 }, { "entropy": 0.7115656316280365, "epoch": 0.20032, "grad_norm": 2.933295965194702, "learning_rate": 4.000040016006403e-05, "loss": 0.7113, "mean_token_accuracy": 0.7873055338859558, "num_tokens": 259613179.0, "step": 25040 }, { "entropy": 0.6968668818473815, "epoch": 0.2004, "grad_norm": 5.490992069244385, "learning_rate": 3.999639855942377e-05, "loss": 0.6875, "mean_token_accuracy": 0.8115338563919068, "num_tokens": 259656524.0, "step": 25050 }, { "entropy": 0.6283232212066651, "epoch": 0.20048, "grad_norm": 1.3228689432144165, "learning_rate": 3.999239695878352e-05, "loss": 0.628, "mean_token_accuracy": 0.797270393371582, "num_tokens": 259820364.0, "step": 25060 }, { "entropy": 0.618096861243248, "epoch": 0.20056, "grad_norm": 2.6827218532562256, "learning_rate": 3.998839535814326e-05, "loss": 0.6133, "mean_token_accuracy": 0.8177373051643372, "num_tokens": 259914085.0, "step": 25070 }, { "entropy": 0.722721379995346, "epoch": 0.20064, "grad_norm": 1.5182441473007202, "learning_rate": 3.9984393757503005e-05, "loss": 0.7365, "mean_token_accuracy": 0.7872776687145233, "num_tokens": 260009810.0, "step": 25080 }, { "entropy": 0.7095038235187531, "epoch": 0.20072, "grad_norm": 3.0434467792510986, "learning_rate": 3.998039215686274e-05, "loss": 0.7002, "mean_token_accuracy": 0.7873101770877838, "num_tokens": 260149408.0, "step": 25090 }, { "entropy": 0.771328940987587, "epoch": 0.2008, "grad_norm": 4.771576881408691, "learning_rate": 3.997639055622249e-05, "loss": 0.7665, "mean_token_accuracy": 0.7978789150714874, "num_tokens": 260187083.0, "step": 25100 }, { "entropy": 0.7131183445453644, "epoch": 0.20088, "grad_norm": 1.680153250694275, "learning_rate": 3.9972388955582236e-05, "loss": 0.711, "mean_token_accuracy": 0.7797813773155212, "num_tokens": 260350923.0, "step": 25110 }, { "entropy": 0.7369094729423523, "epoch": 0.20096, "grad_norm": 3.8054938316345215, "learning_rate": 3.996838735494198e-05, "loss": 0.7221, "mean_token_accuracy": 0.8027411997318268, "num_tokens": 260421897.0, "step": 25120 }, { "entropy": 0.6318001747131348, "epoch": 0.20104, "grad_norm": 1.6397740840911865, "learning_rate": 3.9964385754301724e-05, "loss": 0.6458, "mean_token_accuracy": 0.814090496301651, "num_tokens": 260513641.0, "step": 25130 }, { "entropy": 0.7953001022338867, "epoch": 0.20112, "grad_norm": 2.975283145904541, "learning_rate": 3.996038415366147e-05, "loss": 0.7842, "mean_token_accuracy": 0.7714639902114868, "num_tokens": 260653698.0, "step": 25140 }, { "entropy": 0.6991019904613495, "epoch": 0.2012, "grad_norm": 4.621942520141602, "learning_rate": 3.995638255302121e-05, "loss": 0.7144, "mean_token_accuracy": 0.8093109548091888, "num_tokens": 260696893.0, "step": 25150 }, { "entropy": 0.6667765438556671, "epoch": 0.20128, "grad_norm": 1.3331973552703857, "learning_rate": 3.9952380952380955e-05, "loss": 0.6636, "mean_token_accuracy": 0.792946994304657, "num_tokens": 260860733.0, "step": 25160 }, { "entropy": 0.6473306536674499, "epoch": 0.20136, "grad_norm": 3.0171492099761963, "learning_rate": 3.99483793517407e-05, "loss": 0.633, "mean_token_accuracy": 0.8126182436943055, "num_tokens": 260953564.0, "step": 25170 }, { "entropy": 0.6792721807956695, "epoch": 0.20144, "grad_norm": 1.8973530530929565, "learning_rate": 3.994437775110044e-05, "loss": 0.6725, "mean_token_accuracy": 0.8020931720733643, "num_tokens": 261049057.0, "step": 25180 }, { "entropy": 0.7262223899364472, "epoch": 0.20152, "grad_norm": 2.4005537033081055, "learning_rate": 3.9940376150460186e-05, "loss": 0.7203, "mean_token_accuracy": 0.7846355557441711, "num_tokens": 261183679.0, "step": 25190 }, { "entropy": 0.7140032768249511, "epoch": 0.2016, "grad_norm": 4.639849662780762, "learning_rate": 3.993637454981993e-05, "loss": 0.714, "mean_token_accuracy": 0.8095127761363983, "num_tokens": 261224192.0, "step": 25200 }, { "entropy": 0.6447705507278443, "epoch": 0.20168, "grad_norm": 1.5477720499038696, "learning_rate": 3.993237294917967e-05, "loss": 0.6509, "mean_token_accuracy": 0.7936437606811524, "num_tokens": 261387994.0, "step": 25210 }, { "entropy": 0.7277528136968613, "epoch": 0.20176, "grad_norm": 2.994816541671753, "learning_rate": 3.992837134853942e-05, "loss": 0.7162, "mean_token_accuracy": 0.8008717656135559, "num_tokens": 261462648.0, "step": 25220 }, { "entropy": 0.7663103103637695, "epoch": 0.20184, "grad_norm": 1.7574437856674194, "learning_rate": 3.992436974789916e-05, "loss": 0.7517, "mean_token_accuracy": 0.7869740724563599, "num_tokens": 261555094.0, "step": 25230 }, { "entropy": 0.6925870656967164, "epoch": 0.20192, "grad_norm": 2.3821182250976562, "learning_rate": 3.9920368147258905e-05, "loss": 0.6965, "mean_token_accuracy": 0.7850136697292328, "num_tokens": 261699305.0, "step": 25240 }, { "entropy": 0.6955644130706787, "epoch": 0.202, "grad_norm": 5.189542293548584, "learning_rate": 3.991636654661865e-05, "loss": 0.6941, "mean_token_accuracy": 0.8153845012187958, "num_tokens": 261739864.0, "step": 25250 }, { "entropy": 0.6578835129737854, "epoch": 0.20208, "grad_norm": 1.5818095207214355, "learning_rate": 3.991236494597839e-05, "loss": 0.6561, "mean_token_accuracy": 0.792641669511795, "num_tokens": 261903704.0, "step": 25260 }, { "entropy": 0.6895025432109833, "epoch": 0.20216, "grad_norm": 3.5199012756347656, "learning_rate": 3.9908363345338136e-05, "loss": 0.6906, "mean_token_accuracy": 0.7974128901958466, "num_tokens": 261994047.0, "step": 25270 }, { "entropy": 0.6654979705810546, "epoch": 0.20224, "grad_norm": 2.6273205280303955, "learning_rate": 3.990436174469788e-05, "loss": 0.6511, "mean_token_accuracy": 0.8138873338699341, "num_tokens": 262086789.0, "step": 25280 }, { "entropy": 0.7417469322681427, "epoch": 0.20232, "grad_norm": 2.222193479537964, "learning_rate": 3.990036014405762e-05, "loss": 0.7328, "mean_token_accuracy": 0.7819559752941132, "num_tokens": 262226166.0, "step": 25290 }, { "entropy": 0.6241596609354019, "epoch": 0.2024, "grad_norm": 4.1083760261535645, "learning_rate": 3.989635854341737e-05, "loss": 0.6261, "mean_token_accuracy": 0.8272576093673706, "num_tokens": 262262478.0, "step": 25300 }, { "entropy": 0.6260010659694671, "epoch": 0.20248, "grad_norm": 2.10209059715271, "learning_rate": 3.989235694277712e-05, "loss": 0.6269, "mean_token_accuracy": 0.8004030227661133, "num_tokens": 262426318.0, "step": 25310 }, { "entropy": 0.6442861437797547, "epoch": 0.20256, "grad_norm": 3.7990543842315674, "learning_rate": 3.9888355342136854e-05, "loss": 0.6379, "mean_token_accuracy": 0.8100100696086884, "num_tokens": 262527157.0, "step": 25320 }, { "entropy": 0.6845799028873444, "epoch": 0.20264, "grad_norm": 1.5901566743850708, "learning_rate": 3.98843537414966e-05, "loss": 0.6917, "mean_token_accuracy": 0.8044047951698303, "num_tokens": 262621881.0, "step": 25330 }, { "entropy": 0.7143328845500946, "epoch": 0.20272, "grad_norm": 2.5208330154418945, "learning_rate": 3.988035214085634e-05, "loss": 0.7014, "mean_token_accuracy": 0.789809912443161, "num_tokens": 262756902.0, "step": 25340 }, { "entropy": 0.6578180730342865, "epoch": 0.2028, "grad_norm": 4.625339031219482, "learning_rate": 3.987635054021609e-05, "loss": 0.6544, "mean_token_accuracy": 0.8225129842758179, "num_tokens": 262791645.0, "step": 25350 }, { "entropy": 0.6993514418601989, "epoch": 0.20288, "grad_norm": 1.8647209405899048, "learning_rate": 3.987234893957583e-05, "loss": 0.7018, "mean_token_accuracy": 0.7835330307483673, "num_tokens": 262954744.0, "step": 25360 }, { "entropy": 0.6461169093847274, "epoch": 0.20296, "grad_norm": 4.488020420074463, "learning_rate": 3.986834733893557e-05, "loss": 0.642, "mean_token_accuracy": 0.8156070351600647, "num_tokens": 263032522.0, "step": 25370 }, { "entropy": 0.71181880235672, "epoch": 0.20304, "grad_norm": 1.4542288780212402, "learning_rate": 3.986434573829532e-05, "loss": 0.7242, "mean_token_accuracy": 0.7964806318283081, "num_tokens": 263124856.0, "step": 25380 }, { "entropy": 0.7168243944644928, "epoch": 0.20312, "grad_norm": 3.1399924755096436, "learning_rate": 3.986034413765507e-05, "loss": 0.7085, "mean_token_accuracy": 0.783941638469696, "num_tokens": 263270030.0, "step": 25390 }, { "entropy": 0.6724173992872238, "epoch": 0.2032, "grad_norm": 4.49687385559082, "learning_rate": 3.9856342537014804e-05, "loss": 0.6617, "mean_token_accuracy": 0.8222220122814179, "num_tokens": 263316319.0, "step": 25400 }, { "entropy": 0.7136051237583161, "epoch": 0.20328, "grad_norm": 1.517360806465149, "learning_rate": 3.985234093637455e-05, "loss": 0.7186, "mean_token_accuracy": 0.7783751487731934, "num_tokens": 263480124.0, "step": 25410 }, { "entropy": 0.6389696180820466, "epoch": 0.20336, "grad_norm": 3.0959880352020264, "learning_rate": 3.98483393357343e-05, "loss": 0.6337, "mean_token_accuracy": 0.8150713384151459, "num_tokens": 263561599.0, "step": 25420 }, { "entropy": 0.6841227412223816, "epoch": 0.20344, "grad_norm": 1.591244101524353, "learning_rate": 3.984433773509404e-05, "loss": 0.6815, "mean_token_accuracy": 0.8067872405052186, "num_tokens": 263656628.0, "step": 25430 }, { "entropy": 0.6696559131145478, "epoch": 0.20352, "grad_norm": 2.619391918182373, "learning_rate": 3.984033613445378e-05, "loss": 0.6664, "mean_token_accuracy": 0.799454289674759, "num_tokens": 263789146.0, "step": 25440 }, { "entropy": 0.7188738375902176, "epoch": 0.2036, "grad_norm": 6.366925239562988, "learning_rate": 3.983633453381353e-05, "loss": 0.7154, "mean_token_accuracy": 0.810100132226944, "num_tokens": 263827592.0, "step": 25450 }, { "entropy": 0.6599365115165711, "epoch": 0.20368, "grad_norm": 2.804137706756592, "learning_rate": 3.983233293317327e-05, "loss": 0.6614, "mean_token_accuracy": 0.7905410528182983, "num_tokens": 263991432.0, "step": 25460 }, { "entropy": 0.734381765127182, "epoch": 0.20376, "grad_norm": 2.8372795581817627, "learning_rate": 3.982833133253302e-05, "loss": 0.7253, "mean_token_accuracy": 0.7925753772258759, "num_tokens": 264080392.0, "step": 25470 }, { "entropy": 0.7343921422958374, "epoch": 0.20384, "grad_norm": 1.8563711643218994, "learning_rate": 3.9824329731892754e-05, "loss": 0.7396, "mean_token_accuracy": 0.7869485259056092, "num_tokens": 264175318.0, "step": 25480 }, { "entropy": 0.7332039594650268, "epoch": 0.20392, "grad_norm": 3.760741949081421, "learning_rate": 3.9820328131252504e-05, "loss": 0.7275, "mean_token_accuracy": 0.7802991211414337, "num_tokens": 264307972.0, "step": 25490 }, { "entropy": 0.6481894046068192, "epoch": 0.204, "grad_norm": 4.219727993011475, "learning_rate": 3.981632653061225e-05, "loss": 0.628, "mean_token_accuracy": 0.8289444208145141, "num_tokens": 264348218.0, "step": 25500 }, { "entropy": 0.6966863691806793, "epoch": 0.20408, "grad_norm": 1.9701169729232788, "learning_rate": 3.981232492997199e-05, "loss": 0.701, "mean_token_accuracy": 0.7832464873790741, "num_tokens": 264510668.0, "step": 25510 }, { "entropy": 0.738735806941986, "epoch": 0.20416, "grad_norm": 3.452115058898926, "learning_rate": 3.9808323329331735e-05, "loss": 0.7281, "mean_token_accuracy": 0.7944708108901978, "num_tokens": 264592078.0, "step": 25520 }, { "entropy": 0.7567759215831756, "epoch": 0.20424, "grad_norm": 1.5742406845092773, "learning_rate": 3.980432172869148e-05, "loss": 0.7538, "mean_token_accuracy": 0.7877562820911408, "num_tokens": 264687936.0, "step": 25530 }, { "entropy": 0.696179884672165, "epoch": 0.20432, "grad_norm": 2.5601277351379395, "learning_rate": 3.980032012805122e-05, "loss": 0.698, "mean_token_accuracy": 0.7882402777671814, "num_tokens": 264829205.0, "step": 25540 }, { "entropy": 0.6192919135093689, "epoch": 0.2044, "grad_norm": 5.8695197105407715, "learning_rate": 3.9796318527410966e-05, "loss": 0.6163, "mean_token_accuracy": 0.8290840864181519, "num_tokens": 264869143.0, "step": 25550 }, { "entropy": 0.6387640327215195, "epoch": 0.20448, "grad_norm": 1.5422310829162598, "learning_rate": 3.979231692677071e-05, "loss": 0.6383, "mean_token_accuracy": 0.7996030867099762, "num_tokens": 265032983.0, "step": 25560 }, { "entropy": 0.6861141502857209, "epoch": 0.20456, "grad_norm": 2.831608533859253, "learning_rate": 3.9788315326130454e-05, "loss": 0.6816, "mean_token_accuracy": 0.8078847765922547, "num_tokens": 265116271.0, "step": 25570 }, { "entropy": 0.7126593708992004, "epoch": 0.20464, "grad_norm": 1.4403151273727417, "learning_rate": 3.97843137254902e-05, "loss": 0.7013, "mean_token_accuracy": 0.7999130010604858, "num_tokens": 265209394.0, "step": 25580 }, { "entropy": 0.6966061770915986, "epoch": 0.20472, "grad_norm": 2.3389594554901123, "learning_rate": 3.978031212484994e-05, "loss": 0.6931, "mean_token_accuracy": 0.7913048744201661, "num_tokens": 265338220.0, "step": 25590 }, { "entropy": 0.6078276634216309, "epoch": 0.2048, "grad_norm": 6.349025249481201, "learning_rate": 3.9776310524209685e-05, "loss": 0.6073, "mean_token_accuracy": 0.8295013725757598, "num_tokens": 265375989.0, "step": 25600 }, { "entropy": 0.7980846881866455, "epoch": 0.20488, "grad_norm": 2.1716060638427734, "learning_rate": 3.977230892356943e-05, "loss": 0.7964, "mean_token_accuracy": 0.7654250204563141, "num_tokens": 265539829.0, "step": 25610 }, { "entropy": 0.65274518430233, "epoch": 0.20496, "grad_norm": 2.8036282062530518, "learning_rate": 3.976830732292917e-05, "loss": 0.642, "mean_token_accuracy": 0.8126444756984711, "num_tokens": 265624732.0, "step": 25620 }, { "entropy": 0.6401945114135742, "epoch": 0.20504, "grad_norm": 1.6771445274353027, "learning_rate": 3.9764305722288916e-05, "loss": 0.6574, "mean_token_accuracy": 0.8096670866012573, "num_tokens": 265720117.0, "step": 25630 }, { "entropy": 0.6359527289867402, "epoch": 0.20512, "grad_norm": 2.8850467205047607, "learning_rate": 3.976030412164866e-05, "loss": 0.6257, "mean_token_accuracy": 0.8063440144062042, "num_tokens": 265856296.0, "step": 25640 }, { "entropy": 0.7150340765714646, "epoch": 0.2052, "grad_norm": 4.229689598083496, "learning_rate": 3.9756302521008404e-05, "loss": 0.7068, "mean_token_accuracy": 0.8150962948799133, "num_tokens": 265888047.0, "step": 25650 }, { "entropy": 0.6628670454025268, "epoch": 0.20528, "grad_norm": 1.6222285032272339, "learning_rate": 3.9752300920368154e-05, "loss": 0.6662, "mean_token_accuracy": 0.7873948752880097, "num_tokens": 266051401.0, "step": 25660 }, { "entropy": 0.7389098942279816, "epoch": 0.20536, "grad_norm": 2.959505558013916, "learning_rate": 3.974829931972789e-05, "loss": 0.7264, "mean_token_accuracy": 0.7944243252277374, "num_tokens": 266131318.0, "step": 25670 }, { "entropy": 0.6670049905776978, "epoch": 0.20544, "grad_norm": 2.21547269821167, "learning_rate": 3.9744297719087635e-05, "loss": 0.6508, "mean_token_accuracy": 0.8081030905246734, "num_tokens": 266226541.0, "step": 25680 }, { "entropy": 0.6422082364559174, "epoch": 0.20552, "grad_norm": 2.0161874294281006, "learning_rate": 3.974029611844738e-05, "loss": 0.6489, "mean_token_accuracy": 0.7998554408550262, "num_tokens": 266351951.0, "step": 25690 }, { "entropy": 0.7016429781913758, "epoch": 0.2056, "grad_norm": 5.0944414138793945, "learning_rate": 3.973629451780713e-05, "loss": 0.6896, "mean_token_accuracy": 0.8184211254119873, "num_tokens": 266384912.0, "step": 25700 }, { "entropy": 0.6700229406356811, "epoch": 0.20568, "grad_norm": 2.152367353439331, "learning_rate": 3.9732292917166866e-05, "loss": 0.6646, "mean_token_accuracy": 0.7895884215831757, "num_tokens": 266548752.0, "step": 25710 }, { "entropy": 0.6537436842918396, "epoch": 0.20576, "grad_norm": 3.3976316452026367, "learning_rate": 3.972829131652661e-05, "loss": 0.648, "mean_token_accuracy": 0.8109478056430817, "num_tokens": 266629106.0, "step": 25720 }, { "entropy": 0.7735417068004609, "epoch": 0.20584, "grad_norm": 1.5225986242294312, "learning_rate": 3.972428971588636e-05, "loss": 0.7809, "mean_token_accuracy": 0.7783564865589142, "num_tokens": 266723623.0, "step": 25730 }, { "entropy": 0.647225159406662, "epoch": 0.20592, "grad_norm": 2.6532981395721436, "learning_rate": 3.9720288115246104e-05, "loss": 0.6407, "mean_token_accuracy": 0.8022166609764099, "num_tokens": 266860841.0, "step": 25740 }, { "entropy": 0.7222413450479508, "epoch": 0.206, "grad_norm": 5.191136360168457, "learning_rate": 3.971628651460584e-05, "loss": 0.7322, "mean_token_accuracy": 0.8067708432674408, "num_tokens": 266894489.0, "step": 25750 }, { "entropy": 0.6292054831981659, "epoch": 0.20608, "grad_norm": 2.5977447032928467, "learning_rate": 3.9712284913965584e-05, "loss": 0.6251, "mean_token_accuracy": 0.7996458113193512, "num_tokens": 267058329.0, "step": 25760 }, { "entropy": 0.6201052397489548, "epoch": 0.20616, "grad_norm": 3.533583641052246, "learning_rate": 3.9708283313325335e-05, "loss": 0.6179, "mean_token_accuracy": 0.8179591119289398, "num_tokens": 267153654.0, "step": 25770 }, { "entropy": 0.7457313716411591, "epoch": 0.20624, "grad_norm": 1.724035620689392, "learning_rate": 3.970428171268508e-05, "loss": 0.7471, "mean_token_accuracy": 0.7918721139431, "num_tokens": 267246440.0, "step": 25780 }, { "entropy": 0.6836652338504792, "epoch": 0.20632, "grad_norm": 2.6119232177734375, "learning_rate": 3.9700280112044816e-05, "loss": 0.6675, "mean_token_accuracy": 0.8006380021572113, "num_tokens": 267367650.0, "step": 25790 }, { "entropy": 0.7106513261795044, "epoch": 0.2064, "grad_norm": 4.41978645324707, "learning_rate": 3.969627851140456e-05, "loss": 0.7248, "mean_token_accuracy": 0.8094318747520447, "num_tokens": 267404063.0, "step": 25800 }, { "entropy": 0.6330256283283233, "epoch": 0.20648, "grad_norm": 1.5059890747070312, "learning_rate": 3.969227691076431e-05, "loss": 0.6378, "mean_token_accuracy": 0.7961590230464936, "num_tokens": 267567903.0, "step": 25810 }, { "entropy": 0.6832630157470703, "epoch": 0.20656, "grad_norm": 2.939708709716797, "learning_rate": 3.9688275310124053e-05, "loss": 0.6738, "mean_token_accuracy": 0.8021480679512024, "num_tokens": 267664162.0, "step": 25820 }, { "entropy": 0.7124542653560638, "epoch": 0.20664, "grad_norm": 1.487086534500122, "learning_rate": 3.968427370948379e-05, "loss": 0.7096, "mean_token_accuracy": 0.8004218876361847, "num_tokens": 267758246.0, "step": 25830 }, { "entropy": 0.6713570564985275, "epoch": 0.20672, "grad_norm": 2.7082290649414062, "learning_rate": 3.968027210884354e-05, "loss": 0.6711, "mean_token_accuracy": 0.7955439150333404, "num_tokens": 267892226.0, "step": 25840 }, { "entropy": 0.7869588673114777, "epoch": 0.2068, "grad_norm": 4.685821056365967, "learning_rate": 3.9676270508203285e-05, "loss": 0.7792, "mean_token_accuracy": 0.8008663296699524, "num_tokens": 267928545.0, "step": 25850 }, { "entropy": 0.6420732736587524, "epoch": 0.20688, "grad_norm": 2.241485118865967, "learning_rate": 3.967226890756303e-05, "loss": 0.6403, "mean_token_accuracy": 0.7960246801376343, "num_tokens": 268092385.0, "step": 25860 }, { "entropy": 0.7171670317649841, "epoch": 0.20696, "grad_norm": 3.3935296535491943, "learning_rate": 3.9668267306922765e-05, "loss": 0.7079, "mean_token_accuracy": 0.798249113559723, "num_tokens": 268184539.0, "step": 25870 }, { "entropy": 0.6949678122997284, "epoch": 0.20704, "grad_norm": 1.7848869562149048, "learning_rate": 3.9664265706282516e-05, "loss": 0.6846, "mean_token_accuracy": 0.8056893348693848, "num_tokens": 268279875.0, "step": 25880 }, { "entropy": 0.6965011656284332, "epoch": 0.20712, "grad_norm": 2.7146811485290527, "learning_rate": 3.966026410564226e-05, "loss": 0.696, "mean_token_accuracy": 0.7909701466560364, "num_tokens": 268399484.0, "step": 25890 }, { "entropy": 0.7024367451667786, "epoch": 0.2072, "grad_norm": 4.395477294921875, "learning_rate": 3.9656262505002e-05, "loss": 0.7049, "mean_token_accuracy": 0.815142822265625, "num_tokens": 268433374.0, "step": 25900 }, { "entropy": 0.6783629477024078, "epoch": 0.20728, "grad_norm": 1.4107352495193481, "learning_rate": 3.965226090436175e-05, "loss": 0.6775, "mean_token_accuracy": 0.7863031327724457, "num_tokens": 268597214.0, "step": 25910 }, { "entropy": 0.6579757660627366, "epoch": 0.20736, "grad_norm": 4.311005592346191, "learning_rate": 3.964825930372149e-05, "loss": 0.6536, "mean_token_accuracy": 0.8093653500080109, "num_tokens": 268685668.0, "step": 25920 }, { "entropy": 0.7030425012111664, "epoch": 0.20744, "grad_norm": 1.8334481716156006, "learning_rate": 3.9644257703081234e-05, "loss": 0.6941, "mean_token_accuracy": 0.8005288183689118, "num_tokens": 268779738.0, "step": 25930 }, { "entropy": 0.722130399942398, "epoch": 0.20752, "grad_norm": 2.1225473880767822, "learning_rate": 3.964025610244098e-05, "loss": 0.7152, "mean_token_accuracy": 0.7823333024978638, "num_tokens": 268925034.0, "step": 25940 }, { "entropy": 0.693048632144928, "epoch": 0.2076, "grad_norm": 5.777894973754883, "learning_rate": 3.963625450180072e-05, "loss": 0.6847, "mean_token_accuracy": 0.8135523796081543, "num_tokens": 268965328.0, "step": 25950 }, { "entropy": 0.6365913331508637, "epoch": 0.20768, "grad_norm": 1.6648709774017334, "learning_rate": 3.9632252901160465e-05, "loss": 0.6395, "mean_token_accuracy": 0.7927760124206543, "num_tokens": 269129168.0, "step": 25960 }, { "entropy": 0.6758990168571473, "epoch": 0.20776, "grad_norm": 3.357668876647949, "learning_rate": 3.962825130052021e-05, "loss": 0.6634, "mean_token_accuracy": 0.8104947626590728, "num_tokens": 269215545.0, "step": 25970 }, { "entropy": 0.6902761280536651, "epoch": 0.20784, "grad_norm": 1.38667631149292, "learning_rate": 3.962424969987995e-05, "loss": 0.7022, "mean_token_accuracy": 0.8008007943630219, "num_tokens": 269309791.0, "step": 25980 }, { "entropy": 0.6504361093044281, "epoch": 0.20792, "grad_norm": 2.254383087158203, "learning_rate": 3.9620248099239697e-05, "loss": 0.6556, "mean_token_accuracy": 0.7984991133213043, "num_tokens": 269446638.0, "step": 25990 }, { "entropy": 0.630236029624939, "epoch": 0.208, "grad_norm": 4.3770833015441895, "learning_rate": 3.961624649859944e-05, "loss": 0.6198, "mean_token_accuracy": 0.8360102415084839, "num_tokens": 269482906.0, "step": 26000 }, { "entropy": 0.6380090475082397, "epoch": 0.20808, "grad_norm": 1.5954519510269165, "learning_rate": 3.9612244897959184e-05, "loss": 0.635, "mean_token_accuracy": 0.8004274547100068, "num_tokens": 269646746.0, "step": 26010 }, { "entropy": 0.6927581578493118, "epoch": 0.20816, "grad_norm": 3.7970380783081055, "learning_rate": 3.960824329731893e-05, "loss": 0.6899, "mean_token_accuracy": 0.8041170477867127, "num_tokens": 269733465.0, "step": 26020 }, { "entropy": 0.6999450385570526, "epoch": 0.20824, "grad_norm": 1.9999412298202515, "learning_rate": 3.960424169667867e-05, "loss": 0.7067, "mean_token_accuracy": 0.8002549648284912, "num_tokens": 269827978.0, "step": 26030 }, { "entropy": 0.7252226531505584, "epoch": 0.20832, "grad_norm": 2.864431381225586, "learning_rate": 3.9600240096038415e-05, "loss": 0.7262, "mean_token_accuracy": 0.781513899564743, "num_tokens": 269971305.0, "step": 26040 }, { "entropy": 0.6789401769638062, "epoch": 0.2084, "grad_norm": 5.258605480194092, "learning_rate": 3.9596238495398166e-05, "loss": 0.6687, "mean_token_accuracy": 0.8194678425788879, "num_tokens": 270008902.0, "step": 26050 }, { "entropy": 0.6854479134082794, "epoch": 0.20848, "grad_norm": 1.745119571685791, "learning_rate": 3.95922368947579e-05, "loss": 0.6846, "mean_token_accuracy": 0.7881106555461883, "num_tokens": 270172742.0, "step": 26060 }, { "entropy": 0.6992386698722839, "epoch": 0.20856, "grad_norm": 3.237032413482666, "learning_rate": 3.9588235294117646e-05, "loss": 0.6959, "mean_token_accuracy": 0.8030310153961182, "num_tokens": 270258215.0, "step": 26070 }, { "entropy": 0.7204101145267486, "epoch": 0.20864, "grad_norm": 2.1251330375671387, "learning_rate": 3.958423369347739e-05, "loss": 0.7345, "mean_token_accuracy": 0.7900099933147431, "num_tokens": 270352569.0, "step": 26080 }, { "entropy": 0.7382087707519531, "epoch": 0.20872, "grad_norm": 3.3145933151245117, "learning_rate": 3.958023209283714e-05, "loss": 0.7263, "mean_token_accuracy": 0.7820479929447174, "num_tokens": 270499516.0, "step": 26090 }, { "entropy": 0.7145756304264068, "epoch": 0.2088, "grad_norm": 4.214910507202148, "learning_rate": 3.957623049219688e-05, "loss": 0.7189, "mean_token_accuracy": 0.8066617131233216, "num_tokens": 270543457.0, "step": 26100 }, { "entropy": 0.7175606966018677, "epoch": 0.20888, "grad_norm": 1.7091223001480103, "learning_rate": 3.957222889155662e-05, "loss": 0.7135, "mean_token_accuracy": 0.7782791972160339, "num_tokens": 270707297.0, "step": 26110 }, { "entropy": 0.7027863323688507, "epoch": 0.20896, "grad_norm": 3.2738966941833496, "learning_rate": 3.956822729091637e-05, "loss": 0.6959, "mean_token_accuracy": 0.7946766495704651, "num_tokens": 270798528.0, "step": 26120 }, { "entropy": 0.6802214920520783, "epoch": 0.20904, "grad_norm": 2.039320468902588, "learning_rate": 3.9564225690276115e-05, "loss": 0.6715, "mean_token_accuracy": 0.8059803545475006, "num_tokens": 270892104.0, "step": 26130 }, { "entropy": 0.6946530938148499, "epoch": 0.20912, "grad_norm": 2.304502487182617, "learning_rate": 3.956022408963585e-05, "loss": 0.6886, "mean_token_accuracy": 0.7891117334365845, "num_tokens": 271032851.0, "step": 26140 }, { "entropy": 0.7177651405334473, "epoch": 0.2092, "grad_norm": 4.4216437339782715, "learning_rate": 3.9556222488995596e-05, "loss": 0.7104, "mean_token_accuracy": 0.809225070476532, "num_tokens": 271076787.0, "step": 26150 }, { "entropy": 0.666985034942627, "epoch": 0.20928, "grad_norm": 1.5103905200958252, "learning_rate": 3.9552220888355346e-05, "loss": 0.6651, "mean_token_accuracy": 0.7904433369636535, "num_tokens": 271240627.0, "step": 26160 }, { "entropy": 0.6754383236169815, "epoch": 0.20936, "grad_norm": 3.0380916595458984, "learning_rate": 3.954821928771509e-05, "loss": 0.6762, "mean_token_accuracy": 0.8063707768917083, "num_tokens": 271323957.0, "step": 26170 }, { "entropy": 0.75105060338974, "epoch": 0.20944, "grad_norm": 1.5062624216079712, "learning_rate": 3.954421768707483e-05, "loss": 0.74, "mean_token_accuracy": 0.7931781888008118, "num_tokens": 271416905.0, "step": 26180 }, { "entropy": 0.7093431115150451, "epoch": 0.20952, "grad_norm": 3.3899474143981934, "learning_rate": 3.954021608643458e-05, "loss": 0.7133, "mean_token_accuracy": 0.7841263949871063, "num_tokens": 271549986.0, "step": 26190 }, { "entropy": 0.6950425148010254, "epoch": 0.2096, "grad_norm": 3.9819281101226807, "learning_rate": 3.953621448579432e-05, "loss": 0.687, "mean_token_accuracy": 0.8136707246303558, "num_tokens": 271586858.0, "step": 26200 }, { "entropy": 0.6679485380649567, "epoch": 0.20968, "grad_norm": 2.211413860321045, "learning_rate": 3.9532212885154065e-05, "loss": 0.6637, "mean_token_accuracy": 0.7923119306564331, "num_tokens": 271750698.0, "step": 26210 }, { "entropy": 0.6461467415094375, "epoch": 0.20976, "grad_norm": 3.731077194213867, "learning_rate": 3.95282112845138e-05, "loss": 0.6406, "mean_token_accuracy": 0.8158885538578033, "num_tokens": 271834924.0, "step": 26220 }, { "entropy": 0.7313366830348969, "epoch": 0.20984, "grad_norm": 1.4892152547836304, "learning_rate": 3.952420968387355e-05, "loss": 0.7294, "mean_token_accuracy": 0.7963824331760406, "num_tokens": 271926700.0, "step": 26230 }, { "entropy": 0.765681391954422, "epoch": 0.20992, "grad_norm": 2.564143419265747, "learning_rate": 3.9520208083233296e-05, "loss": 0.7706, "mean_token_accuracy": 0.7720565080642701, "num_tokens": 272067648.0, "step": 26240 }, { "entropy": 0.691275817155838, "epoch": 0.21, "grad_norm": 4.911830425262451, "learning_rate": 3.951620648259304e-05, "loss": 0.6836, "mean_token_accuracy": 0.8140800058841705, "num_tokens": 272104387.0, "step": 26250 }, { "entropy": 0.6886591851711273, "epoch": 0.21008, "grad_norm": 2.141245126724243, "learning_rate": 3.9512204881952784e-05, "loss": 0.6913, "mean_token_accuracy": 0.7885330557823181, "num_tokens": 272265141.0, "step": 26260 }, { "entropy": 0.6511899322271347, "epoch": 0.21016, "grad_norm": 3.6745266914367676, "learning_rate": 3.950820328131253e-05, "loss": 0.6395, "mean_token_accuracy": 0.8190278112888336, "num_tokens": 272340256.0, "step": 26270 }, { "entropy": 0.7316523194313049, "epoch": 0.21024, "grad_norm": 1.8853168487548828, "learning_rate": 3.950420168067227e-05, "loss": 0.7254, "mean_token_accuracy": 0.7916485667228699, "num_tokens": 272433589.0, "step": 26280 }, { "entropy": 0.7071425676345825, "epoch": 0.21032, "grad_norm": 2.568312883377075, "learning_rate": 3.9500200080032015e-05, "loss": 0.7037, "mean_token_accuracy": 0.7855027854442597, "num_tokens": 272577269.0, "step": 26290 }, { "entropy": 0.6843074768781662, "epoch": 0.2104, "grad_norm": 3.9397666454315186, "learning_rate": 3.949619847939176e-05, "loss": 0.6775, "mean_token_accuracy": 0.8210567712783814, "num_tokens": 272617277.0, "step": 26300 }, { "entropy": 0.7036170363426208, "epoch": 0.21048, "grad_norm": 1.8613879680633545, "learning_rate": 3.94921968787515e-05, "loss": 0.6949, "mean_token_accuracy": 0.783582079410553, "num_tokens": 272780724.0, "step": 26310 }, { "entropy": 0.6297882437705994, "epoch": 0.21056, "grad_norm": 3.373650074005127, "learning_rate": 3.9488195278111246e-05, "loss": 0.6323, "mean_token_accuracy": 0.8168894052505493, "num_tokens": 272856028.0, "step": 26320 }, { "entropy": 0.6731867671012879, "epoch": 0.21064, "grad_norm": 1.4529075622558594, "learning_rate": 3.948419367747099e-05, "loss": 0.6805, "mean_token_accuracy": 0.8037088513374329, "num_tokens": 272949597.0, "step": 26330 }, { "entropy": 0.6943272531032563, "epoch": 0.21072, "grad_norm": 2.4284157752990723, "learning_rate": 3.948019207683073e-05, "loss": 0.679, "mean_token_accuracy": 0.7939604341983795, "num_tokens": 273085752.0, "step": 26340 }, { "entropy": 0.6834249258041382, "epoch": 0.2108, "grad_norm": 4.943732261657715, "learning_rate": 3.947619047619048e-05, "loss": 0.6875, "mean_token_accuracy": 0.8124493181705474, "num_tokens": 273122045.0, "step": 26350 }, { "entropy": 0.6598065793514252, "epoch": 0.21088, "grad_norm": 1.8989732265472412, "learning_rate": 3.947218887555022e-05, "loss": 0.6626, "mean_token_accuracy": 0.7930508077144622, "num_tokens": 273285885.0, "step": 26360 }, { "entropy": 0.7123493015766144, "epoch": 0.21096, "grad_norm": 3.594951629638672, "learning_rate": 3.946818727490997e-05, "loss": 0.7026, "mean_token_accuracy": 0.8026896357536316, "num_tokens": 273373460.0, "step": 26370 }, { "entropy": 0.6849965333938599, "epoch": 0.21104, "grad_norm": 1.7927830219268799, "learning_rate": 3.946418567426971e-05, "loss": 0.6932, "mean_token_accuracy": 0.7998254656791687, "num_tokens": 273466576.0, "step": 26380 }, { "entropy": 0.6913886845111847, "epoch": 0.21112, "grad_norm": 2.8182566165924072, "learning_rate": 3.946018407362945e-05, "loss": 0.6764, "mean_token_accuracy": 0.7930060863494873, "num_tokens": 273607327.0, "step": 26390 }, { "entropy": 0.7409124970436096, "epoch": 0.2112, "grad_norm": 4.383538722991943, "learning_rate": 3.9456182472989196e-05, "loss": 0.7399, "mean_token_accuracy": 0.8110754013061523, "num_tokens": 273645150.0, "step": 26400 }, { "entropy": 0.6302568197250367, "epoch": 0.21128, "grad_norm": 1.8377089500427246, "learning_rate": 3.9452180872348946e-05, "loss": 0.6291, "mean_token_accuracy": 0.7987710654735565, "num_tokens": 273807688.0, "step": 26410 }, { "entropy": 0.7213980793952942, "epoch": 0.21136, "grad_norm": 3.0453550815582275, "learning_rate": 3.944817927170868e-05, "loss": 0.7158, "mean_token_accuracy": 0.794190239906311, "num_tokens": 273884530.0, "step": 26420 }, { "entropy": 0.6111736238002777, "epoch": 0.21144, "grad_norm": 2.218510627746582, "learning_rate": 3.944417767106843e-05, "loss": 0.6084, "mean_token_accuracy": 0.8203320980072022, "num_tokens": 273978094.0, "step": 26430 }, { "entropy": 0.7097018182277679, "epoch": 0.21152, "grad_norm": 2.4533002376556396, "learning_rate": 3.944017607042818e-05, "loss": 0.699, "mean_token_accuracy": 0.7913313388824463, "num_tokens": 274101038.0, "step": 26440 }, { "entropy": 0.7417670071125031, "epoch": 0.2116, "grad_norm": 4.530383110046387, "learning_rate": 3.943617446978792e-05, "loss": 0.7297, "mean_token_accuracy": 0.8084664702415466, "num_tokens": 274133927.0, "step": 26450 }, { "entropy": 0.606174635887146, "epoch": 0.21168, "grad_norm": 1.4896551370620728, "learning_rate": 3.943217286914766e-05, "loss": 0.6075, "mean_token_accuracy": 0.8075415313243866, "num_tokens": 274297767.0, "step": 26460 }, { "entropy": 0.7162563920021057, "epoch": 0.21176, "grad_norm": 3.2835073471069336, "learning_rate": 3.94281712685074e-05, "loss": 0.7061, "mean_token_accuracy": 0.801158320903778, "num_tokens": 274385728.0, "step": 26470 }, { "entropy": 0.7585044384002686, "epoch": 0.21184, "grad_norm": 2.4984867572784424, "learning_rate": 3.942416966786715e-05, "loss": 0.758, "mean_token_accuracy": 0.7864907503128051, "num_tokens": 274480154.0, "step": 26480 }, { "entropy": 0.7435497522354126, "epoch": 0.21192, "grad_norm": 2.69217848777771, "learning_rate": 3.9420168067226896e-05, "loss": 0.7389, "mean_token_accuracy": 0.7778694689273834, "num_tokens": 274619272.0, "step": 26490 }, { "entropy": 0.6743844509124756, "epoch": 0.212, "grad_norm": 4.600988388061523, "learning_rate": 3.941616646658663e-05, "loss": 0.6798, "mean_token_accuracy": 0.8181379020214081, "num_tokens": 274661451.0, "step": 26500 }, { "entropy": 0.6841980218887329, "epoch": 0.21208, "grad_norm": 2.0520150661468506, "learning_rate": 3.941216486594638e-05, "loss": 0.6876, "mean_token_accuracy": 0.7886174976825714, "num_tokens": 274825291.0, "step": 26510 }, { "entropy": 0.7343377947807312, "epoch": 0.21216, "grad_norm": 3.235459327697754, "learning_rate": 3.940816326530613e-05, "loss": 0.7153, "mean_token_accuracy": 0.7962718725204467, "num_tokens": 274906322.0, "step": 26520 }, { "entropy": 0.6895760774612427, "epoch": 0.21224, "grad_norm": 2.095125913619995, "learning_rate": 3.940416166466587e-05, "loss": 0.6899, "mean_token_accuracy": 0.7993171513080597, "num_tokens": 274998976.0, "step": 26530 }, { "entropy": 0.6969447374343872, "epoch": 0.21232, "grad_norm": 3.627067804336548, "learning_rate": 3.940016006402561e-05, "loss": 0.693, "mean_token_accuracy": 0.787922066450119, "num_tokens": 275144501.0, "step": 26540 }, { "entropy": 0.7700412839651107, "epoch": 0.2124, "grad_norm": 5.38113260269165, "learning_rate": 3.939615846338536e-05, "loss": 0.7906, "mean_token_accuracy": 0.7963928997516632, "num_tokens": 275185477.0, "step": 26550 }, { "entropy": 0.6536677181720734, "epoch": 0.21248, "grad_norm": 1.6796905994415283, "learning_rate": 3.93921568627451e-05, "loss": 0.646, "mean_token_accuracy": 0.7955233097076416, "num_tokens": 275348259.0, "step": 26560 }, { "entropy": 0.6531887650489807, "epoch": 0.21256, "grad_norm": 2.8774776458740234, "learning_rate": 3.9388155262104846e-05, "loss": 0.6403, "mean_token_accuracy": 0.8168245315551758, "num_tokens": 275420171.0, "step": 26570 }, { "entropy": 0.6630780875682831, "epoch": 0.21264, "grad_norm": 1.7772128582000732, "learning_rate": 3.938415366146459e-05, "loss": 0.6869, "mean_token_accuracy": 0.8023043870925903, "num_tokens": 275512988.0, "step": 26580 }, { "entropy": 0.6576256871223449, "epoch": 0.21272, "grad_norm": 3.10561466217041, "learning_rate": 3.938015206082433e-05, "loss": 0.6311, "mean_token_accuracy": 0.8061201989650726, "num_tokens": 275652083.0, "step": 26590 }, { "entropy": 0.6659539967775345, "epoch": 0.2128, "grad_norm": 4.093822002410889, "learning_rate": 3.937615046018408e-05, "loss": 0.6884, "mean_token_accuracy": 0.8193136811256408, "num_tokens": 275685586.0, "step": 26600 }, { "entropy": 0.6245937824249268, "epoch": 0.21288, "grad_norm": 1.4920238256454468, "learning_rate": 3.937214885954382e-05, "loss": 0.6367, "mean_token_accuracy": 0.7962445020675659, "num_tokens": 275849426.0, "step": 26610 }, { "entropy": 0.7087166368961334, "epoch": 0.21296, "grad_norm": 2.770886182785034, "learning_rate": 3.9368147258903564e-05, "loss": 0.6902, "mean_token_accuracy": 0.8022646486759186, "num_tokens": 275948088.0, "step": 26620 }, { "entropy": 0.7227681994438171, "epoch": 0.21304, "grad_norm": 2.052422285079956, "learning_rate": 3.936414565826331e-05, "loss": 0.7159, "mean_token_accuracy": 0.7953931868076325, "num_tokens": 276042922.0, "step": 26630 }, { "entropy": 0.735739278793335, "epoch": 0.21312, "grad_norm": 2.3534975051879883, "learning_rate": 3.936014405762305e-05, "loss": 0.7376, "mean_token_accuracy": 0.7816283583641053, "num_tokens": 276177884.0, "step": 26640 }, { "entropy": 0.7065069943666458, "epoch": 0.2132, "grad_norm": 5.069843769073486, "learning_rate": 3.9356142456982795e-05, "loss": 0.709, "mean_token_accuracy": 0.811664241552353, "num_tokens": 276212804.0, "step": 26650 }, { "entropy": 0.6777209401130676, "epoch": 0.21328, "grad_norm": 1.8771620988845825, "learning_rate": 3.935214085634254e-05, "loss": 0.6697, "mean_token_accuracy": 0.7924254715442658, "num_tokens": 276376535.0, "step": 26660 }, { "entropy": 0.6950774222612381, "epoch": 0.21336, "grad_norm": 2.847191095352173, "learning_rate": 3.934813925570228e-05, "loss": 0.6917, "mean_token_accuracy": 0.8029844284057617, "num_tokens": 276459759.0, "step": 26670 }, { "entropy": 0.7008790254592896, "epoch": 0.21344, "grad_norm": 1.5657349824905396, "learning_rate": 3.9344137655062026e-05, "loss": 0.6956, "mean_token_accuracy": 0.8025653600692749, "num_tokens": 276551428.0, "step": 26680 }, { "entropy": 0.7181881010532379, "epoch": 0.21352, "grad_norm": 2.5802321434020996, "learning_rate": 3.934013605442177e-05, "loss": 0.7031, "mean_token_accuracy": 0.7860591471195221, "num_tokens": 276688652.0, "step": 26690 }, { "entropy": 0.6714756488800049, "epoch": 0.2136, "grad_norm": 5.720299243927002, "learning_rate": 3.9336134453781514e-05, "loss": 0.6668, "mean_token_accuracy": 0.8211300253868103, "num_tokens": 276726097.0, "step": 26700 }, { "entropy": 0.6713546991348267, "epoch": 0.21368, "grad_norm": 1.7779419422149658, "learning_rate": 3.933213285314126e-05, "loss": 0.675, "mean_token_accuracy": 0.791530293226242, "num_tokens": 276889937.0, "step": 26710 }, { "entropy": 0.649800181388855, "epoch": 0.21376, "grad_norm": 3.0792391300201416, "learning_rate": 3.932813125250101e-05, "loss": 0.6386, "mean_token_accuracy": 0.81081303358078, "num_tokens": 276976745.0, "step": 26720 }, { "entropy": 0.6916120290756226, "epoch": 0.21384, "grad_norm": 2.2696468830108643, "learning_rate": 3.9324129651860745e-05, "loss": 0.6988, "mean_token_accuracy": 0.8018103897571563, "num_tokens": 277070572.0, "step": 26730 }, { "entropy": 0.7043094038963318, "epoch": 0.21392, "grad_norm": 3.382568597793579, "learning_rate": 3.932012805122049e-05, "loss": 0.6909, "mean_token_accuracy": 0.7886595726013184, "num_tokens": 277213859.0, "step": 26740 }, { "entropy": 0.6771474391222, "epoch": 0.214, "grad_norm": 4.614317893981934, "learning_rate": 3.931612645058023e-05, "loss": 0.6845, "mean_token_accuracy": 0.812552934885025, "num_tokens": 277254964.0, "step": 26750 }, { "entropy": 0.6646194815635681, "epoch": 0.21408, "grad_norm": 2.2541489601135254, "learning_rate": 3.931212484993998e-05, "loss": 0.6667, "mean_token_accuracy": 0.7883780658245086, "num_tokens": 277418074.0, "step": 26760 }, { "entropy": 0.702345323562622, "epoch": 0.21416, "grad_norm": 3.4752120971679688, "learning_rate": 3.930812324929972e-05, "loss": 0.6898, "mean_token_accuracy": 0.8008921325206757, "num_tokens": 277497997.0, "step": 26770 }, { "entropy": 0.7676267385482788, "epoch": 0.21424, "grad_norm": 2.212143659591675, "learning_rate": 3.9304121648659464e-05, "loss": 0.7718, "mean_token_accuracy": 0.7810831725597381, "num_tokens": 277590927.0, "step": 26780 }, { "entropy": 0.709066778421402, "epoch": 0.21432, "grad_norm": 3.486746311187744, "learning_rate": 3.9300120048019214e-05, "loss": 0.7076, "mean_token_accuracy": 0.7861394882202148, "num_tokens": 277728995.0, "step": 26790 }, { "entropy": 0.7705330073833465, "epoch": 0.2144, "grad_norm": 4.9169511795043945, "learning_rate": 3.929611844737896e-05, "loss": 0.7747, "mean_token_accuracy": 0.7963698148727417, "num_tokens": 277769581.0, "step": 26800 }, { "entropy": 0.7183443665504455, "epoch": 0.21448, "grad_norm": 2.3116800785064697, "learning_rate": 3.9292116846738695e-05, "loss": 0.7186, "mean_token_accuracy": 0.7833694875240326, "num_tokens": 277932505.0, "step": 26810 }, { "entropy": 0.757407808303833, "epoch": 0.21456, "grad_norm": 3.4535176753997803, "learning_rate": 3.928811524609844e-05, "loss": 0.7514, "mean_token_accuracy": 0.7914154052734375, "num_tokens": 278012447.0, "step": 26820 }, { "entropy": 0.6741983890533447, "epoch": 0.21464, "grad_norm": 1.400278091430664, "learning_rate": 3.928411364545819e-05, "loss": 0.6664, "mean_token_accuracy": 0.8059969186782837, "num_tokens": 278107128.0, "step": 26830 }, { "entropy": 0.7134167850017548, "epoch": 0.21472, "grad_norm": 2.356175184249878, "learning_rate": 3.928011204481793e-05, "loss": 0.7151, "mean_token_accuracy": 0.7812339186668396, "num_tokens": 278252198.0, "step": 26840 }, { "entropy": 0.7147589445114135, "epoch": 0.2148, "grad_norm": 4.402573585510254, "learning_rate": 3.927611044417767e-05, "loss": 0.7133, "mean_token_accuracy": 0.8083986282348633, "num_tokens": 278287085.0, "step": 26850 }, { "entropy": 0.7340508818626403, "epoch": 0.21488, "grad_norm": 1.8175737857818604, "learning_rate": 3.927210884353742e-05, "loss": 0.7343, "mean_token_accuracy": 0.7712078750133514, "num_tokens": 278450925.0, "step": 26860 }, { "entropy": 0.7594166338443756, "epoch": 0.21496, "grad_norm": 3.4062585830688477, "learning_rate": 3.9268107242897164e-05, "loss": 0.7447, "mean_token_accuracy": 0.7899671971797944, "num_tokens": 278531724.0, "step": 26870 }, { "entropy": 0.7379371523857117, "epoch": 0.21504, "grad_norm": 2.244649887084961, "learning_rate": 3.926410564225691e-05, "loss": 0.718, "mean_token_accuracy": 0.7938495695590972, "num_tokens": 278624678.0, "step": 26880 }, { "entropy": 0.6961973309516907, "epoch": 0.21512, "grad_norm": 2.165696382522583, "learning_rate": 3.9260104041616644e-05, "loss": 0.7096, "mean_token_accuracy": 0.7846891820430756, "num_tokens": 278766713.0, "step": 26890 }, { "entropy": 0.6071455538272857, "epoch": 0.2152, "grad_norm": 5.626785755157471, "learning_rate": 3.9256102440976395e-05, "loss": 0.5929, "mean_token_accuracy": 0.8372160851955414, "num_tokens": 278808456.0, "step": 26900 }, { "entropy": 0.6814864993095398, "epoch": 0.21528, "grad_norm": 2.1963186264038086, "learning_rate": 3.925210084033614e-05, "loss": 0.6841, "mean_token_accuracy": 0.784300422668457, "num_tokens": 278971100.0, "step": 26910 }, { "entropy": 0.6092913091182709, "epoch": 0.21536, "grad_norm": 3.655984878540039, "learning_rate": 3.924809923969588e-05, "loss": 0.607, "mean_token_accuracy": 0.8265171051025391, "num_tokens": 279036526.0, "step": 26920 }, { "entropy": 0.6879002273082733, "epoch": 0.21544, "grad_norm": 2.523503303527832, "learning_rate": 3.924409763905562e-05, "loss": 0.6853, "mean_token_accuracy": 0.805427098274231, "num_tokens": 279128697.0, "step": 26930 }, { "entropy": 0.6993666827678681, "epoch": 0.21552, "grad_norm": 2.2285380363464355, "learning_rate": 3.924009603841537e-05, "loss": 0.7049, "mean_token_accuracy": 0.7848437011241913, "num_tokens": 279263725.0, "step": 26940 }, { "entropy": 0.7055212974548339, "epoch": 0.2156, "grad_norm": 4.814144611358643, "learning_rate": 3.9236094437775113e-05, "loss": 0.7044, "mean_token_accuracy": 0.8093373477458954, "num_tokens": 279301184.0, "step": 26950 }, { "entropy": 0.723582512140274, "epoch": 0.21568, "grad_norm": 1.5361173152923584, "learning_rate": 3.923209283713486e-05, "loss": 0.7227, "mean_token_accuracy": 0.7817680597305298, "num_tokens": 279464222.0, "step": 26960 }, { "entropy": 0.7417714953422546, "epoch": 0.21576, "grad_norm": 3.661759853363037, "learning_rate": 3.92280912364946e-05, "loss": 0.7453, "mean_token_accuracy": 0.7923592686653137, "num_tokens": 279532585.0, "step": 26970 }, { "entropy": 0.7588687002658844, "epoch": 0.21584, "grad_norm": 1.9237674474716187, "learning_rate": 3.9224089635854345e-05, "loss": 0.7625, "mean_token_accuracy": 0.791006326675415, "num_tokens": 279624122.0, "step": 26980 }, { "entropy": 0.6783428609371185, "epoch": 0.21592, "grad_norm": 2.6948225498199463, "learning_rate": 3.922008803521409e-05, "loss": 0.6789, "mean_token_accuracy": 0.7891014695167542, "num_tokens": 279770182.0, "step": 26990 }, { "entropy": 0.6541492581367493, "epoch": 0.216, "grad_norm": 6.286215305328369, "learning_rate": 3.921608643457383e-05, "loss": 0.6464, "mean_token_accuracy": 0.8196382164955139, "num_tokens": 279811329.0, "step": 27000 }, { "entropy": 0.6707334876060486, "epoch": 0.21608, "grad_norm": 2.6269257068634033, "learning_rate": 3.9212084833933576e-05, "loss": 0.6687, "mean_token_accuracy": 0.7893298447132111, "num_tokens": 279974002.0, "step": 27010 }, { "entropy": 0.6455707520246505, "epoch": 0.21616, "grad_norm": 3.7028324604034424, "learning_rate": 3.920808323329332e-05, "loss": 0.6437, "mean_token_accuracy": 0.812095433473587, "num_tokens": 280049622.0, "step": 27020 }, { "entropy": 0.696572893857956, "epoch": 0.21624, "grad_norm": 1.979952096939087, "learning_rate": 3.920408163265306e-05, "loss": 0.6954, "mean_token_accuracy": 0.800733768939972, "num_tokens": 280142268.0, "step": 27030 }, { "entropy": 0.7548587322235107, "epoch": 0.21632, "grad_norm": 2.41125750541687, "learning_rate": 3.920008003201281e-05, "loss": 0.7342, "mean_token_accuracy": 0.7838636875152588, "num_tokens": 280273675.0, "step": 27040 }, { "entropy": 0.6525760620832444, "epoch": 0.2164, "grad_norm": 4.42751932144165, "learning_rate": 3.919607843137255e-05, "loss": 0.6527, "mean_token_accuracy": 0.8224212229251862, "num_tokens": 280308370.0, "step": 27050 }, { "entropy": 0.7358780324459075, "epoch": 0.21648, "grad_norm": 1.7061818838119507, "learning_rate": 3.9192076830732294e-05, "loss": 0.731, "mean_token_accuracy": 0.7740290820598602, "num_tokens": 280472210.0, "step": 27060 }, { "entropy": 0.6925740510225296, "epoch": 0.21656, "grad_norm": 3.5604965686798096, "learning_rate": 3.918807523009204e-05, "loss": 0.6921, "mean_token_accuracy": 0.8015562951564789, "num_tokens": 280553906.0, "step": 27070 }, { "entropy": 0.7371748685836792, "epoch": 0.21664, "grad_norm": 1.9047458171844482, "learning_rate": 3.918407362945178e-05, "loss": 0.7404, "mean_token_accuracy": 0.7920568764209748, "num_tokens": 280647038.0, "step": 27080 }, { "entropy": 0.7275823056697845, "epoch": 0.21672, "grad_norm": 2.8352715969085693, "learning_rate": 3.9180072028811525e-05, "loss": 0.7256, "mean_token_accuracy": 0.7866041183471679, "num_tokens": 280779757.0, "step": 27090 }, { "entropy": 0.6969830155372619, "epoch": 0.2168, "grad_norm": 4.842375755310059, "learning_rate": 3.917607042817127e-05, "loss": 0.6832, "mean_token_accuracy": 0.81367147564888, "num_tokens": 280819630.0, "step": 27100 }, { "entropy": 0.69041348695755, "epoch": 0.21688, "grad_norm": 1.4732410907745361, "learning_rate": 3.917206882753102e-05, "loss": 0.6876, "mean_token_accuracy": 0.7853627383708954, "num_tokens": 280983470.0, "step": 27110 }, { "entropy": 0.7300039499998092, "epoch": 0.21696, "grad_norm": 3.0227651596069336, "learning_rate": 3.9168067226890757e-05, "loss": 0.7296, "mean_token_accuracy": 0.7946610152721405, "num_tokens": 281068938.0, "step": 27120 }, { "entropy": 0.6643126964569092, "epoch": 0.21704, "grad_norm": 1.441568374633789, "learning_rate": 3.91640656262505e-05, "loss": 0.6656, "mean_token_accuracy": 0.8036152839660644, "num_tokens": 281163966.0, "step": 27130 }, { "entropy": 0.6421612501144409, "epoch": 0.21712, "grad_norm": 3.7276384830474854, "learning_rate": 3.9160064025610244e-05, "loss": 0.6387, "mean_token_accuracy": 0.8030059099197387, "num_tokens": 281294976.0, "step": 27140 }, { "entropy": 0.7121159970760346, "epoch": 0.2172, "grad_norm": 4.936465263366699, "learning_rate": 3.9156062424969994e-05, "loss": 0.7142, "mean_token_accuracy": 0.8152485311031341, "num_tokens": 281330566.0, "step": 27150 }, { "entropy": 0.6448746800422669, "epoch": 0.21728, "grad_norm": 1.4111697673797607, "learning_rate": 3.915206082432973e-05, "loss": 0.6464, "mean_token_accuracy": 0.7961345970630646, "num_tokens": 281494406.0, "step": 27160 }, { "entropy": 0.6967593640089035, "epoch": 0.21736, "grad_norm": 3.5986204147338867, "learning_rate": 3.9148059223689475e-05, "loss": 0.6895, "mean_token_accuracy": 0.8004416644573211, "num_tokens": 281592292.0, "step": 27170 }, { "entropy": 0.7719028890132904, "epoch": 0.21744, "grad_norm": 2.070828676223755, "learning_rate": 3.9144057623049226e-05, "loss": 0.7657, "mean_token_accuracy": 0.7842763900756836, "num_tokens": 281687172.0, "step": 27180 }, { "entropy": 0.6673712313175202, "epoch": 0.21752, "grad_norm": 2.0740373134613037, "learning_rate": 3.914005602240897e-05, "loss": 0.6668, "mean_token_accuracy": 0.7934641778469086, "num_tokens": 281830630.0, "step": 27190 }, { "entropy": 0.7508579820394516, "epoch": 0.2176, "grad_norm": 5.102350234985352, "learning_rate": 3.9136054421768706e-05, "loss": 0.7629, "mean_token_accuracy": 0.7969001770019531, "num_tokens": 281871867.0, "step": 27200 }, { "entropy": 0.662693852186203, "epoch": 0.21768, "grad_norm": 1.7995132207870483, "learning_rate": 3.913205282112845e-05, "loss": 0.6574, "mean_token_accuracy": 0.7937103152275086, "num_tokens": 282035707.0, "step": 27210 }, { "entropy": 0.7196696400642395, "epoch": 0.21776, "grad_norm": 3.928300380706787, "learning_rate": 3.91280512204882e-05, "loss": 0.7149, "mean_token_accuracy": 0.7931930899620057, "num_tokens": 282124105.0, "step": 27220 }, { "entropy": 0.7062586843967438, "epoch": 0.21784, "grad_norm": 1.6385482549667358, "learning_rate": 3.9124049619847944e-05, "loss": 0.693, "mean_token_accuracy": 0.80333291888237, "num_tokens": 282215840.0, "step": 27230 }, { "entropy": 0.6660923063755035, "epoch": 0.21792, "grad_norm": 2.147519588470459, "learning_rate": 3.912004801920768e-05, "loss": 0.6661, "mean_token_accuracy": 0.7952483177185059, "num_tokens": 282358344.0, "step": 27240 }, { "entropy": 0.7824674129486084, "epoch": 0.218, "grad_norm": 5.22788143157959, "learning_rate": 3.911604641856743e-05, "loss": 0.7815, "mean_token_accuracy": 0.7964759349822998, "num_tokens": 282395485.0, "step": 27250 }, { "entropy": 0.6280992865562439, "epoch": 0.21808, "grad_norm": 1.9713232517242432, "learning_rate": 3.9112044817927175e-05, "loss": 0.6321, "mean_token_accuracy": 0.797539085149765, "num_tokens": 282559325.0, "step": 27260 }, { "entropy": 0.7160340547561646, "epoch": 0.21816, "grad_norm": 4.383601188659668, "learning_rate": 3.910804321728692e-05, "loss": 0.7065, "mean_token_accuracy": 0.7986875355243683, "num_tokens": 282654183.0, "step": 27270 }, { "entropy": 0.6557425737380982, "epoch": 0.21824, "grad_norm": 1.7092492580413818, "learning_rate": 3.9104041616646656e-05, "loss": 0.6565, "mean_token_accuracy": 0.809484726190567, "num_tokens": 282748641.0, "step": 27280 }, { "entropy": 0.7180464506149292, "epoch": 0.21832, "grad_norm": 2.0008363723754883, "learning_rate": 3.9100040016006406e-05, "loss": 0.7176, "mean_token_accuracy": 0.7845998287200928, "num_tokens": 282900798.0, "step": 27290 }, { "entropy": 0.7368523716926575, "epoch": 0.2184, "grad_norm": 5.206602096557617, "learning_rate": 3.909603841536615e-05, "loss": 0.7242, "mean_token_accuracy": 0.8087309896945953, "num_tokens": 282944705.0, "step": 27300 }, { "entropy": 0.6773066699504853, "epoch": 0.21848, "grad_norm": 1.430687665939331, "learning_rate": 3.9092036814725894e-05, "loss": 0.6797, "mean_token_accuracy": 0.786138254404068, "num_tokens": 283108545.0, "step": 27310 }, { "entropy": 0.635982409119606, "epoch": 0.21856, "grad_norm": 3.105569839477539, "learning_rate": 3.908803521408564e-05, "loss": 0.6324, "mean_token_accuracy": 0.8150593042373657, "num_tokens": 283192699.0, "step": 27320 }, { "entropy": 0.731592184305191, "epoch": 0.21864, "grad_norm": 1.7311171293258667, "learning_rate": 3.908403361344538e-05, "loss": 0.7322, "mean_token_accuracy": 0.7922157406806946, "num_tokens": 283287812.0, "step": 27330 }, { "entropy": 0.7401925981044769, "epoch": 0.21872, "grad_norm": 2.7838785648345947, "learning_rate": 3.9080032012805125e-05, "loss": 0.742, "mean_token_accuracy": 0.779241007566452, "num_tokens": 283424090.0, "step": 27340 }, { "entropy": 0.7493141710758209, "epoch": 0.2188, "grad_norm": 6.290281772613525, "learning_rate": 3.907603041216487e-05, "loss": 0.7268, "mean_token_accuracy": 0.8077750265598297, "num_tokens": 283463082.0, "step": 27350 }, { "entropy": 0.6275368839502334, "epoch": 0.21888, "grad_norm": 1.7335741519927979, "learning_rate": 3.907202881152461e-05, "loss": 0.637, "mean_token_accuracy": 0.7952796816825867, "num_tokens": 283626922.0, "step": 27360 }, { "entropy": 0.708866024017334, "epoch": 0.21896, "grad_norm": 3.3992786407470703, "learning_rate": 3.9068027210884356e-05, "loss": 0.6978, "mean_token_accuracy": 0.8016093611717224, "num_tokens": 283723586.0, "step": 27370 }, { "entropy": 0.6914326310157776, "epoch": 0.21904, "grad_norm": 1.5989832878112793, "learning_rate": 3.90640256102441e-05, "loss": 0.6878, "mean_token_accuracy": 0.8033667027950286, "num_tokens": 283818488.0, "step": 27380 }, { "entropy": 0.7845169842243195, "epoch": 0.21912, "grad_norm": 3.2772469520568848, "learning_rate": 3.9060024009603844e-05, "loss": 0.8, "mean_token_accuracy": 0.7621031701564789, "num_tokens": 283964912.0, "step": 27390 }, { "entropy": 0.7235605657100678, "epoch": 0.2192, "grad_norm": 4.106330394744873, "learning_rate": 3.905602240896359e-05, "loss": 0.7051, "mean_token_accuracy": 0.8084068953990936, "num_tokens": 284009948.0, "step": 27400 }, { "entropy": 0.7073939442634583, "epoch": 0.21928, "grad_norm": 2.2011449337005615, "learning_rate": 3.905202080832333e-05, "loss": 0.7065, "mean_token_accuracy": 0.7851894855499267, "num_tokens": 284170465.0, "step": 27410 }, { "entropy": 0.7392814040184021, "epoch": 0.21936, "grad_norm": 3.422276735305786, "learning_rate": 3.9048019207683075e-05, "loss": 0.7484, "mean_token_accuracy": 0.7933563888072968, "num_tokens": 284245113.0, "step": 27420 }, { "entropy": 0.6711870610713959, "epoch": 0.21944, "grad_norm": 1.8841497898101807, "learning_rate": 3.904401760704282e-05, "loss": 0.6709, "mean_token_accuracy": 0.8047521650791168, "num_tokens": 284338568.0, "step": 27430 }, { "entropy": 0.7361063539981842, "epoch": 0.21952, "grad_norm": 2.422212839126587, "learning_rate": 3.904001600640256e-05, "loss": 0.7315, "mean_token_accuracy": 0.7805924534797668, "num_tokens": 284480607.0, "step": 27440 }, { "entropy": 0.642387467622757, "epoch": 0.2196, "grad_norm": 5.7282633781433105, "learning_rate": 3.9036014405762306e-05, "loss": 0.642, "mean_token_accuracy": 0.8254070281982422, "num_tokens": 284525259.0, "step": 27450 }, { "entropy": 0.6891042768955231, "epoch": 0.21968, "grad_norm": 2.000062942504883, "learning_rate": 3.903201280512205e-05, "loss": 0.6822, "mean_token_accuracy": 0.7889106094837188, "num_tokens": 284689099.0, "step": 27460 }, { "entropy": 0.606512475013733, "epoch": 0.21976, "grad_norm": 3.2905144691467285, "learning_rate": 3.902801120448179e-05, "loss": 0.6076, "mean_token_accuracy": 0.8184005200862885, "num_tokens": 284774630.0, "step": 27470 }, { "entropy": 0.6784806251525879, "epoch": 0.21984, "grad_norm": 1.6617649793624878, "learning_rate": 3.902400960384154e-05, "loss": 0.6775, "mean_token_accuracy": 0.8006407260894776, "num_tokens": 284868757.0, "step": 27480 }, { "entropy": 0.6767319202423095, "epoch": 0.21992, "grad_norm": 3.0667331218719482, "learning_rate": 3.902000800320128e-05, "loss": 0.6755, "mean_token_accuracy": 0.796345728635788, "num_tokens": 285005132.0, "step": 27490 }, { "entropy": 0.6643770068883896, "epoch": 0.22, "grad_norm": 4.3640313148498535, "learning_rate": 3.901600640256103e-05, "loss": 0.6534, "mean_token_accuracy": 0.8192321538925171, "num_tokens": 285049133.0, "step": 27500 }, { "entropy": 0.685813695192337, "epoch": 0.22008, "grad_norm": 2.0408613681793213, "learning_rate": 3.901200480192077e-05, "loss": 0.6871, "mean_token_accuracy": 0.7822239935398102, "num_tokens": 285212973.0, "step": 27510 }, { "entropy": 0.589213240146637, "epoch": 0.22016, "grad_norm": 3.5522654056549072, "learning_rate": 3.900800320128051e-05, "loss": 0.5863, "mean_token_accuracy": 0.8255951642990113, "num_tokens": 285304190.0, "step": 27520 }, { "entropy": 0.7352356344461441, "epoch": 0.22024, "grad_norm": 1.9129046201705933, "learning_rate": 3.9004001600640256e-05, "loss": 0.7303, "mean_token_accuracy": 0.7946894645690918, "num_tokens": 285400036.0, "step": 27530 }, { "entropy": 0.725493985414505, "epoch": 0.22032, "grad_norm": 3.245215892791748, "learning_rate": 3.9000000000000006e-05, "loss": 0.7163, "mean_token_accuracy": 0.7878298759460449, "num_tokens": 285535172.0, "step": 27540 }, { "entropy": 0.6973737478256226, "epoch": 0.2204, "grad_norm": 5.170871257781982, "learning_rate": 3.899599839935974e-05, "loss": 0.6944, "mean_token_accuracy": 0.8135204613208771, "num_tokens": 285574156.0, "step": 27550 }, { "entropy": 0.6769505143165588, "epoch": 0.22048, "grad_norm": 2.1802120208740234, "learning_rate": 3.899199679871949e-05, "loss": 0.6756, "mean_token_accuracy": 0.789106011390686, "num_tokens": 285737996.0, "step": 27560 }, { "entropy": 0.678687134385109, "epoch": 0.22056, "grad_norm": 3.015298843383789, "learning_rate": 3.898799519807924e-05, "loss": 0.6648, "mean_token_accuracy": 0.8066443145275116, "num_tokens": 285835227.0, "step": 27570 }, { "entropy": 0.6886173784732819, "epoch": 0.22064, "grad_norm": 2.8796603679656982, "learning_rate": 3.898399359743898e-05, "loss": 0.7034, "mean_token_accuracy": 0.7989058554172516, "num_tokens": 285928731.0, "step": 27580 }, { "entropy": 0.7203181028366089, "epoch": 0.22072, "grad_norm": 2.9151041507720947, "learning_rate": 3.897999199679872e-05, "loss": 0.7109, "mean_token_accuracy": 0.7884249985218048, "num_tokens": 286063176.0, "step": 27590 }, { "entropy": 0.7628344655036926, "epoch": 0.2208, "grad_norm": 4.555479526519775, "learning_rate": 3.897599039615846e-05, "loss": 0.7564, "mean_token_accuracy": 0.8025054931640625, "num_tokens": 286096757.0, "step": 27600 }, { "entropy": 0.625517749786377, "epoch": 0.22088, "grad_norm": 1.4842493534088135, "learning_rate": 3.897198879551821e-05, "loss": 0.6288, "mean_token_accuracy": 0.7986138224601745, "num_tokens": 286260597.0, "step": 27610 }, { "entropy": 0.7269652187824249, "epoch": 0.22096, "grad_norm": 3.656562566757202, "learning_rate": 3.8967987194877956e-05, "loss": 0.7185, "mean_token_accuracy": 0.7977747857570648, "num_tokens": 286347989.0, "step": 27620 }, { "entropy": 0.6493797779083252, "epoch": 0.22104, "grad_norm": 2.249295949935913, "learning_rate": 3.896398559423769e-05, "loss": 0.6488, "mean_token_accuracy": 0.8122573316097259, "num_tokens": 286440076.0, "step": 27630 }, { "entropy": 0.6996841132640839, "epoch": 0.22112, "grad_norm": 3.462632894515991, "learning_rate": 3.895998399359744e-05, "loss": 0.6943, "mean_token_accuracy": 0.7877792298793793, "num_tokens": 286585265.0, "step": 27640 }, { "entropy": 0.7860414385795593, "epoch": 0.2212, "grad_norm": 4.986369609832764, "learning_rate": 3.895598239295719e-05, "loss": 0.7762, "mean_token_accuracy": 0.7955433189868927, "num_tokens": 286631167.0, "step": 27650 }, { "entropy": 0.6297180354595184, "epoch": 0.22128, "grad_norm": 1.7981730699539185, "learning_rate": 3.895198079231693e-05, "loss": 0.6336, "mean_token_accuracy": 0.7983774840831757, "num_tokens": 286793684.0, "step": 27660 }, { "entropy": 0.680980059504509, "epoch": 0.22136, "grad_norm": 4.3536272048950195, "learning_rate": 3.894797919167667e-05, "loss": 0.672, "mean_token_accuracy": 0.8098876774311066, "num_tokens": 286858477.0, "step": 27670 }, { "entropy": 0.743590384721756, "epoch": 0.22144, "grad_norm": 1.4570989608764648, "learning_rate": 3.894397759103642e-05, "loss": 0.7562, "mean_token_accuracy": 0.7866055250167847, "num_tokens": 286950155.0, "step": 27680 }, { "entropy": 0.7216241002082825, "epoch": 0.22152, "grad_norm": 1.8717843294143677, "learning_rate": 3.893997599039616e-05, "loss": 0.7095, "mean_token_accuracy": 0.7848259031772613, "num_tokens": 287103793.0, "step": 27690 }, { "entropy": 0.5981183409690857, "epoch": 0.2216, "grad_norm": 5.34121561050415, "learning_rate": 3.8935974389755905e-05, "loss": 0.5966, "mean_token_accuracy": 0.8307221591472626, "num_tokens": 287146661.0, "step": 27700 }, { "entropy": 0.6813369631767273, "epoch": 0.22168, "grad_norm": 2.558443307876587, "learning_rate": 3.893197278911565e-05, "loss": 0.6844, "mean_token_accuracy": 0.787108862400055, "num_tokens": 287309501.0, "step": 27710 }, { "entropy": 0.6955247014760971, "epoch": 0.22176, "grad_norm": 4.561991214752197, "learning_rate": 3.892797118847539e-05, "loss": 0.6885, "mean_token_accuracy": 0.806123012304306, "num_tokens": 287385478.0, "step": 27720 }, { "entropy": 0.7711933076381683, "epoch": 0.22184, "grad_norm": 1.7196063995361328, "learning_rate": 3.892396958783514e-05, "loss": 0.7783, "mean_token_accuracy": 0.7811911582946778, "num_tokens": 287478837.0, "step": 27730 }, { "entropy": 0.7149177312850952, "epoch": 0.22192, "grad_norm": 2.413647413253784, "learning_rate": 3.891996798719488e-05, "loss": 0.7086, "mean_token_accuracy": 0.7831164836883545, "num_tokens": 287623925.0, "step": 27740 }, { "entropy": 0.7072100281715393, "epoch": 0.222, "grad_norm": 4.819375991821289, "learning_rate": 3.8915966386554624e-05, "loss": 0.7023, "mean_token_accuracy": 0.8106452465057373, "num_tokens": 287669782.0, "step": 27750 }, { "entropy": 0.7110617280006408, "epoch": 0.22208, "grad_norm": 1.6651989221572876, "learning_rate": 3.891196478591437e-05, "loss": 0.7085, "mean_token_accuracy": 0.7853321969509125, "num_tokens": 287833622.0, "step": 27760 }, { "entropy": 0.6625147759914398, "epoch": 0.22216, "grad_norm": 2.8449928760528564, "learning_rate": 3.890796318527411e-05, "loss": 0.6659, "mean_token_accuracy": 0.8068940877914429, "num_tokens": 287920969.0, "step": 27770 }, { "entropy": 0.690157163143158, "epoch": 0.22224, "grad_norm": 1.4694358110427856, "learning_rate": 3.8903961584633855e-05, "loss": 0.6811, "mean_token_accuracy": 0.8035999715328217, "num_tokens": 288015902.0, "step": 27780 }, { "entropy": 0.6926711082458497, "epoch": 0.22232, "grad_norm": 3.165992021560669, "learning_rate": 3.88999599839936e-05, "loss": 0.6952, "mean_token_accuracy": 0.7915516912937164, "num_tokens": 288150213.0, "step": 27790 }, { "entropy": 0.6571344316005707, "epoch": 0.2224, "grad_norm": 4.528578281402588, "learning_rate": 3.889595838335334e-05, "loss": 0.6382, "mean_token_accuracy": 0.822318696975708, "num_tokens": 288187734.0, "step": 27800 }, { "entropy": 0.6046840369701385, "epoch": 0.22248, "grad_norm": 1.2810511589050293, "learning_rate": 3.8891956782713086e-05, "loss": 0.6106, "mean_token_accuracy": 0.8041280031204223, "num_tokens": 288351574.0, "step": 27810 }, { "entropy": 0.7228864192962646, "epoch": 0.22256, "grad_norm": 4.780538558959961, "learning_rate": 3.888795518207283e-05, "loss": 0.7111, "mean_token_accuracy": 0.7993007838726044, "num_tokens": 288430752.0, "step": 27820 }, { "entropy": 0.6479090929031373, "epoch": 0.22264, "grad_norm": 1.5267966985702515, "learning_rate": 3.8883953581432574e-05, "loss": 0.6608, "mean_token_accuracy": 0.8097300231456757, "num_tokens": 288524476.0, "step": 27830 }, { "entropy": 0.6787120938301087, "epoch": 0.22272, "grad_norm": 1.983798623085022, "learning_rate": 3.887995198079232e-05, "loss": 0.6719, "mean_token_accuracy": 0.7904426634311676, "num_tokens": 288675673.0, "step": 27840 }, { "entropy": 0.687301641702652, "epoch": 0.2228, "grad_norm": 5.025615692138672, "learning_rate": 3.887595038015207e-05, "loss": 0.7055, "mean_token_accuracy": 0.806577080488205, "num_tokens": 288716279.0, "step": 27850 }, { "entropy": 0.6734529614448548, "epoch": 0.22288, "grad_norm": 1.4859118461608887, "learning_rate": 3.8871948779511805e-05, "loss": 0.6746, "mean_token_accuracy": 0.791229385137558, "num_tokens": 288880076.0, "step": 27860 }, { "entropy": 0.6515998065471649, "epoch": 0.22296, "grad_norm": 2.9581737518310547, "learning_rate": 3.886794717887155e-05, "loss": 0.6345, "mean_token_accuracy": 0.8151789426803588, "num_tokens": 288962676.0, "step": 27870 }, { "entropy": 0.6874524533748627, "epoch": 0.22304, "grad_norm": 1.5964078903198242, "learning_rate": 3.886394557823129e-05, "loss": 0.6872, "mean_token_accuracy": 0.806913423538208, "num_tokens": 289057515.0, "step": 27880 }, { "entropy": 0.691349858045578, "epoch": 0.22312, "grad_norm": 4.10420036315918, "learning_rate": 3.885994397759104e-05, "loss": 0.682, "mean_token_accuracy": 0.7923214077949524, "num_tokens": 289194064.0, "step": 27890 }, { "entropy": 0.6193387359380722, "epoch": 0.2232, "grad_norm": 5.200186252593994, "learning_rate": 3.885594237695078e-05, "loss": 0.6347, "mean_token_accuracy": 0.8283923864364624, "num_tokens": 289233722.0, "step": 27900 }, { "entropy": 0.6864478468894959, "epoch": 0.22328, "grad_norm": 2.4385263919830322, "learning_rate": 3.8851940776310523e-05, "loss": 0.6746, "mean_token_accuracy": 0.790442556142807, "num_tokens": 289397400.0, "step": 27910 }, { "entropy": 0.6940521359443664, "epoch": 0.22336, "grad_norm": 4.6802287101745605, "learning_rate": 3.8847939175670274e-05, "loss": 0.6747, "mean_token_accuracy": 0.8091898858547211, "num_tokens": 289469084.0, "step": 27920 }, { "entropy": 0.6675815224647522, "epoch": 0.22344, "grad_norm": 1.6791834831237793, "learning_rate": 3.884393757503002e-05, "loss": 0.6884, "mean_token_accuracy": 0.8012504398822784, "num_tokens": 289562471.0, "step": 27930 }, { "entropy": 0.6806174576282501, "epoch": 0.22352, "grad_norm": 2.6758930683135986, "learning_rate": 3.8839935974389755e-05, "loss": 0.6757, "mean_token_accuracy": 0.7932691514492035, "num_tokens": 289711517.0, "step": 27940 }, { "entropy": 0.6755045175552368, "epoch": 0.2236, "grad_norm": 4.660898685455322, "learning_rate": 3.88359343737495e-05, "loss": 0.655, "mean_token_accuracy": 0.823748517036438, "num_tokens": 289752861.0, "step": 27950 }, { "entropy": 0.692398089170456, "epoch": 0.22368, "grad_norm": 1.9849497079849243, "learning_rate": 3.883193277310925e-05, "loss": 0.6977, "mean_token_accuracy": 0.7800660908222199, "num_tokens": 289916659.0, "step": 27960 }, { "entropy": 0.7195165812969208, "epoch": 0.22376, "grad_norm": 2.9976301193237305, "learning_rate": 3.882793117246899e-05, "loss": 0.7128, "mean_token_accuracy": 0.7979287803173065, "num_tokens": 289993440.0, "step": 27970 }, { "entropy": 0.7148554682731628, "epoch": 0.22384, "grad_norm": 1.7530155181884766, "learning_rate": 3.882392957182873e-05, "loss": 0.7078, "mean_token_accuracy": 0.7980033934116364, "num_tokens": 290087530.0, "step": 27980 }, { "entropy": 0.6749071657657624, "epoch": 0.22392, "grad_norm": 3.1122355461120605, "learning_rate": 3.881992797118848e-05, "loss": 0.6738, "mean_token_accuracy": 0.7994189441204071, "num_tokens": 290225142.0, "step": 27990 }, { "entropy": 0.6803030967712402, "epoch": 0.224, "grad_norm": 7.281333923339844, "learning_rate": 3.8815926370548224e-05, "loss": 0.6796, "mean_token_accuracy": 0.8142123818397522, "num_tokens": 290263349.0, "step": 28000 }, { "entropy": 0.5996529191732407, "epoch": 0.22408, "grad_norm": 1.6785252094268799, "learning_rate": 3.881192476990797e-05, "loss": 0.6005, "mean_token_accuracy": 0.8078224301338196, "num_tokens": 290427189.0, "step": 28010 }, { "entropy": 0.727570527791977, "epoch": 0.22416, "grad_norm": 2.986943244934082, "learning_rate": 3.8807923169267704e-05, "loss": 0.727, "mean_token_accuracy": 0.7987436711788177, "num_tokens": 290511330.0, "step": 28020 }, { "entropy": 0.727657425403595, "epoch": 0.22424, "grad_norm": 1.6877079010009766, "learning_rate": 3.8803921568627455e-05, "loss": 0.7221, "mean_token_accuracy": 0.7942252933979035, "num_tokens": 290604233.0, "step": 28030 }, { "entropy": 0.7375030636787414, "epoch": 0.22432, "grad_norm": 2.3769237995147705, "learning_rate": 3.87999199679872e-05, "loss": 0.7378, "mean_token_accuracy": 0.7809018850326538, "num_tokens": 290736597.0, "step": 28040 }, { "entropy": 0.669641524553299, "epoch": 0.2244, "grad_norm": 5.029604911804199, "learning_rate": 3.879591836734694e-05, "loss": 0.6418, "mean_token_accuracy": 0.8237970292568206, "num_tokens": 290776338.0, "step": 28050 }, { "entropy": 0.6559235095977783, "epoch": 0.22448, "grad_norm": 1.949576497077942, "learning_rate": 3.879191676670668e-05, "loss": 0.6606, "mean_token_accuracy": 0.7909814894199372, "num_tokens": 290939317.0, "step": 28060 }, { "entropy": 0.6225083827972412, "epoch": 0.22456, "grad_norm": 4.314252853393555, "learning_rate": 3.878791516606643e-05, "loss": 0.6185, "mean_token_accuracy": 0.8210379660129548, "num_tokens": 291017148.0, "step": 28070 }, { "entropy": 0.67251535654068, "epoch": 0.22464, "grad_norm": 1.651477336883545, "learning_rate": 3.878391356542617e-05, "loss": 0.6752, "mean_token_accuracy": 0.8064182877540589, "num_tokens": 291109993.0, "step": 28080 }, { "entropy": 0.7055123209953308, "epoch": 0.22472, "grad_norm": 1.8254125118255615, "learning_rate": 3.877991196478592e-05, "loss": 0.709, "mean_token_accuracy": 0.7850914835929871, "num_tokens": 291248579.0, "step": 28090 }, { "entropy": 0.7145269036293029, "epoch": 0.2248, "grad_norm": 5.127782821655273, "learning_rate": 3.877591036414566e-05, "loss": 0.6903, "mean_token_accuracy": 0.8165881752967834, "num_tokens": 291289526.0, "step": 28100 }, { "entropy": 0.6624893248081207, "epoch": 0.22488, "grad_norm": 1.9016002416610718, "learning_rate": 3.8771908763505405e-05, "loss": 0.6672, "mean_token_accuracy": 0.7908586919307709, "num_tokens": 291452776.0, "step": 28110 }, { "entropy": 0.6786527186632156, "epoch": 0.22496, "grad_norm": 3.3006820678710938, "learning_rate": 3.876790716286515e-05, "loss": 0.6727, "mean_token_accuracy": 0.8122219145298004, "num_tokens": 291528318.0, "step": 28120 }, { "entropy": 0.6876749455928802, "epoch": 0.22504, "grad_norm": 2.1124978065490723, "learning_rate": 3.876390556222489e-05, "loss": 0.6798, "mean_token_accuracy": 0.8033161222934723, "num_tokens": 291622153.0, "step": 28130 }, { "entropy": 0.6525245606899261, "epoch": 0.22512, "grad_norm": 2.8715059757232666, "learning_rate": 3.8759903961584636e-05, "loss": 0.6449, "mean_token_accuracy": 0.8021113455295563, "num_tokens": 291761279.0, "step": 28140 }, { "entropy": 0.7173500478267669, "epoch": 0.2252, "grad_norm": 6.229965686798096, "learning_rate": 3.875590236094438e-05, "loss": 0.7154, "mean_token_accuracy": 0.8105143189430237, "num_tokens": 291798593.0, "step": 28150 }, { "entropy": 0.6740898370742798, "epoch": 0.22528, "grad_norm": 1.7160921096801758, "learning_rate": 3.875190076030412e-05, "loss": 0.6751, "mean_token_accuracy": 0.7892503678798676, "num_tokens": 291962390.0, "step": 28160 }, { "entropy": 0.7097939789295197, "epoch": 0.22536, "grad_norm": 3.548095226287842, "learning_rate": 3.874789915966387e-05, "loss": 0.6937, "mean_token_accuracy": 0.801624858379364, "num_tokens": 292044604.0, "step": 28170 }, { "entropy": 0.772225946187973, "epoch": 0.22544, "grad_norm": 2.132248878479004, "learning_rate": 3.874389755902361e-05, "loss": 0.7759, "mean_token_accuracy": 0.7817851960659027, "num_tokens": 292137687.0, "step": 28180 }, { "entropy": 0.6901964366436004, "epoch": 0.22552, "grad_norm": 2.103364944458008, "learning_rate": 3.8739895958383354e-05, "loss": 0.6897, "mean_token_accuracy": 0.7891409277915955, "num_tokens": 292285162.0, "step": 28190 }, { "entropy": 0.6833249807357789, "epoch": 0.2256, "grad_norm": 4.547841548919678, "learning_rate": 3.87358943577431e-05, "loss": 0.6787, "mean_token_accuracy": 0.8176266193389893, "num_tokens": 292331082.0, "step": 28200 }, { "entropy": 0.6697789549827575, "epoch": 0.22568, "grad_norm": 1.5050798654556274, "learning_rate": 3.873189275710284e-05, "loss": 0.6732, "mean_token_accuracy": 0.7898632228374481, "num_tokens": 292494922.0, "step": 28210 }, { "entropy": 0.693873843550682, "epoch": 0.22576, "grad_norm": 3.1510822772979736, "learning_rate": 3.8727891156462585e-05, "loss": 0.6889, "mean_token_accuracy": 0.7999085783958435, "num_tokens": 292600513.0, "step": 28220 }, { "entropy": 0.7259892761707306, "epoch": 0.22584, "grad_norm": 1.5053939819335938, "learning_rate": 3.872388955582233e-05, "loss": 0.7307, "mean_token_accuracy": 0.7938873887062072, "num_tokens": 292696009.0, "step": 28230 }, { "entropy": 0.7005078494548798, "epoch": 0.22592, "grad_norm": 4.358614444732666, "learning_rate": 3.871988795518208e-05, "loss": 0.697, "mean_token_accuracy": 0.7894753277301788, "num_tokens": 292822405.0, "step": 28240 }, { "entropy": 0.7080426633358001, "epoch": 0.226, "grad_norm": 7.563408374786377, "learning_rate": 3.8715886354541817e-05, "loss": 0.7228, "mean_token_accuracy": 0.8086334705352783, "num_tokens": 292857013.0, "step": 28250 }, { "entropy": 0.6557284355163574, "epoch": 0.22608, "grad_norm": 1.8517688512802124, "learning_rate": 3.871188475390156e-05, "loss": 0.6585, "mean_token_accuracy": 0.7912916779518128, "num_tokens": 293020750.0, "step": 28260 }, { "entropy": 0.6628100037574768, "epoch": 0.22616, "grad_norm": 3.587047576904297, "learning_rate": 3.8707883153261304e-05, "loss": 0.6524, "mean_token_accuracy": 0.8141617000102996, "num_tokens": 293094832.0, "step": 28270 }, { "entropy": 0.688450375199318, "epoch": 0.22624, "grad_norm": 1.9984135627746582, "learning_rate": 3.8703881552621054e-05, "loss": 0.6988, "mean_token_accuracy": 0.8009282886981964, "num_tokens": 293188784.0, "step": 28280 }, { "entropy": 0.6511707067489624, "epoch": 0.22632, "grad_norm": 3.0781354904174805, "learning_rate": 3.869987995198079e-05, "loss": 0.6414, "mean_token_accuracy": 0.7988965272903442, "num_tokens": 293331676.0, "step": 28290 }, { "entropy": 0.7644808709621429, "epoch": 0.2264, "grad_norm": 4.246520042419434, "learning_rate": 3.8695878351340535e-05, "loss": 0.7489, "mean_token_accuracy": 0.8008861839771271, "num_tokens": 293374296.0, "step": 28300 }, { "entropy": 0.6463509410619735, "epoch": 0.22648, "grad_norm": 2.0141069889068604, "learning_rate": 3.8691876750700286e-05, "loss": 0.6451, "mean_token_accuracy": 0.7971055269241333, "num_tokens": 293538136.0, "step": 28310 }, { "entropy": 0.6729506254196167, "epoch": 0.22656, "grad_norm": 3.176548719406128, "learning_rate": 3.868787515006003e-05, "loss": 0.676, "mean_token_accuracy": 0.8049908399581909, "num_tokens": 293632106.0, "step": 28320 }, { "entropy": 0.7576556861400604, "epoch": 0.22664, "grad_norm": 1.8616782426834106, "learning_rate": 3.8683873549419766e-05, "loss": 0.7599, "mean_token_accuracy": 0.7849449932575225, "num_tokens": 293726070.0, "step": 28330 }, { "entropy": 0.6987791240215302, "epoch": 0.22672, "grad_norm": 2.9583327770233154, "learning_rate": 3.867987194877951e-05, "loss": 0.6927, "mean_token_accuracy": 0.7891317129135131, "num_tokens": 293870396.0, "step": 28340 }, { "entropy": 0.6760865807533264, "epoch": 0.2268, "grad_norm": 5.55919885635376, "learning_rate": 3.867587034813926e-05, "loss": 0.6698, "mean_token_accuracy": 0.8134198844432831, "num_tokens": 293911911.0, "step": 28350 }, { "entropy": 0.6600249886512757, "epoch": 0.22688, "grad_norm": 1.542479157447815, "learning_rate": 3.8671868747499004e-05, "loss": 0.657, "mean_token_accuracy": 0.7915270686149597, "num_tokens": 294075448.0, "step": 28360 }, { "entropy": 0.7606615900993348, "epoch": 0.22696, "grad_norm": 3.23238205909729, "learning_rate": 3.866786714685874e-05, "loss": 0.7647, "mean_token_accuracy": 0.7915719151496887, "num_tokens": 294153647.0, "step": 28370 }, { "entropy": 0.7210103929042816, "epoch": 0.22704, "grad_norm": 1.6589820384979248, "learning_rate": 3.866386554621849e-05, "loss": 0.7133, "mean_token_accuracy": 0.7934557497501373, "num_tokens": 294246648.0, "step": 28380 }, { "entropy": 0.6951833546161652, "epoch": 0.22712, "grad_norm": 2.2034637928009033, "learning_rate": 3.8659863945578235e-05, "loss": 0.6881, "mean_token_accuracy": 0.7882921934127808, "num_tokens": 294382947.0, "step": 28390 }, { "entropy": 0.6428438901901246, "epoch": 0.2272, "grad_norm": 4.929635524749756, "learning_rate": 3.865586234493798e-05, "loss": 0.6489, "mean_token_accuracy": 0.8280277729034424, "num_tokens": 294418660.0, "step": 28400 }, { "entropy": 0.6382199287414551, "epoch": 0.22728, "grad_norm": 1.6377490758895874, "learning_rate": 3.8651860744297716e-05, "loss": 0.635, "mean_token_accuracy": 0.7969944655895234, "num_tokens": 294582034.0, "step": 28410 }, { "entropy": 0.6505769252777099, "epoch": 0.22736, "grad_norm": 4.563881874084473, "learning_rate": 3.8647859143657466e-05, "loss": 0.6346, "mean_token_accuracy": 0.8150460302829743, "num_tokens": 294664357.0, "step": 28420 }, { "entropy": 0.6632324934005738, "epoch": 0.22744, "grad_norm": 1.9656224250793457, "learning_rate": 3.864385754301721e-05, "loss": 0.6665, "mean_token_accuracy": 0.807571405172348, "num_tokens": 294758025.0, "step": 28430 }, { "entropy": 0.7070749700069427, "epoch": 0.22752, "grad_norm": 2.347139358520508, "learning_rate": 3.8639855942376954e-05, "loss": 0.6988, "mean_token_accuracy": 0.7854972600936889, "num_tokens": 294907823.0, "step": 28440 }, { "entropy": 0.6368288934230805, "epoch": 0.2276, "grad_norm": 4.844583511352539, "learning_rate": 3.86358543417367e-05, "loss": 0.628, "mean_token_accuracy": 0.8276041865348815, "num_tokens": 294953687.0, "step": 28450 }, { "entropy": 0.607543820142746, "epoch": 0.22768, "grad_norm": 1.8935480117797852, "learning_rate": 3.863185274109644e-05, "loss": 0.6074, "mean_token_accuracy": 0.8058439195156097, "num_tokens": 295117527.0, "step": 28460 }, { "entropy": 0.6894995629787445, "epoch": 0.22776, "grad_norm": 3.0910236835479736, "learning_rate": 3.8627851140456185e-05, "loss": 0.674, "mean_token_accuracy": 0.8063475012779235, "num_tokens": 295199843.0, "step": 28470 }, { "entropy": 0.7973409533500672, "epoch": 0.22784, "grad_norm": 1.418645977973938, "learning_rate": 3.862384953981593e-05, "loss": 0.8086, "mean_token_accuracy": 0.7742859661579132, "num_tokens": 295294428.0, "step": 28480 }, { "entropy": 0.653503668308258, "epoch": 0.22792, "grad_norm": 2.382866621017456, "learning_rate": 3.861984793917567e-05, "loss": 0.6469, "mean_token_accuracy": 0.8020063996315002, "num_tokens": 295418112.0, "step": 28490 }, { "entropy": 0.7211060762405396, "epoch": 0.228, "grad_norm": 5.397673606872559, "learning_rate": 3.8615846338535416e-05, "loss": 0.7177, "mean_token_accuracy": 0.812913715839386, "num_tokens": 295452153.0, "step": 28500 }, { "entropy": 0.6789373338222504, "epoch": 0.22808, "grad_norm": 2.397245407104492, "learning_rate": 3.861184473789516e-05, "loss": 0.6738, "mean_token_accuracy": 0.7899975597858429, "num_tokens": 295615993.0, "step": 28510 }, { "entropy": 0.6796456336975097, "epoch": 0.22816, "grad_norm": 3.1006386280059814, "learning_rate": 3.8607843137254904e-05, "loss": 0.6797, "mean_token_accuracy": 0.803111857175827, "num_tokens": 295701045.0, "step": 28520 }, { "entropy": 0.7147704660892487, "epoch": 0.22824, "grad_norm": 1.7065505981445312, "learning_rate": 3.860384153661465e-05, "loss": 0.7199, "mean_token_accuracy": 0.7996283173561096, "num_tokens": 295794455.0, "step": 28530 }, { "entropy": 0.7100818037986756, "epoch": 0.22832, "grad_norm": 2.6792449951171875, "learning_rate": 3.859983993597439e-05, "loss": 0.7063, "mean_token_accuracy": 0.7870419561862946, "num_tokens": 295930145.0, "step": 28540 }, { "entropy": 0.7656387567520142, "epoch": 0.2284, "grad_norm": 5.966435432434082, "learning_rate": 3.8595838335334135e-05, "loss": 0.7578, "mean_token_accuracy": 0.8004308640956879, "num_tokens": 295965340.0, "step": 28550 }, { "entropy": 0.653037765622139, "epoch": 0.22848, "grad_norm": 2.8571081161499023, "learning_rate": 3.859183673469388e-05, "loss": 0.6532, "mean_token_accuracy": 0.7954308748245239, "num_tokens": 296128800.0, "step": 28560 }, { "entropy": 0.6595976173877716, "epoch": 0.22856, "grad_norm": 3.885164737701416, "learning_rate": 3.858783513405362e-05, "loss": 0.6432, "mean_token_accuracy": 0.8122878909111023, "num_tokens": 296208618.0, "step": 28570 }, { "entropy": 0.6884805291891098, "epoch": 0.22864, "grad_norm": 2.461407423019409, "learning_rate": 3.8583833533413366e-05, "loss": 0.6892, "mean_token_accuracy": 0.7986358106136322, "num_tokens": 296301568.0, "step": 28580 }, { "entropy": 0.7108764529228211, "epoch": 0.22872, "grad_norm": 3.736772298812866, "learning_rate": 3.857983193277311e-05, "loss": 0.7113, "mean_token_accuracy": 0.7882125079631805, "num_tokens": 296435538.0, "step": 28590 }, { "entropy": 0.7025870740413666, "epoch": 0.2288, "grad_norm": 4.835516929626465, "learning_rate": 3.857583033213285e-05, "loss": 0.7042, "mean_token_accuracy": 0.8104437291622162, "num_tokens": 296478283.0, "step": 28600 }, { "entropy": 0.6630096077919007, "epoch": 0.22888, "grad_norm": 1.4447485208511353, "learning_rate": 3.85718287314926e-05, "loss": 0.668, "mean_token_accuracy": 0.7893808066844941, "num_tokens": 296642123.0, "step": 28610 }, { "entropy": 0.7112935453653335, "epoch": 0.22896, "grad_norm": 3.197747230529785, "learning_rate": 3.856782713085234e-05, "loss": 0.7063, "mean_token_accuracy": 0.8028533756732941, "num_tokens": 296726937.0, "step": 28620 }, { "entropy": 0.688731449842453, "epoch": 0.22904, "grad_norm": 1.6969014406204224, "learning_rate": 3.856382553021209e-05, "loss": 0.7032, "mean_token_accuracy": 0.7969925463199615, "num_tokens": 296820430.0, "step": 28630 }, { "entropy": 0.7116374671459198, "epoch": 0.22912, "grad_norm": 2.2774152755737305, "learning_rate": 3.855982392957183e-05, "loss": 0.7062, "mean_token_accuracy": 0.7835233688354493, "num_tokens": 296967423.0, "step": 28640 }, { "entropy": 0.6789879024028778, "epoch": 0.2292, "grad_norm": 5.031730651855469, "learning_rate": 3.855582232893157e-05, "loss": 0.6742, "mean_token_accuracy": 0.8154579520225524, "num_tokens": 297012069.0, "step": 28650 }, { "entropy": 0.6439083904027939, "epoch": 0.22928, "grad_norm": 1.4903701543807983, "learning_rate": 3.8551820728291316e-05, "loss": 0.6465, "mean_token_accuracy": 0.7964706003665925, "num_tokens": 297175079.0, "step": 28660 }, { "entropy": 0.6388187348842621, "epoch": 0.22936, "grad_norm": 2.879167318344116, "learning_rate": 3.8547819127651066e-05, "loss": 0.6242, "mean_token_accuracy": 0.8177991509437561, "num_tokens": 297249215.0, "step": 28670 }, { "entropy": 0.6783812642097473, "epoch": 0.22944, "grad_norm": 2.5308241844177246, "learning_rate": 3.85438175270108e-05, "loss": 0.6767, "mean_token_accuracy": 0.8030477464199066, "num_tokens": 297342758.0, "step": 28680 }, { "entropy": 0.7029115438461304, "epoch": 0.22952, "grad_norm": 2.361898422241211, "learning_rate": 3.853981592637055e-05, "loss": 0.7069, "mean_token_accuracy": 0.7867688655853271, "num_tokens": 297482616.0, "step": 28690 }, { "entropy": 0.720455664396286, "epoch": 0.2296, "grad_norm": 4.440229892730713, "learning_rate": 3.85358143257303e-05, "loss": 0.7131, "mean_token_accuracy": 0.8093164503574372, "num_tokens": 297519710.0, "step": 28700 }, { "entropy": 0.680617481470108, "epoch": 0.22968, "grad_norm": 1.3496984243392944, "learning_rate": 3.853181272509004e-05, "loss": 0.6805, "mean_token_accuracy": 0.7866145670413971, "num_tokens": 297683550.0, "step": 28710 }, { "entropy": 0.6568483471870422, "epoch": 0.22976, "grad_norm": 3.432009220123291, "learning_rate": 3.852781112444978e-05, "loss": 0.65, "mean_token_accuracy": 0.8145849525928497, "num_tokens": 297757618.0, "step": 28720 }, { "entropy": 0.6984366178512573, "epoch": 0.22984, "grad_norm": 1.3697004318237305, "learning_rate": 3.852380952380952e-05, "loss": 0.6958, "mean_token_accuracy": 0.8019313156604767, "num_tokens": 297849700.0, "step": 28730 }, { "entropy": 0.7023856282234192, "epoch": 0.22992, "grad_norm": 2.7503435611724854, "learning_rate": 3.851980792316927e-05, "loss": 0.7045, "mean_token_accuracy": 0.7840401828289032, "num_tokens": 297998972.0, "step": 28740 }, { "entropy": 0.7155920892953873, "epoch": 0.23, "grad_norm": 4.67487096786499, "learning_rate": 3.8515806322529016e-05, "loss": 0.6991, "mean_token_accuracy": 0.8089016377925873, "num_tokens": 298043399.0, "step": 28750 }, { "entropy": 0.660211056470871, "epoch": 0.23008, "grad_norm": 2.1325981616973877, "learning_rate": 3.851180472188875e-05, "loss": 0.6646, "mean_token_accuracy": 0.789372193813324, "num_tokens": 298207166.0, "step": 28760 }, { "entropy": 0.6563167572021484, "epoch": 0.23016, "grad_norm": 3.360661506652832, "learning_rate": 3.85078031212485e-05, "loss": 0.647, "mean_token_accuracy": 0.8135738551616669, "num_tokens": 298289644.0, "step": 28770 }, { "entropy": 0.6682308077812195, "epoch": 0.23024, "grad_norm": 2.091371774673462, "learning_rate": 3.850380152060825e-05, "loss": 0.6573, "mean_token_accuracy": 0.8091172277927399, "num_tokens": 298382944.0, "step": 28780 }, { "entropy": 0.6700646668672562, "epoch": 0.23032, "grad_norm": 2.4822332859039307, "learning_rate": 3.849979991996799e-05, "loss": 0.6708, "mean_token_accuracy": 0.792322838306427, "num_tokens": 298525665.0, "step": 28790 }, { "entropy": 0.7360095798969268, "epoch": 0.2304, "grad_norm": 6.793980121612549, "learning_rate": 3.849579831932773e-05, "loss": 0.7471, "mean_token_accuracy": 0.803049647808075, "num_tokens": 298565073.0, "step": 28800 }, { "entropy": 0.7144905209541321, "epoch": 0.23048, "grad_norm": 2.016232490539551, "learning_rate": 3.849179671868748e-05, "loss": 0.7126, "mean_token_accuracy": 0.7802699089050293, "num_tokens": 298728913.0, "step": 28810 }, { "entropy": 0.6206568807363511, "epoch": 0.23056, "grad_norm": 4.3558807373046875, "learning_rate": 3.848779511804722e-05, "loss": 0.6073, "mean_token_accuracy": 0.8179115355014801, "num_tokens": 298825884.0, "step": 28820 }, { "entropy": 0.6753339052200318, "epoch": 0.23064, "grad_norm": 1.7074528932571411, "learning_rate": 3.8483793517406965e-05, "loss": 0.6897, "mean_token_accuracy": 0.8006141483783722, "num_tokens": 298920907.0, "step": 28830 }, { "entropy": 0.6490342617034912, "epoch": 0.23072, "grad_norm": 2.483647584915161, "learning_rate": 3.847979191676671e-05, "loss": 0.6391, "mean_token_accuracy": 0.7999810039997101, "num_tokens": 299052780.0, "step": 28840 }, { "entropy": 0.7046158373355865, "epoch": 0.2308, "grad_norm": 4.295358180999756, "learning_rate": 3.847579031612645e-05, "loss": 0.7011, "mean_token_accuracy": 0.8106746673583984, "num_tokens": 299091023.0, "step": 28850 }, { "entropy": 0.632254832983017, "epoch": 0.23088, "grad_norm": 2.316498279571533, "learning_rate": 3.8471788715486197e-05, "loss": 0.6279, "mean_token_accuracy": 0.8001587748527527, "num_tokens": 299254863.0, "step": 28860 }, { "entropy": 0.6822593420743942, "epoch": 0.23096, "grad_norm": 3.75156569480896, "learning_rate": 3.846778711484594e-05, "loss": 0.683, "mean_token_accuracy": 0.802617073059082, "num_tokens": 299345020.0, "step": 28870 }, { "entropy": 0.6857425153255463, "epoch": 0.23104, "grad_norm": 2.370502233505249, "learning_rate": 3.8463785514205684e-05, "loss": 0.7001, "mean_token_accuracy": 0.7971289873123169, "num_tokens": 299439520.0, "step": 28880 }, { "entropy": 0.6957984209060669, "epoch": 0.23112, "grad_norm": 3.5407731533050537, "learning_rate": 3.845978391356543e-05, "loss": 0.6871, "mean_token_accuracy": 0.791324907541275, "num_tokens": 299578719.0, "step": 28890 }, { "entropy": 0.7403955072164535, "epoch": 0.2312, "grad_norm": 5.299394130706787, "learning_rate": 3.845578231292517e-05, "loss": 0.7241, "mean_token_accuracy": 0.807181179523468, "num_tokens": 299613506.0, "step": 28900 }, { "entropy": 0.6900528490543365, "epoch": 0.23128, "grad_norm": 2.1381916999816895, "learning_rate": 3.8451780712284915e-05, "loss": 0.6893, "mean_token_accuracy": 0.7837811529636383, "num_tokens": 299777346.0, "step": 28910 }, { "entropy": 0.7296165287494659, "epoch": 0.23136, "grad_norm": 3.2298331260681152, "learning_rate": 3.844777911164466e-05, "loss": 0.7259, "mean_token_accuracy": 0.7898525893688202, "num_tokens": 299878932.0, "step": 28920 }, { "entropy": 0.6924139022827148, "epoch": 0.23144, "grad_norm": 1.8891704082489014, "learning_rate": 3.84437775110044e-05, "loss": 0.6998, "mean_token_accuracy": 0.8016445398330688, "num_tokens": 299972099.0, "step": 28930 }, { "entropy": 0.7102539598941803, "epoch": 0.23152, "grad_norm": 2.5427157878875732, "learning_rate": 3.8439775910364146e-05, "loss": 0.6997, "mean_token_accuracy": 0.7865810334682465, "num_tokens": 300123508.0, "step": 28940 }, { "entropy": 0.7149665534496308, "epoch": 0.2316, "grad_norm": 4.300592422485352, "learning_rate": 3.843577430972389e-05, "loss": 0.7139, "mean_token_accuracy": 0.8058163464069367, "num_tokens": 300166838.0, "step": 28950 }, { "entropy": 0.6752479076385498, "epoch": 0.23168, "grad_norm": 1.773704171180725, "learning_rate": 3.8431772709083634e-05, "loss": 0.6777, "mean_token_accuracy": 0.788512909412384, "num_tokens": 300329381.0, "step": 28960 }, { "entropy": 0.6708120316267013, "epoch": 0.23176, "grad_norm": 3.010099172592163, "learning_rate": 3.842777110844338e-05, "loss": 0.6616, "mean_token_accuracy": 0.8097731113433838, "num_tokens": 300411833.0, "step": 28970 }, { "entropy": 0.6708915948867797, "epoch": 0.23184, "grad_norm": 1.5974271297454834, "learning_rate": 3.842376950780313e-05, "loss": 0.6769, "mean_token_accuracy": 0.8023571312427521, "num_tokens": 300506310.0, "step": 28980 }, { "entropy": 0.6889089345932007, "epoch": 0.23192, "grad_norm": 4.401962757110596, "learning_rate": 3.8419767907162865e-05, "loss": 0.6847, "mean_token_accuracy": 0.7910833418369293, "num_tokens": 300644632.0, "step": 28990 }, { "entropy": 0.6963737517595291, "epoch": 0.232, "grad_norm": 5.4122490882873535, "learning_rate": 3.841576630652261e-05, "loss": 0.6995, "mean_token_accuracy": 0.8160124599933625, "num_tokens": 300682930.0, "step": 29000 }, { "entropy": 0.7105002701282501, "epoch": 0.23208, "grad_norm": 1.5396411418914795, "learning_rate": 3.841176470588235e-05, "loss": 0.7016, "mean_token_accuracy": 0.783719927072525, "num_tokens": 300846186.0, "step": 29010 }, { "entropy": 0.6825655937194824, "epoch": 0.23216, "grad_norm": 3.213820695877075, "learning_rate": 3.84077631052421e-05, "loss": 0.6772, "mean_token_accuracy": 0.80758256316185, "num_tokens": 300916053.0, "step": 29020 }, { "entropy": 0.7542977809906006, "epoch": 0.23224, "grad_norm": 1.5993452072143555, "learning_rate": 3.840376150460184e-05, "loss": 0.7562, "mean_token_accuracy": 0.7874170541763306, "num_tokens": 301009132.0, "step": 29030 }, { "entropy": 0.681239104270935, "epoch": 0.23232, "grad_norm": 2.1569643020629883, "learning_rate": 3.8399759903961583e-05, "loss": 0.6778, "mean_token_accuracy": 0.7960439085960388, "num_tokens": 301139411.0, "step": 29040 }, { "entropy": 0.7083799302577972, "epoch": 0.2324, "grad_norm": 5.159834861755371, "learning_rate": 3.8395758303321334e-05, "loss": 0.7076, "mean_token_accuracy": 0.8165555238723755, "num_tokens": 301174607.0, "step": 29050 }, { "entropy": 0.6856056094169617, "epoch": 0.23248, "grad_norm": 1.7012746334075928, "learning_rate": 3.839175670268108e-05, "loss": 0.6826, "mean_token_accuracy": 0.7837811470031738, "num_tokens": 301338447.0, "step": 29060 }, { "entropy": 0.6362015843391419, "epoch": 0.23256, "grad_norm": 4.580314636230469, "learning_rate": 3.8387755102040815e-05, "loss": 0.6326, "mean_token_accuracy": 0.8186413884162903, "num_tokens": 301428650.0, "step": 29070 }, { "entropy": 0.7242695152759552, "epoch": 0.23264, "grad_norm": 2.6790223121643066, "learning_rate": 3.838375350140056e-05, "loss": 0.7281, "mean_token_accuracy": 0.7911460757255554, "num_tokens": 301522370.0, "step": 29080 }, { "entropy": 0.7348896145820618, "epoch": 0.23272, "grad_norm": 2.2438042163848877, "learning_rate": 3.837975190076031e-05, "loss": 0.7222, "mean_token_accuracy": 0.780486136674881, "num_tokens": 301663582.0, "step": 29090 }, { "entropy": 0.6411467283964157, "epoch": 0.2328, "grad_norm": 4.6943888664245605, "learning_rate": 3.837575030012005e-05, "loss": 0.6599, "mean_token_accuracy": 0.8203813433647156, "num_tokens": 301702743.0, "step": 29100 }, { "entropy": 0.6900161921977996, "epoch": 0.23288, "grad_norm": 2.0202529430389404, "learning_rate": 3.837174869947979e-05, "loss": 0.6908, "mean_token_accuracy": 0.7849461257457733, "num_tokens": 301866438.0, "step": 29110 }, { "entropy": 0.644134908914566, "epoch": 0.23296, "grad_norm": 2.7339491844177246, "learning_rate": 3.836774709883953e-05, "loss": 0.6315, "mean_token_accuracy": 0.8153997182846069, "num_tokens": 301947392.0, "step": 29120 }, { "entropy": 0.7219496011734009, "epoch": 0.23304, "grad_norm": 1.3723983764648438, "learning_rate": 3.8363745498199284e-05, "loss": 0.726, "mean_token_accuracy": 0.79398113489151, "num_tokens": 302041430.0, "step": 29130 }, { "entropy": 0.7586014628410339, "epoch": 0.23312, "grad_norm": 2.1862661838531494, "learning_rate": 3.835974389755903e-05, "loss": 0.7488, "mean_token_accuracy": 0.7763629853725433, "num_tokens": 302180491.0, "step": 29140 }, { "entropy": 0.7144089221954346, "epoch": 0.2332, "grad_norm": 6.410830974578857, "learning_rate": 3.8355742296918764e-05, "loss": 0.7122, "mean_token_accuracy": 0.8098628938198089, "num_tokens": 302219597.0, "step": 29150 }, { "entropy": 0.6721783399581909, "epoch": 0.23328, "grad_norm": 1.5875266790390015, "learning_rate": 3.8351740696278515e-05, "loss": 0.6725, "mean_token_accuracy": 0.7865951716899872, "num_tokens": 302383006.0, "step": 29160 }, { "entropy": 0.7231800973415374, "epoch": 0.23336, "grad_norm": 4.868473529815674, "learning_rate": 3.834773909563826e-05, "loss": 0.7212, "mean_token_accuracy": 0.8032155692577362, "num_tokens": 302457865.0, "step": 29170 }, { "entropy": 0.7115787327289581, "epoch": 0.23344, "grad_norm": 2.4406514167785645, "learning_rate": 3.8343737494998e-05, "loss": 0.7076, "mean_token_accuracy": 0.7979638874530792, "num_tokens": 302550633.0, "step": 29180 }, { "entropy": 0.7004437386989594, "epoch": 0.23352, "grad_norm": 2.410276412963867, "learning_rate": 3.833973589435774e-05, "loss": 0.6963, "mean_token_accuracy": 0.7868836343288421, "num_tokens": 302688613.0, "step": 29190 }, { "entropy": 0.7084612250328064, "epoch": 0.2336, "grad_norm": 4.005441188812256, "learning_rate": 3.833573429371749e-05, "loss": 0.715, "mean_token_accuracy": 0.8070316016674042, "num_tokens": 302726905.0, "step": 29200 }, { "entropy": 0.6583856463432312, "epoch": 0.23368, "grad_norm": 2.526380777359009, "learning_rate": 3.833173269307723e-05, "loss": 0.6629, "mean_token_accuracy": 0.7892647802829742, "num_tokens": 302890745.0, "step": 29210 }, { "entropy": 0.6370019137859344, "epoch": 0.23376, "grad_norm": 2.921710729598999, "learning_rate": 3.832773109243698e-05, "loss": 0.6269, "mean_token_accuracy": 0.812660425901413, "num_tokens": 302998041.0, "step": 29220 }, { "entropy": 0.7181430995464325, "epoch": 0.23384, "grad_norm": 1.4312785863876343, "learning_rate": 3.832372949179672e-05, "loss": 0.7188, "mean_token_accuracy": 0.796421492099762, "num_tokens": 303093290.0, "step": 29230 }, { "entropy": 0.7598770678043365, "epoch": 0.23392, "grad_norm": 2.584834098815918, "learning_rate": 3.8319727891156464e-05, "loss": 0.7515, "mean_token_accuracy": 0.7774403631687165, "num_tokens": 303239520.0, "step": 29240 }, { "entropy": 0.6654952228069305, "epoch": 0.234, "grad_norm": 5.446678161621094, "learning_rate": 3.831572629051621e-05, "loss": 0.6446, "mean_token_accuracy": 0.8214798033237457, "num_tokens": 303282420.0, "step": 29250 }, { "entropy": 0.6941537320613861, "epoch": 0.23408, "grad_norm": 1.74727201461792, "learning_rate": 3.831172468987595e-05, "loss": 0.7039, "mean_token_accuracy": 0.7803591012954711, "num_tokens": 303446244.0, "step": 29260 }, { "entropy": 0.7263847947120666, "epoch": 0.23416, "grad_norm": 5.307975769042969, "learning_rate": 3.8307723089235696e-05, "loss": 0.7229, "mean_token_accuracy": 0.7973493576049805, "num_tokens": 303532021.0, "step": 29270 }, { "entropy": 0.6922463834285736, "epoch": 0.23424, "grad_norm": 2.1316757202148438, "learning_rate": 3.830372148859544e-05, "loss": 0.6939, "mean_token_accuracy": 0.7989501953125, "num_tokens": 303626083.0, "step": 29280 }, { "entropy": 0.6589918553829193, "epoch": 0.23432, "grad_norm": 2.264829397201538, "learning_rate": 3.829971988795518e-05, "loss": 0.6523, "mean_token_accuracy": 0.7991452217102051, "num_tokens": 303764645.0, "step": 29290 }, { "entropy": 0.744175472855568, "epoch": 0.2344, "grad_norm": 4.786051273345947, "learning_rate": 3.829571828731493e-05, "loss": 0.7284, "mean_token_accuracy": 0.8083287656307221, "num_tokens": 303801502.0, "step": 29300 }, { "entropy": 0.6342475354671478, "epoch": 0.23448, "grad_norm": 2.2035586833953857, "learning_rate": 3.829171668667467e-05, "loss": 0.6369, "mean_token_accuracy": 0.7967384457588196, "num_tokens": 303964642.0, "step": 29310 }, { "entropy": 0.6528411090373993, "epoch": 0.23456, "grad_norm": 3.1775949001312256, "learning_rate": 3.8287715086034414e-05, "loss": 0.6468, "mean_token_accuracy": 0.8146111845970154, "num_tokens": 304039088.0, "step": 29320 }, { "entropy": 0.6900832116603851, "epoch": 0.23464, "grad_norm": 2.0905158519744873, "learning_rate": 3.828371348539416e-05, "loss": 0.6852, "mean_token_accuracy": 0.8048605501651764, "num_tokens": 304132413.0, "step": 29330 }, { "entropy": 0.7187586486339569, "epoch": 0.23472, "grad_norm": 2.5334837436676025, "learning_rate": 3.82797118847539e-05, "loss": 0.7196, "mean_token_accuracy": 0.7849082708358764, "num_tokens": 304258652.0, "step": 29340 }, { "entropy": 0.6925971776247024, "epoch": 0.2348, "grad_norm": 5.1118621826171875, "learning_rate": 3.8275710284113645e-05, "loss": 0.678, "mean_token_accuracy": 0.8208799600601197, "num_tokens": 304297069.0, "step": 29350 }, { "entropy": 0.6854688644409179, "epoch": 0.23488, "grad_norm": 1.9462807178497314, "learning_rate": 3.827170868347339e-05, "loss": 0.6826, "mean_token_accuracy": 0.7876404464244843, "num_tokens": 304460909.0, "step": 29360 }, { "entropy": 0.6815329968929291, "epoch": 0.23496, "grad_norm": 3.9393317699432373, "learning_rate": 3.826770708283314e-05, "loss": 0.6738, "mean_token_accuracy": 0.7988922536373139, "num_tokens": 304563053.0, "step": 29370 }, { "entropy": 0.6675440073013306, "epoch": 0.23504, "grad_norm": 1.7929977178573608, "learning_rate": 3.8263705482192876e-05, "loss": 0.6604, "mean_token_accuracy": 0.8114097654819489, "num_tokens": 304657331.0, "step": 29380 }, { "entropy": 0.6548615634441376, "epoch": 0.23512, "grad_norm": 2.6241843700408936, "learning_rate": 3.825970388155262e-05, "loss": 0.6546, "mean_token_accuracy": 0.8011631488800048, "num_tokens": 304776361.0, "step": 29390 }, { "entropy": 0.7441423088312149, "epoch": 0.2352, "grad_norm": 5.318191051483154, "learning_rate": 3.8255702280912364e-05, "loss": 0.7485, "mean_token_accuracy": 0.8061083614826202, "num_tokens": 304811098.0, "step": 29400 }, { "entropy": 0.6581678271293641, "epoch": 0.23528, "grad_norm": 1.8375341892242432, "learning_rate": 3.8251700680272114e-05, "loss": 0.6495, "mean_token_accuracy": 0.7943026483058929, "num_tokens": 304974938.0, "step": 29410 }, { "entropy": 0.6746938526630402, "epoch": 0.23536, "grad_norm": 4.001742362976074, "learning_rate": 3.824769907963185e-05, "loss": 0.6819, "mean_token_accuracy": 0.8018779873847961, "num_tokens": 305077462.0, "step": 29420 }, { "entropy": 0.7560806810855866, "epoch": 0.23544, "grad_norm": 1.5373061895370483, "learning_rate": 3.8243697478991595e-05, "loss": 0.7501, "mean_token_accuracy": 0.7870127558708191, "num_tokens": 305173415.0, "step": 29430 }, { "entropy": 0.7505146563053131, "epoch": 0.23552, "grad_norm": 2.1271564960479736, "learning_rate": 3.8239695878351346e-05, "loss": 0.7522, "mean_token_accuracy": 0.7724569439888, "num_tokens": 305319556.0, "step": 29440 }, { "entropy": 0.6418207287788391, "epoch": 0.2356, "grad_norm": 4.262415885925293, "learning_rate": 3.823569427771109e-05, "loss": 0.6257, "mean_token_accuracy": 0.827003538608551, "num_tokens": 305361219.0, "step": 29450 }, { "entropy": 0.6459989696741104, "epoch": 0.23568, "grad_norm": 2.23437762260437, "learning_rate": 3.8231692677070826e-05, "loss": 0.644, "mean_token_accuracy": 0.7938017129898072, "num_tokens": 305524494.0, "step": 29460 }, { "entropy": 0.657333317399025, "epoch": 0.23576, "grad_norm": 2.785377264022827, "learning_rate": 3.822769107643057e-05, "loss": 0.655, "mean_token_accuracy": 0.8091710090637207, "num_tokens": 305607065.0, "step": 29470 }, { "entropy": 0.7071308493614197, "epoch": 0.23584, "grad_norm": 1.6539422273635864, "learning_rate": 3.822368947579032e-05, "loss": 0.7129, "mean_token_accuracy": 0.7978179812431335, "num_tokens": 305701432.0, "step": 29480 }, { "entropy": 0.691384756565094, "epoch": 0.23592, "grad_norm": 2.4439353942871094, "learning_rate": 3.8219687875150064e-05, "loss": 0.678, "mean_token_accuracy": 0.7898975908756256, "num_tokens": 305843401.0, "step": 29490 }, { "entropy": 0.5948547095060348, "epoch": 0.236, "grad_norm": 5.003346920013428, "learning_rate": 3.82156862745098e-05, "loss": 0.6119, "mean_token_accuracy": 0.8328547894954681, "num_tokens": 305882060.0, "step": 29500 }, { "entropy": 0.7120680332183837, "epoch": 0.23608, "grad_norm": 1.9594749212265015, "learning_rate": 3.821168467386955e-05, "loss": 0.7075, "mean_token_accuracy": 0.7835002481937409, "num_tokens": 306045900.0, "step": 29510 }, { "entropy": 0.6462056040763855, "epoch": 0.23616, "grad_norm": 2.5999088287353516, "learning_rate": 3.8207683073229295e-05, "loss": 0.6384, "mean_token_accuracy": 0.8107047796249389, "num_tokens": 306135448.0, "step": 29520 }, { "entropy": 0.7184804677963257, "epoch": 0.23624, "grad_norm": 2.3943724632263184, "learning_rate": 3.820368147258904e-05, "loss": 0.7231, "mean_token_accuracy": 0.7933057129383088, "num_tokens": 306230919.0, "step": 29530 }, { "entropy": 0.7160722315311432, "epoch": 0.23632, "grad_norm": 2.617692470550537, "learning_rate": 3.8199679871948776e-05, "loss": 0.709, "mean_token_accuracy": 0.782482260465622, "num_tokens": 306372516.0, "step": 29540 }, { "entropy": 0.6801962912082672, "epoch": 0.2364, "grad_norm": 3.590909481048584, "learning_rate": 3.8195678271308526e-05, "loss": 0.6799, "mean_token_accuracy": 0.8229756951332092, "num_tokens": 306405881.0, "step": 29550 }, { "entropy": 0.6414723813533783, "epoch": 0.23648, "grad_norm": 1.7715513706207275, "learning_rate": 3.819167667066827e-05, "loss": 0.6378, "mean_token_accuracy": 0.7978199899196625, "num_tokens": 306569721.0, "step": 29560 }, { "entropy": 0.6797637224197388, "epoch": 0.23656, "grad_norm": 2.770639419555664, "learning_rate": 3.8187675070028014e-05, "loss": 0.6561, "mean_token_accuracy": 0.8108890354633331, "num_tokens": 306657290.0, "step": 29570 }, { "entropy": 0.7536417782306671, "epoch": 0.23664, "grad_norm": 1.544022560119629, "learning_rate": 3.818367346938776e-05, "loss": 0.7384, "mean_token_accuracy": 0.7916944980621338, "num_tokens": 306752563.0, "step": 29580 }, { "entropy": 0.6645780205726624, "epoch": 0.23672, "grad_norm": 2.833270788192749, "learning_rate": 3.81796718687475e-05, "loss": 0.6685, "mean_token_accuracy": 0.793790078163147, "num_tokens": 306887664.0, "step": 29590 }, { "entropy": 0.6281648993492126, "epoch": 0.2368, "grad_norm": 5.464479923248291, "learning_rate": 3.8175670268107245e-05, "loss": 0.6288, "mean_token_accuracy": 0.828252911567688, "num_tokens": 306925825.0, "step": 29600 }, { "entropy": 0.6245821475982666, "epoch": 0.23688, "grad_norm": 1.6779581308364868, "learning_rate": 3.817166866746699e-05, "loss": 0.6263, "mean_token_accuracy": 0.8059965789318084, "num_tokens": 307089665.0, "step": 29610 }, { "entropy": 0.664203155040741, "epoch": 0.23696, "grad_norm": 2.8661742210388184, "learning_rate": 3.816766706682673e-05, "loss": 0.6571, "mean_token_accuracy": 0.8022767961025238, "num_tokens": 307189873.0, "step": 29620 }, { "entropy": 0.7231469571590423, "epoch": 0.23704, "grad_norm": 1.6892688274383545, "learning_rate": 3.8163665466186476e-05, "loss": 0.7108, "mean_token_accuracy": 0.7981338739395142, "num_tokens": 307285597.0, "step": 29630 }, { "entropy": 0.697960615158081, "epoch": 0.23712, "grad_norm": 2.575655937194824, "learning_rate": 3.815966386554622e-05, "loss": 0.7045, "mean_token_accuracy": 0.7899921774864197, "num_tokens": 307420714.0, "step": 29640 }, { "entropy": 0.726756751537323, "epoch": 0.2372, "grad_norm": 6.080735206604004, "learning_rate": 3.815566226490597e-05, "loss": 0.713, "mean_token_accuracy": 0.8086764216423035, "num_tokens": 307461190.0, "step": 29650 }, { "entropy": 0.7001678943634033, "epoch": 0.23728, "grad_norm": 2.8182339668273926, "learning_rate": 3.815166066426571e-05, "loss": 0.7016, "mean_token_accuracy": 0.7809477329254151, "num_tokens": 307625030.0, "step": 29660 }, { "entropy": 0.6890390008687973, "epoch": 0.23736, "grad_norm": 3.7839322090148926, "learning_rate": 3.814765906362545e-05, "loss": 0.681, "mean_token_accuracy": 0.806336909532547, "num_tokens": 307713424.0, "step": 29670 }, { "entropy": 0.7116407096385956, "epoch": 0.23744, "grad_norm": 1.609990119934082, "learning_rate": 3.8143657462985195e-05, "loss": 0.721, "mean_token_accuracy": 0.7936419427394867, "num_tokens": 307809391.0, "step": 29680 }, { "entropy": 0.7101019620895386, "epoch": 0.23752, "grad_norm": 2.937375545501709, "learning_rate": 3.8139655862344945e-05, "loss": 0.7058, "mean_token_accuracy": 0.7867546856403351, "num_tokens": 307945620.0, "step": 29690 }, { "entropy": 0.6790033757686615, "epoch": 0.2376, "grad_norm": 4.416901588439941, "learning_rate": 3.813565426170468e-05, "loss": 0.6735, "mean_token_accuracy": 0.8160673260688782, "num_tokens": 307981157.0, "step": 29700 }, { "entropy": 0.6576007246971131, "epoch": 0.23768, "grad_norm": 1.610428810119629, "learning_rate": 3.8131652661064426e-05, "loss": 0.6579, "mean_token_accuracy": 0.7898449122905731, "num_tokens": 308144997.0, "step": 29710 }, { "entropy": 0.6495826274156571, "epoch": 0.23776, "grad_norm": 3.263939380645752, "learning_rate": 3.812765106042417e-05, "loss": 0.6494, "mean_token_accuracy": 0.810878449678421, "num_tokens": 308231387.0, "step": 29720 }, { "entropy": 0.6619291603565216, "epoch": 0.23784, "grad_norm": 1.319502353668213, "learning_rate": 3.812364945978392e-05, "loss": 0.6575, "mean_token_accuracy": 0.8121784865856171, "num_tokens": 308324365.0, "step": 29730 }, { "entropy": 0.7150793373584747, "epoch": 0.23792, "grad_norm": 2.2126567363739014, "learning_rate": 3.811964785914366e-05, "loss": 0.7061, "mean_token_accuracy": 0.7829699337482452, "num_tokens": 308470792.0, "step": 29740 }, { "entropy": 0.6732642412185669, "epoch": 0.238, "grad_norm": 4.796016216278076, "learning_rate": 3.81156462585034e-05, "loss": 0.6819, "mean_token_accuracy": 0.8124646961688995, "num_tokens": 308516167.0, "step": 29750 }, { "entropy": 0.6644490480422973, "epoch": 0.23808, "grad_norm": 1.565529465675354, "learning_rate": 3.811164465786315e-05, "loss": 0.6655, "mean_token_accuracy": 0.7899364948272705, "num_tokens": 308680007.0, "step": 29760 }, { "entropy": 0.7451524913311005, "epoch": 0.23816, "grad_norm": 3.185251474380493, "learning_rate": 3.8107643057222895e-05, "loss": 0.7392, "mean_token_accuracy": 0.7906656622886657, "num_tokens": 308775421.0, "step": 29770 }, { "entropy": 0.6916776955127716, "epoch": 0.23824, "grad_norm": 1.5820097923278809, "learning_rate": 3.810364145658263e-05, "loss": 0.6993, "mean_token_accuracy": 0.7982605934143067, "num_tokens": 308869147.0, "step": 29780 }, { "entropy": 0.7194137215614319, "epoch": 0.23832, "grad_norm": 2.475923538208008, "learning_rate": 3.8099639855942375e-05, "loss": 0.7107, "mean_token_accuracy": 0.7822701930999756, "num_tokens": 309002254.0, "step": 29790 }, { "entropy": 0.7295898079872132, "epoch": 0.2384, "grad_norm": 5.240875720977783, "learning_rate": 3.8095638255302126e-05, "loss": 0.7211, "mean_token_accuracy": 0.8065344989299774, "num_tokens": 309036644.0, "step": 29800 }, { "entropy": 0.7197242736816406, "epoch": 0.23848, "grad_norm": 1.8394410610198975, "learning_rate": 3.809163665466187e-05, "loss": 0.721, "mean_token_accuracy": 0.7786883294582367, "num_tokens": 309200484.0, "step": 29810 }, { "entropy": 0.7262386500835418, "epoch": 0.23856, "grad_norm": 3.795287609100342, "learning_rate": 3.808763505402161e-05, "loss": 0.7248, "mean_token_accuracy": 0.7971453428268432, "num_tokens": 309288757.0, "step": 29820 }, { "entropy": 0.7054389119148254, "epoch": 0.23864, "grad_norm": 2.0597500801086426, "learning_rate": 3.808363345338136e-05, "loss": 0.6969, "mean_token_accuracy": 0.7981247127056121, "num_tokens": 309383846.0, "step": 29830 }, { "entropy": 0.6422497034072876, "epoch": 0.23872, "grad_norm": 2.764244556427002, "learning_rate": 3.80796318527411e-05, "loss": 0.6429, "mean_token_accuracy": 0.8025283575057983, "num_tokens": 309521607.0, "step": 29840 }, { "entropy": 0.7307280987501145, "epoch": 0.2388, "grad_norm": 4.944462776184082, "learning_rate": 3.8075630252100845e-05, "loss": 0.724, "mean_token_accuracy": 0.8094738543033599, "num_tokens": 309557116.0, "step": 29850 }, { "entropy": 0.6502196192741394, "epoch": 0.23888, "grad_norm": 2.290173053741455, "learning_rate": 3.807162865146058e-05, "loss": 0.6531, "mean_token_accuracy": 0.7947106003761292, "num_tokens": 309720294.0, "step": 29860 }, { "entropy": 0.700825309753418, "epoch": 0.23896, "grad_norm": 3.5908236503601074, "learning_rate": 3.806762705082033e-05, "loss": 0.6829, "mean_token_accuracy": 0.8095528900623321, "num_tokens": 309798210.0, "step": 29870 }, { "entropy": 0.6663327693939209, "epoch": 0.23904, "grad_norm": 1.9092434644699097, "learning_rate": 3.8063625450180076e-05, "loss": 0.6819, "mean_token_accuracy": 0.8037920951843261, "num_tokens": 309890337.0, "step": 29880 }, { "entropy": 0.7491155505180359, "epoch": 0.23912, "grad_norm": 2.236365795135498, "learning_rate": 3.805962384953982e-05, "loss": 0.7462, "mean_token_accuracy": 0.7767568111419678, "num_tokens": 310028051.0, "step": 29890 }, { "entropy": 0.6559141755104065, "epoch": 0.2392, "grad_norm": 3.92087984085083, "learning_rate": 3.805562224889956e-05, "loss": 0.649, "mean_token_accuracy": 0.8210954189300537, "num_tokens": 310067367.0, "step": 29900 }, { "entropy": 0.6835334420204162, "epoch": 0.23928, "grad_norm": 2.4502384662628174, "learning_rate": 3.805162064825931e-05, "loss": 0.6757, "mean_token_accuracy": 0.7934924960136414, "num_tokens": 310229724.0, "step": 29910 }, { "entropy": 0.6515567123889923, "epoch": 0.23936, "grad_norm": 3.3504292964935303, "learning_rate": 3.804761904761905e-05, "loss": 0.643, "mean_token_accuracy": 0.8158716917037964, "num_tokens": 310301885.0, "step": 29920 }, { "entropy": 0.7725499153137207, "epoch": 0.23944, "grad_norm": 2.2583558559417725, "learning_rate": 3.8043617446978794e-05, "loss": 0.7798, "mean_token_accuracy": 0.7854190409183502, "num_tokens": 310395768.0, "step": 29930 }, { "entropy": 0.7569449245929718, "epoch": 0.23952, "grad_norm": 3.0039315223693848, "learning_rate": 3.803961584633854e-05, "loss": 0.7548, "mean_token_accuracy": 0.781719696521759, "num_tokens": 310525595.0, "step": 29940 }, { "entropy": 0.8112922430038452, "epoch": 0.2396, "grad_norm": 4.408492565155029, "learning_rate": 3.803561424569828e-05, "loss": 0.7987, "mean_token_accuracy": 0.7889165937900543, "num_tokens": 310561716.0, "step": 29950 }, { "entropy": 0.621287390589714, "epoch": 0.23968, "grad_norm": 1.5304372310638428, "learning_rate": 3.8031612645058025e-05, "loss": 0.6232, "mean_token_accuracy": 0.8011419177055359, "num_tokens": 310725556.0, "step": 29960 }, { "entropy": 0.7021441251039505, "epoch": 0.23976, "grad_norm": 4.909167766571045, "learning_rate": 3.802761104441777e-05, "loss": 0.6978, "mean_token_accuracy": 0.8012277185916901, "num_tokens": 310817467.0, "step": 29970 }, { "entropy": 0.6788117468357087, "epoch": 0.23984, "grad_norm": 1.4661452770233154, "learning_rate": 3.802360944377751e-05, "loss": 0.6744, "mean_token_accuracy": 0.8042880296707153, "num_tokens": 310911990.0, "step": 29980 }, { "entropy": 0.7029042959213256, "epoch": 0.23992, "grad_norm": 2.053616762161255, "learning_rate": 3.8019607843137257e-05, "loss": 0.6922, "mean_token_accuracy": 0.7851566135883331, "num_tokens": 311060999.0, "step": 29990 }, { "entropy": 0.6646015644073486, "epoch": 0.24, "grad_norm": 4.623722076416016, "learning_rate": 3.8015606242497e-05, "loss": 0.6674, "mean_token_accuracy": 0.8192285418510437, "num_tokens": 311101134.0, "step": 30000 }, { "entropy": 0.6394200205802918, "epoch": 0.24008, "grad_norm": 1.6872361898422241, "learning_rate": 3.8011604641856744e-05, "loss": 0.6481, "mean_token_accuracy": 0.7952674686908722, "num_tokens": 311264974.0, "step": 30010 }, { "entropy": 0.6254288077354431, "epoch": 0.24016, "grad_norm": 3.412062406539917, "learning_rate": 3.800760304121649e-05, "loss": 0.6102, "mean_token_accuracy": 0.8200756669044494, "num_tokens": 311346125.0, "step": 30020 }, { "entropy": 0.7145704448223114, "epoch": 0.24024, "grad_norm": 2.0838584899902344, "learning_rate": 3.800360144057623e-05, "loss": 0.7332, "mean_token_accuracy": 0.7933964848518371, "num_tokens": 311440677.0, "step": 30030 }, { "entropy": 0.7558577954769135, "epoch": 0.24032, "grad_norm": 2.463989019393921, "learning_rate": 3.799959983993598e-05, "loss": 0.7553, "mean_token_accuracy": 0.7799119293689728, "num_tokens": 311572032.0, "step": 30040 }, { "entropy": 0.6908641576766967, "epoch": 0.2404, "grad_norm": 4.516998291015625, "learning_rate": 3.799559823929572e-05, "loss": 0.6747, "mean_token_accuracy": 0.8122901499271393, "num_tokens": 311614100.0, "step": 30050 }, { "entropy": 0.6914568901062011, "epoch": 0.24048, "grad_norm": 1.6938687562942505, "learning_rate": 3.799159663865546e-05, "loss": 0.6908, "mean_token_accuracy": 0.7865634262561798, "num_tokens": 311777530.0, "step": 30060 }, { "entropy": 0.7670585811138153, "epoch": 0.24056, "grad_norm": 3.157313346862793, "learning_rate": 3.7987595038015206e-05, "loss": 0.7578, "mean_token_accuracy": 0.7890501976013183, "num_tokens": 311854860.0, "step": 30070 }, { "entropy": 0.7270930886268616, "epoch": 0.24064, "grad_norm": 2.1837284564971924, "learning_rate": 3.798359343737496e-05, "loss": 0.7236, "mean_token_accuracy": 0.7965595185756683, "num_tokens": 311948352.0, "step": 30080 }, { "entropy": 0.6960250556468963, "epoch": 0.24072, "grad_norm": 3.875000476837158, "learning_rate": 3.7979591836734694e-05, "loss": 0.6868, "mean_token_accuracy": 0.790935468673706, "num_tokens": 312086469.0, "step": 30090 }, { "entropy": 0.6658692330121994, "epoch": 0.2408, "grad_norm": 4.111528396606445, "learning_rate": 3.797559023609444e-05, "loss": 0.6646, "mean_token_accuracy": 0.8246415972709655, "num_tokens": 312124820.0, "step": 30100 }, { "entropy": 0.629214757680893, "epoch": 0.24088, "grad_norm": 1.5025687217712402, "learning_rate": 3.797158863545419e-05, "loss": 0.6318, "mean_token_accuracy": 0.798308688402176, "num_tokens": 312288457.0, "step": 30110 }, { "entropy": 0.6470441877841949, "epoch": 0.24096, "grad_norm": 3.5079691410064697, "learning_rate": 3.796758703481393e-05, "loss": 0.6408, "mean_token_accuracy": 0.8161898851394653, "num_tokens": 312369915.0, "step": 30120 }, { "entropy": 0.6710689961910248, "epoch": 0.24104, "grad_norm": 1.9791545867919922, "learning_rate": 3.796358543417367e-05, "loss": 0.665, "mean_token_accuracy": 0.8083407282829285, "num_tokens": 312462479.0, "step": 30130 }, { "entropy": 0.6871200442314148, "epoch": 0.24112, "grad_norm": 2.704596757888794, "learning_rate": 3.795958383353341e-05, "loss": 0.6919, "mean_token_accuracy": 0.7857816576957702, "num_tokens": 312610150.0, "step": 30140 }, { "entropy": 0.6872598648071289, "epoch": 0.2412, "grad_norm": 4.482398509979248, "learning_rate": 3.795558223289316e-05, "loss": 0.6712, "mean_token_accuracy": 0.8144765555858612, "num_tokens": 312651230.0, "step": 30150 }, { "entropy": 0.6527845442295075, "epoch": 0.24128, "grad_norm": 2.0744881629943848, "learning_rate": 3.7951580632252906e-05, "loss": 0.6539, "mean_token_accuracy": 0.7908463656902314, "num_tokens": 312815070.0, "step": 30160 }, { "entropy": 0.691580718755722, "epoch": 0.24136, "grad_norm": 3.539292573928833, "learning_rate": 3.794757903161264e-05, "loss": 0.6875, "mean_token_accuracy": 0.8001800537109375, "num_tokens": 312905251.0, "step": 30170 }, { "entropy": 0.7174664378166199, "epoch": 0.24144, "grad_norm": 1.3002487421035767, "learning_rate": 3.7943577430972394e-05, "loss": 0.7286, "mean_token_accuracy": 0.7942419826984406, "num_tokens": 313000136.0, "step": 30180 }, { "entropy": 0.654313063621521, "epoch": 0.24152, "grad_norm": 2.2838990688323975, "learning_rate": 3.793957583033214e-05, "loss": 0.6486, "mean_token_accuracy": 0.8025494635105133, "num_tokens": 313141508.0, "step": 30190 }, { "entropy": 0.618103775382042, "epoch": 0.2416, "grad_norm": 5.36961555480957, "learning_rate": 3.793557422969188e-05, "loss": 0.6166, "mean_token_accuracy": 0.8312333583831787, "num_tokens": 313184019.0, "step": 30200 }, { "entropy": 0.6276259660720825, "epoch": 0.24168, "grad_norm": 1.834195852279663, "learning_rate": 3.793157262905162e-05, "loss": 0.6251, "mean_token_accuracy": 0.7958170592784881, "num_tokens": 313347859.0, "step": 30210 }, { "entropy": 0.6853401362895966, "epoch": 0.24176, "grad_norm": 2.7976529598236084, "learning_rate": 3.792757102841137e-05, "loss": 0.6819, "mean_token_accuracy": 0.8038802921772004, "num_tokens": 313436041.0, "step": 30220 }, { "entropy": 0.6939248859882354, "epoch": 0.24184, "grad_norm": 1.9971060752868652, "learning_rate": 3.792356942777111e-05, "loss": 0.6909, "mean_token_accuracy": 0.799696558713913, "num_tokens": 313529701.0, "step": 30230 }, { "entropy": 0.7396000385284424, "epoch": 0.24192, "grad_norm": 2.085036516189575, "learning_rate": 3.7919567827130856e-05, "loss": 0.7339, "mean_token_accuracy": 0.7806006491184234, "num_tokens": 313667314.0, "step": 30240 }, { "entropy": 0.6899922341108322, "epoch": 0.242, "grad_norm": 4.805088043212891, "learning_rate": 3.791556622649059e-05, "loss": 0.6973, "mean_token_accuracy": 0.8148862183094024, "num_tokens": 313707857.0, "step": 30250 }, { "entropy": 0.7050902903079986, "epoch": 0.24208, "grad_norm": 1.640560507774353, "learning_rate": 3.7911564625850344e-05, "loss": 0.7037, "mean_token_accuracy": 0.7793844282627106, "num_tokens": 313871489.0, "step": 30260 }, { "entropy": 0.7031037390232087, "epoch": 0.24216, "grad_norm": 3.014374017715454, "learning_rate": 3.790756302521009e-05, "loss": 0.6983, "mean_token_accuracy": 0.8029541790485382, "num_tokens": 313945340.0, "step": 30270 }, { "entropy": 0.7385219037532806, "epoch": 0.24224, "grad_norm": 1.5967329740524292, "learning_rate": 3.790356142456983e-05, "loss": 0.7189, "mean_token_accuracy": 0.7923491835594177, "num_tokens": 314039976.0, "step": 30280 }, { "entropy": 0.6560667157173157, "epoch": 0.24232, "grad_norm": 2.550929307937622, "learning_rate": 3.7899559823929575e-05, "loss": 0.6643, "mean_token_accuracy": 0.799645620584488, "num_tokens": 314187939.0, "step": 30290 }, { "entropy": 0.6640577852725983, "epoch": 0.2424, "grad_norm": 3.68214750289917, "learning_rate": 3.789555822328932e-05, "loss": 0.6546, "mean_token_accuracy": 0.8180967688560485, "num_tokens": 314234021.0, "step": 30300 }, { "entropy": 0.6990637838840484, "epoch": 0.24248, "grad_norm": 1.3784563541412354, "learning_rate": 3.789155662264906e-05, "loss": 0.6953, "mean_token_accuracy": 0.7827491521835327, "num_tokens": 314397861.0, "step": 30310 }, { "entropy": 0.6809579282999039, "epoch": 0.24256, "grad_norm": 3.382974147796631, "learning_rate": 3.7887555022008806e-05, "loss": 0.6777, "mean_token_accuracy": 0.8082246840000152, "num_tokens": 314480716.0, "step": 30320 }, { "entropy": 0.7377672910690307, "epoch": 0.24264, "grad_norm": 2.1689858436584473, "learning_rate": 3.788355342136855e-05, "loss": 0.7399, "mean_token_accuracy": 0.7910287737846374, "num_tokens": 314573937.0, "step": 30330 }, { "entropy": 0.712227326631546, "epoch": 0.24272, "grad_norm": 3.34438419342041, "learning_rate": 3.787955182072829e-05, "loss": 0.7163, "mean_token_accuracy": 0.7860293328762055, "num_tokens": 314716413.0, "step": 30340 }, { "entropy": 0.63404680788517, "epoch": 0.2428, "grad_norm": 5.510424613952637, "learning_rate": 3.787555022008804e-05, "loss": 0.6207, "mean_token_accuracy": 0.8324467360973358, "num_tokens": 314754316.0, "step": 30350 }, { "entropy": 0.6648972690105438, "epoch": 0.24288, "grad_norm": 1.7801740169525146, "learning_rate": 3.787154861944778e-05, "loss": 0.6699, "mean_token_accuracy": 0.7871458292007446, "num_tokens": 314918156.0, "step": 30360 }, { "entropy": 0.7580771088600159, "epoch": 0.24296, "grad_norm": 3.3589022159576416, "learning_rate": 3.7867547018807524e-05, "loss": 0.7529, "mean_token_accuracy": 0.7834067463874816, "num_tokens": 315000598.0, "step": 30370 }, { "entropy": 0.6665680766105652, "epoch": 0.24304, "grad_norm": 2.0877645015716553, "learning_rate": 3.786354541816727e-05, "loss": 0.6688, "mean_token_accuracy": 0.8089056730270385, "num_tokens": 315093807.0, "step": 30380 }, { "entropy": 0.6995831429958344, "epoch": 0.24312, "grad_norm": 2.1386752128601074, "learning_rate": 3.785954381752701e-05, "loss": 0.6981, "mean_token_accuracy": 0.7853460729122161, "num_tokens": 315231657.0, "step": 30390 }, { "entropy": 0.7064889907836914, "epoch": 0.2432, "grad_norm": 5.429035663604736, "learning_rate": 3.7855542216886756e-05, "loss": 0.6885, "mean_token_accuracy": 0.814093041419983, "num_tokens": 315268423.0, "step": 30400 }, { "entropy": 0.6655415177345276, "epoch": 0.24328, "grad_norm": 1.574379324913025, "learning_rate": 3.78515406162465e-05, "loss": 0.6635, "mean_token_accuracy": 0.7917134881019592, "num_tokens": 315432263.0, "step": 30410 }, { "entropy": 0.7175698757171631, "epoch": 0.24336, "grad_norm": 2.8040213584899902, "learning_rate": 3.784753901560624e-05, "loss": 0.7086, "mean_token_accuracy": 0.7969100534915924, "num_tokens": 315528974.0, "step": 30420 }, { "entropy": 0.7040822982788086, "epoch": 0.24344, "grad_norm": 1.5095959901809692, "learning_rate": 3.7843537414965993e-05, "loss": 0.7163, "mean_token_accuracy": 0.7924989461898804, "num_tokens": 315622377.0, "step": 30430 }, { "entropy": 0.7354205071926116, "epoch": 0.24352, "grad_norm": 2.7128682136535645, "learning_rate": 3.783953581432573e-05, "loss": 0.7288, "mean_token_accuracy": 0.7815457701683044, "num_tokens": 315753834.0, "step": 30440 }, { "entropy": 0.6747774243354797, "epoch": 0.2436, "grad_norm": 5.526517391204834, "learning_rate": 3.7835534213685474e-05, "loss": 0.6785, "mean_token_accuracy": 0.8160153031349182, "num_tokens": 315792239.0, "step": 30450 }, { "entropy": 0.6176472067832947, "epoch": 0.24368, "grad_norm": 1.898470163345337, "learning_rate": 3.783153261304522e-05, "loss": 0.619, "mean_token_accuracy": 0.8007633209228515, "num_tokens": 315956079.0, "step": 30460 }, { "entropy": 0.6751412957906723, "epoch": 0.24376, "grad_norm": 3.2231321334838867, "learning_rate": 3.782753101240497e-05, "loss": 0.6545, "mean_token_accuracy": 0.8044045627117157, "num_tokens": 316056433.0, "step": 30470 }, { "entropy": 0.7402957230806351, "epoch": 0.24384, "grad_norm": 1.4463495016098022, "learning_rate": 3.7823529411764705e-05, "loss": 0.744, "mean_token_accuracy": 0.7912320494651794, "num_tokens": 316150530.0, "step": 30480 }, { "entropy": 0.668220728635788, "epoch": 0.24392, "grad_norm": 2.3411648273468018, "learning_rate": 3.781952781112445e-05, "loss": 0.6665, "mean_token_accuracy": 0.7912035465240479, "num_tokens": 316300048.0, "step": 30490 }, { "entropy": 0.6685658633708954, "epoch": 0.244, "grad_norm": 4.500831604003906, "learning_rate": 3.78155262104842e-05, "loss": 0.676, "mean_token_accuracy": 0.8140061378479004, "num_tokens": 316340939.0, "step": 30500 }, { "entropy": 0.6748408377170563, "epoch": 0.24408, "grad_norm": 2.204741954803467, "learning_rate": 3.781152460984394e-05, "loss": 0.6762, "mean_token_accuracy": 0.7853932559490204, "num_tokens": 316504779.0, "step": 30510 }, { "entropy": 0.6307462394237519, "epoch": 0.24416, "grad_norm": 3.556439161300659, "learning_rate": 3.780752300920368e-05, "loss": 0.6177, "mean_token_accuracy": 0.8178483009338379, "num_tokens": 316597073.0, "step": 30520 }, { "entropy": 0.7508694231510162, "epoch": 0.24424, "grad_norm": 1.3600486516952515, "learning_rate": 3.7803521408563424e-05, "loss": 0.7507, "mean_token_accuracy": 0.7883583605289459, "num_tokens": 316692790.0, "step": 30530 }, { "entropy": 0.6937614619731903, "epoch": 0.24432, "grad_norm": 2.607452869415283, "learning_rate": 3.7799519807923174e-05, "loss": 0.6945, "mean_token_accuracy": 0.7882154047489166, "num_tokens": 316833565.0, "step": 30540 }, { "entropy": 0.6463418364524841, "epoch": 0.2444, "grad_norm": 4.782273292541504, "learning_rate": 3.779551820728292e-05, "loss": 0.6351, "mean_token_accuracy": 0.8203496217727662, "num_tokens": 316873438.0, "step": 30550 }, { "entropy": 0.7039958596229553, "epoch": 0.24448, "grad_norm": 1.8844484090805054, "learning_rate": 3.7791516606642655e-05, "loss": 0.7087, "mean_token_accuracy": 0.7807776272296906, "num_tokens": 317037228.0, "step": 30560 }, { "entropy": 0.7059263467788697, "epoch": 0.24456, "grad_norm": 3.0703580379486084, "learning_rate": 3.7787515006002405e-05, "loss": 0.6915, "mean_token_accuracy": 0.8045436859130859, "num_tokens": 317120597.0, "step": 30570 }, { "entropy": 0.6677689790725708, "epoch": 0.24464, "grad_norm": 2.958277702331543, "learning_rate": 3.778351340536215e-05, "loss": 0.6713, "mean_token_accuracy": 0.8120179176330566, "num_tokens": 317213937.0, "step": 30580 }, { "entropy": 0.7575699687004089, "epoch": 0.24472, "grad_norm": 2.9022910594940186, "learning_rate": 3.777951180472189e-05, "loss": 0.7518, "mean_token_accuracy": 0.7741329431533813, "num_tokens": 317353407.0, "step": 30590 }, { "entropy": 0.7294464141130448, "epoch": 0.2448, "grad_norm": 5.447605133056641, "learning_rate": 3.777551020408163e-05, "loss": 0.7304, "mean_token_accuracy": 0.809649521112442, "num_tokens": 317392605.0, "step": 30600 }, { "entropy": 0.6662223637104034, "epoch": 0.24488, "grad_norm": 1.8603283166885376, "learning_rate": 3.777150860344138e-05, "loss": 0.6754, "mean_token_accuracy": 0.7887151896953583, "num_tokens": 317556445.0, "step": 30610 }, { "entropy": 0.6824384927749634, "epoch": 0.24496, "grad_norm": 3.624852180480957, "learning_rate": 3.7767507002801124e-05, "loss": 0.6693, "mean_token_accuracy": 0.8061619281768799, "num_tokens": 317650345.0, "step": 30620 }, { "entropy": 0.7423516035079956, "epoch": 0.24504, "grad_norm": 1.5599627494812012, "learning_rate": 3.776350540216087e-05, "loss": 0.7258, "mean_token_accuracy": 0.7899595975875855, "num_tokens": 317745346.0, "step": 30630 }, { "entropy": 0.6836314141750336, "epoch": 0.24512, "grad_norm": 3.2635679244995117, "learning_rate": 3.775950380152061e-05, "loss": 0.6905, "mean_token_accuracy": 0.7958535432815552, "num_tokens": 317876798.0, "step": 30640 }, { "entropy": 0.729744267463684, "epoch": 0.2452, "grad_norm": 4.0854315757751465, "learning_rate": 3.7755502200880355e-05, "loss": 0.7255, "mean_token_accuracy": 0.8122419059276581, "num_tokens": 317909688.0, "step": 30650 }, { "entropy": 0.6743601083755493, "epoch": 0.24528, "grad_norm": 2.1361799240112305, "learning_rate": 3.77515006002401e-05, "loss": 0.6772, "mean_token_accuracy": 0.7856314182281494, "num_tokens": 318073528.0, "step": 30660 }, { "entropy": 0.6828620791435241, "epoch": 0.24536, "grad_norm": 2.956242322921753, "learning_rate": 3.774749899959984e-05, "loss": 0.6823, "mean_token_accuracy": 0.8081115067005158, "num_tokens": 318160513.0, "step": 30670 }, { "entropy": 0.7365793824195862, "epoch": 0.24544, "grad_norm": 1.5300770998001099, "learning_rate": 3.7743497398959586e-05, "loss": 0.731, "mean_token_accuracy": 0.7913193106651306, "num_tokens": 318255711.0, "step": 30680 }, { "entropy": 0.7393994003534317, "epoch": 0.24552, "grad_norm": 2.706007957458496, "learning_rate": 3.773949579831933e-05, "loss": 0.7421, "mean_token_accuracy": 0.7788161635398865, "num_tokens": 318394017.0, "step": 30690 }, { "entropy": 0.7925901591777802, "epoch": 0.2456, "grad_norm": 4.653609275817871, "learning_rate": 3.7735494197679074e-05, "loss": 0.7856, "mean_token_accuracy": 0.7943347334861756, "num_tokens": 318435084.0, "step": 30700 }, { "entropy": 0.6616117298603058, "epoch": 0.24568, "grad_norm": 1.5033907890319824, "learning_rate": 3.773149259703882e-05, "loss": 0.6489, "mean_token_accuracy": 0.7940828084945679, "num_tokens": 318598924.0, "step": 30710 }, { "entropy": 0.6649498164653778, "epoch": 0.24576, "grad_norm": 3.448817729949951, "learning_rate": 3.772749099639856e-05, "loss": 0.663, "mean_token_accuracy": 0.8092827379703522, "num_tokens": 318694699.0, "step": 30720 }, { "entropy": 0.6691945374011994, "epoch": 0.24584, "grad_norm": 1.4433579444885254, "learning_rate": 3.7723489395758305e-05, "loss": 0.6703, "mean_token_accuracy": 0.8060293793678284, "num_tokens": 318789079.0, "step": 30730 }, { "entropy": 0.7359948515892029, "epoch": 0.24592, "grad_norm": 2.9491727352142334, "learning_rate": 3.771948779511805e-05, "loss": 0.7255, "mean_token_accuracy": 0.7786312162876129, "num_tokens": 318928755.0, "step": 30740 }, { "entropy": 0.722878035902977, "epoch": 0.246, "grad_norm": 5.219223976135254, "learning_rate": 3.771548619447779e-05, "loss": 0.7259, "mean_token_accuracy": 0.8066924273967743, "num_tokens": 318967025.0, "step": 30750 }, { "entropy": 0.7460953176021576, "epoch": 0.24608, "grad_norm": 2.244075298309326, "learning_rate": 3.7711484593837536e-05, "loss": 0.7418, "mean_token_accuracy": 0.7748908221721649, "num_tokens": 319130526.0, "step": 30760 }, { "entropy": 0.5918509632349014, "epoch": 0.24616, "grad_norm": 3.1413614749908447, "learning_rate": 3.770748299319728e-05, "loss": 0.5772, "mean_token_accuracy": 0.8295235574245453, "num_tokens": 319209524.0, "step": 30770 }, { "entropy": 0.652065908908844, "epoch": 0.24624, "grad_norm": 1.5742086172103882, "learning_rate": 3.770348139255703e-05, "loss": 0.6548, "mean_token_accuracy": 0.8077960312366486, "num_tokens": 319303887.0, "step": 30780 }, { "entropy": 0.7254481792449952, "epoch": 0.24632, "grad_norm": 2.1296558380126953, "learning_rate": 3.769947979191677e-05, "loss": 0.7282, "mean_token_accuracy": 0.7807018220424652, "num_tokens": 319443023.0, "step": 30790 }, { "entropy": 0.7640116721391678, "epoch": 0.2464, "grad_norm": 5.092154026031494, "learning_rate": 3.769547819127651e-05, "loss": 0.746, "mean_token_accuracy": 0.8005515158176422, "num_tokens": 319480388.0, "step": 30800 }, { "entropy": 0.7312660455703736, "epoch": 0.24648, "grad_norm": 2.089869260787964, "learning_rate": 3.7691476590636255e-05, "loss": 0.7205, "mean_token_accuracy": 0.7802857160568237, "num_tokens": 319641693.0, "step": 30810 }, { "entropy": 0.6144313365221024, "epoch": 0.24656, "grad_norm": 3.2794082164764404, "learning_rate": 3.7687474989996005e-05, "loss": 0.622, "mean_token_accuracy": 0.8190390110015869, "num_tokens": 319714213.0, "step": 30820 }, { "entropy": 0.6784069240093231, "epoch": 0.24664, "grad_norm": 1.8709361553192139, "learning_rate": 3.768347338935574e-05, "loss": 0.6893, "mean_token_accuracy": 0.8022107005119323, "num_tokens": 319807803.0, "step": 30830 }, { "entropy": 0.6616003692150116, "epoch": 0.24672, "grad_norm": 2.1286497116088867, "learning_rate": 3.7679471788715486e-05, "loss": 0.6496, "mean_token_accuracy": 0.7964877963066102, "num_tokens": 319950243.0, "step": 30840 }, { "entropy": 0.640373808145523, "epoch": 0.2468, "grad_norm": 7.062947750091553, "learning_rate": 3.767547018807523e-05, "loss": 0.6389, "mean_token_accuracy": 0.8227182269096375, "num_tokens": 319991631.0, "step": 30850 }, { "entropy": 0.6535553634166718, "epoch": 0.24688, "grad_norm": 1.3503942489624023, "learning_rate": 3.767146858743498e-05, "loss": 0.652, "mean_token_accuracy": 0.7926661014556885, "num_tokens": 320155471.0, "step": 30860 }, { "entropy": 0.6911949217319489, "epoch": 0.24696, "grad_norm": 4.2441205978393555, "learning_rate": 3.766746698679472e-05, "loss": 0.6877, "mean_token_accuracy": 0.807195496559143, "num_tokens": 320233319.0, "step": 30870 }, { "entropy": 0.6857247650623322, "epoch": 0.24704, "grad_norm": 1.9198671579360962, "learning_rate": 3.766346538615446e-05, "loss": 0.6983, "mean_token_accuracy": 0.7983223378658295, "num_tokens": 320327122.0, "step": 30880 }, { "entropy": 0.6602204620838166, "epoch": 0.24712, "grad_norm": 2.8895153999328613, "learning_rate": 3.765946378551421e-05, "loss": 0.645, "mean_token_accuracy": 0.8015794217586517, "num_tokens": 320462163.0, "step": 30890 }, { "entropy": 0.6584017753601075, "epoch": 0.2472, "grad_norm": 5.357555866241455, "learning_rate": 3.7655462184873955e-05, "loss": 0.6425, "mean_token_accuracy": 0.8212615311145782, "num_tokens": 320500670.0, "step": 30900 }, { "entropy": 0.6738174140453339, "epoch": 0.24728, "grad_norm": 1.952819585800171, "learning_rate": 3.765146058423369e-05, "loss": 0.6804, "mean_token_accuracy": 0.7865168571472168, "num_tokens": 320664510.0, "step": 30910 }, { "entropy": 0.693743360042572, "epoch": 0.24736, "grad_norm": 4.071038722991943, "learning_rate": 3.7647458983593435e-05, "loss": 0.6764, "mean_token_accuracy": 0.802759337425232, "num_tokens": 320750298.0, "step": 30920 }, { "entropy": 0.6860406696796417, "epoch": 0.24744, "grad_norm": 2.932060718536377, "learning_rate": 3.7643457382953186e-05, "loss": 0.7006, "mean_token_accuracy": 0.7995186448097229, "num_tokens": 320844011.0, "step": 30930 }, { "entropy": 0.6644232392311096, "epoch": 0.24752, "grad_norm": 3.0816996097564697, "learning_rate": 3.763945578231293e-05, "loss": 0.6563, "mean_token_accuracy": 0.7960717439651489, "num_tokens": 320986852.0, "step": 30940 }, { "entropy": 0.7443441212177276, "epoch": 0.2476, "grad_norm": 4.672768592834473, "learning_rate": 3.7635454181672667e-05, "loss": 0.7235, "mean_token_accuracy": 0.8047987520694733, "num_tokens": 321026151.0, "step": 30950 }, { "entropy": 0.6592734277248382, "epoch": 0.24768, "grad_norm": 1.9299172163009644, "learning_rate": 3.763145258103242e-05, "loss": 0.6617, "mean_token_accuracy": 0.7907152116298676, "num_tokens": 321188824.0, "step": 30960 }, { "entropy": 0.6785060405731201, "epoch": 0.24776, "grad_norm": 2.8866915702819824, "learning_rate": 3.762745098039216e-05, "loss": 0.678, "mean_token_accuracy": 0.804982715845108, "num_tokens": 321270850.0, "step": 30970 }, { "entropy": 0.7464880347251892, "epoch": 0.24784, "grad_norm": 1.5145431756973267, "learning_rate": 3.7623449379751905e-05, "loss": 0.7597, "mean_token_accuracy": 0.7858839571475983, "num_tokens": 321366284.0, "step": 30980 }, { "entropy": 0.7365483999252319, "epoch": 0.24792, "grad_norm": 3.000631809234619, "learning_rate": 3.761944777911164e-05, "loss": 0.7336, "mean_token_accuracy": 0.7822255969047547, "num_tokens": 321493756.0, "step": 30990 }, { "entropy": 0.6843784928321839, "epoch": 0.248, "grad_norm": 8.730759620666504, "learning_rate": 3.761544617847139e-05, "loss": 0.6906, "mean_token_accuracy": 0.8170080602169036, "num_tokens": 321529818.0, "step": 31000 }, { "entropy": 0.6801569283008575, "epoch": 0.24808, "grad_norm": 1.9571399688720703, "learning_rate": 3.7611444577831136e-05, "loss": 0.6764, "mean_token_accuracy": 0.7854665398597718, "num_tokens": 321693658.0, "step": 31010 }, { "entropy": 0.6731355726718903, "epoch": 0.24816, "grad_norm": 3.8839168548583984, "learning_rate": 3.760744297719088e-05, "loss": 0.6631, "mean_token_accuracy": 0.8060202419757843, "num_tokens": 321791489.0, "step": 31020 }, { "entropy": 0.7179898858070374, "epoch": 0.24824, "grad_norm": 2.5646796226501465, "learning_rate": 3.760344137655062e-05, "loss": 0.716, "mean_token_accuracy": 0.796361643075943, "num_tokens": 321884675.0, "step": 31030 }, { "entropy": 0.6279147505760193, "epoch": 0.24832, "grad_norm": 2.7716410160064697, "learning_rate": 3.759943977591037e-05, "loss": 0.623, "mean_token_accuracy": 0.8096747815608978, "num_tokens": 322012776.0, "step": 31040 }, { "entropy": 0.6268857777118683, "epoch": 0.2484, "grad_norm": 5.698651313781738, "learning_rate": 3.759543817527011e-05, "loss": 0.6173, "mean_token_accuracy": 0.8341599404811859, "num_tokens": 322044728.0, "step": 31050 }, { "entropy": 0.6888859093189239, "epoch": 0.24848, "grad_norm": 1.9714487791061401, "learning_rate": 3.7591436574629854e-05, "loss": 0.6915, "mean_token_accuracy": 0.784770405292511, "num_tokens": 322208568.0, "step": 31060 }, { "entropy": 0.6686871707439422, "epoch": 0.24856, "grad_norm": 3.484221935272217, "learning_rate": 3.75874349739896e-05, "loss": 0.6638, "mean_token_accuracy": 0.8096802651882171, "num_tokens": 322296792.0, "step": 31070 }, { "entropy": 0.6732590734958649, "epoch": 0.24864, "grad_norm": 2.1776394844055176, "learning_rate": 3.758343337334934e-05, "loss": 0.6747, "mean_token_accuracy": 0.8086455702781677, "num_tokens": 322390108.0, "step": 31080 }, { "entropy": 0.6677290469408035, "epoch": 0.24872, "grad_norm": 2.5565009117126465, "learning_rate": 3.7579431772709085e-05, "loss": 0.6669, "mean_token_accuracy": 0.7976166367530823, "num_tokens": 322522302.0, "step": 31090 }, { "entropy": 0.6678585171699524, "epoch": 0.2488, "grad_norm": 4.466791152954102, "learning_rate": 3.757543017206883e-05, "loss": 0.6615, "mean_token_accuracy": 0.8222806930541993, "num_tokens": 322559436.0, "step": 31100 }, { "entropy": 0.6326424419879914, "epoch": 0.24888, "grad_norm": 1.7481451034545898, "learning_rate": 3.757142857142857e-05, "loss": 0.6294, "mean_token_accuracy": 0.8010749518871307, "num_tokens": 322723231.0, "step": 31110 }, { "entropy": 0.5934151589870453, "epoch": 0.24896, "grad_norm": 2.6774375438690186, "learning_rate": 3.7567426970788316e-05, "loss": 0.5914, "mean_token_accuracy": 0.8269140362739563, "num_tokens": 322804488.0, "step": 31120 }, { "entropy": 0.7447859168052673, "epoch": 0.24904, "grad_norm": 1.948538899421692, "learning_rate": 3.756342537014806e-05, "loss": 0.7405, "mean_token_accuracy": 0.7921926379203796, "num_tokens": 322896514.0, "step": 31130 }, { "entropy": 0.7115286588668823, "epoch": 0.24912, "grad_norm": 2.4858951568603516, "learning_rate": 3.7559423769507804e-05, "loss": 0.7124, "mean_token_accuracy": 0.7871362566947937, "num_tokens": 323035222.0, "step": 31140 }, { "entropy": 0.6702906131744385, "epoch": 0.2492, "grad_norm": 5.54602575302124, "learning_rate": 3.755542216886755e-05, "loss": 0.6566, "mean_token_accuracy": 0.8212243437767028, "num_tokens": 323072666.0, "step": 31150 }, { "entropy": 0.6605241447687149, "epoch": 0.24928, "grad_norm": 2.482697010040283, "learning_rate": 3.755142056822729e-05, "loss": 0.6518, "mean_token_accuracy": 0.7930447041988373, "num_tokens": 323236506.0, "step": 31160 }, { "entropy": 0.6487137109041214, "epoch": 0.24936, "grad_norm": 3.3306703567504883, "learning_rate": 3.754741896758704e-05, "loss": 0.6483, "mean_token_accuracy": 0.8132934987545013, "num_tokens": 323328110.0, "step": 31170 }, { "entropy": 0.6714668512344361, "epoch": 0.24944, "grad_norm": 1.8055839538574219, "learning_rate": 3.754341736694678e-05, "loss": 0.6788, "mean_token_accuracy": 0.8056685745716095, "num_tokens": 323420891.0, "step": 31180 }, { "entropy": 0.6842451572418213, "epoch": 0.24952, "grad_norm": 2.3758904933929443, "learning_rate": 3.753941576630652e-05, "loss": 0.6772, "mean_token_accuracy": 0.7934255182743073, "num_tokens": 323564636.0, "step": 31190 }, { "entropy": 0.6577907502651215, "epoch": 0.2496, "grad_norm": 4.564128398895264, "learning_rate": 3.7535414165666266e-05, "loss": 0.6549, "mean_token_accuracy": 0.8219186067581177, "num_tokens": 323604792.0, "step": 31200 }, { "entropy": 0.656028437614441, "epoch": 0.24968, "grad_norm": 1.5438226461410522, "learning_rate": 3.753141256502602e-05, "loss": 0.6571, "mean_token_accuracy": 0.7935271263122559, "num_tokens": 323768632.0, "step": 31210 }, { "entropy": 0.6398509591817856, "epoch": 0.24976, "grad_norm": 3.097520589828491, "learning_rate": 3.7527410964385754e-05, "loss": 0.6327, "mean_token_accuracy": 0.8110521495342254, "num_tokens": 323859456.0, "step": 31220 }, { "entropy": 0.65718994140625, "epoch": 0.24984, "grad_norm": 2.2895309925079346, "learning_rate": 3.75234093637455e-05, "loss": 0.6449, "mean_token_accuracy": 0.8092311441898346, "num_tokens": 323954196.0, "step": 31230 }, { "entropy": 0.6889060318470002, "epoch": 0.24992, "grad_norm": 2.0604476928710938, "learning_rate": 3.751940776310525e-05, "loss": 0.6922, "mean_token_accuracy": 0.7944556474685669, "num_tokens": 324086298.0, "step": 31240 }, { "entropy": 0.7225409388542176, "epoch": 0.25, "grad_norm": 4.780895709991455, "learning_rate": 3.751540616246499e-05, "loss": 0.7267, "mean_token_accuracy": 0.8097398817539215, "num_tokens": 324120417.0, "step": 31250 }, { "entropy": 0.6466030716896057, "epoch": 0.25008, "grad_norm": 2.004157781600952, "learning_rate": 3.751140456182473e-05, "loss": 0.6419, "mean_token_accuracy": 0.7938751876354218, "num_tokens": 324284257.0, "step": 31260 }, { "entropy": 0.679465788602829, "epoch": 0.25016, "grad_norm": 2.9807116985321045, "learning_rate": 3.750740296118447e-05, "loss": 0.6675, "mean_token_accuracy": 0.8078491687774658, "num_tokens": 324369839.0, "step": 31270 }, { "entropy": 0.7773918211460114, "epoch": 0.25024, "grad_norm": 1.731088638305664, "learning_rate": 3.750340136054422e-05, "loss": 0.7881, "mean_token_accuracy": 0.7805634558200836, "num_tokens": 324463743.0, "step": 31280 }, { "entropy": 0.7081584453582763, "epoch": 0.25032, "grad_norm": 2.4834396839141846, "learning_rate": 3.7499399759903966e-05, "loss": 0.6993, "mean_token_accuracy": 0.7906081855297089, "num_tokens": 324598727.0, "step": 31290 }, { "entropy": 0.7342389404773713, "epoch": 0.2504, "grad_norm": 3.7234485149383545, "learning_rate": 3.74953981592637e-05, "loss": 0.7255, "mean_token_accuracy": 0.8089250802993775, "num_tokens": 324636135.0, "step": 31300 }, { "entropy": 0.6352397203445435, "epoch": 0.25048, "grad_norm": 1.4202115535736084, "learning_rate": 3.7491396558623454e-05, "loss": 0.6428, "mean_token_accuracy": 0.7968673706054688, "num_tokens": 324799975.0, "step": 31310 }, { "entropy": 0.6961323380470276, "epoch": 0.25056, "grad_norm": 2.8423991203308105, "learning_rate": 3.74873949579832e-05, "loss": 0.6858, "mean_token_accuracy": 0.804455828666687, "num_tokens": 324897175.0, "step": 31320 }, { "entropy": 0.6773220658302307, "epoch": 0.25064, "grad_norm": 1.6004832983016968, "learning_rate": 3.748339335734294e-05, "loss": 0.6719, "mean_token_accuracy": 0.8060353517532348, "num_tokens": 324991167.0, "step": 31330 }, { "entropy": 0.6903883337974548, "epoch": 0.25072, "grad_norm": 2.37528657913208, "learning_rate": 3.747939175670268e-05, "loss": 0.6881, "mean_token_accuracy": 0.7919620275497437, "num_tokens": 325129332.0, "step": 31340 }, { "entropy": 0.7796079099178315, "epoch": 0.2508, "grad_norm": 6.083924293518066, "learning_rate": 3.747539015606243e-05, "loss": 0.7705, "mean_token_accuracy": 0.7965651273727417, "num_tokens": 325169985.0, "step": 31350 }, { "entropy": 0.6787006855010986, "epoch": 0.25088, "grad_norm": 1.5806201696395874, "learning_rate": 3.747138855542217e-05, "loss": 0.6774, "mean_token_accuracy": 0.7877870082855225, "num_tokens": 325333825.0, "step": 31360 }, { "entropy": 0.6856040477752685, "epoch": 0.25096, "grad_norm": 3.733116388320923, "learning_rate": 3.7467386954781916e-05, "loss": 0.6846, "mean_token_accuracy": 0.8030597865581512, "num_tokens": 325413418.0, "step": 31370 }, { "entropy": 0.7035608351230621, "epoch": 0.25104, "grad_norm": 1.931124210357666, "learning_rate": 3.746338535414165e-05, "loss": 0.691, "mean_token_accuracy": 0.8023366451263427, "num_tokens": 325507023.0, "step": 31380 }, { "entropy": 0.7372954726219177, "epoch": 0.25112, "grad_norm": 2.5779266357421875, "learning_rate": 3.7459383753501404e-05, "loss": 0.735, "mean_token_accuracy": 0.7855711996555328, "num_tokens": 325638328.0, "step": 31390 }, { "entropy": 0.6541209936141967, "epoch": 0.2512, "grad_norm": 5.751926422119141, "learning_rate": 3.745538215286115e-05, "loss": 0.6536, "mean_token_accuracy": 0.8191418468952179, "num_tokens": 325673694.0, "step": 31400 }, { "entropy": 0.6810270071029663, "epoch": 0.25128, "grad_norm": 2.577451229095459, "learning_rate": 3.745138055222089e-05, "loss": 0.6813, "mean_token_accuracy": 0.7834697067737579, "num_tokens": 325837534.0, "step": 31410 }, { "entropy": 0.7202918738126755, "epoch": 0.25136, "grad_norm": 2.71696400642395, "learning_rate": 3.7447378951580635e-05, "loss": 0.7247, "mean_token_accuracy": 0.7959991335868836, "num_tokens": 325921269.0, "step": 31420 }, { "entropy": 0.7366423308849335, "epoch": 0.25144, "grad_norm": 2.0422801971435547, "learning_rate": 3.744337735094038e-05, "loss": 0.7316, "mean_token_accuracy": 0.7940287113189697, "num_tokens": 326014427.0, "step": 31430 }, { "entropy": 0.7142162322998047, "epoch": 0.25152, "grad_norm": 2.877228021621704, "learning_rate": 3.743937575030012e-05, "loss": 0.7087, "mean_token_accuracy": 0.784845906496048, "num_tokens": 326157073.0, "step": 31440 }, { "entropy": 0.5748073488473893, "epoch": 0.2516, "grad_norm": 6.283869743347168, "learning_rate": 3.7435374149659866e-05, "loss": 0.5808, "mean_token_accuracy": 0.8416566908359527, "num_tokens": 326197897.0, "step": 31450 }, { "entropy": 0.7362278401851654, "epoch": 0.25168, "grad_norm": 3.2085609436035156, "learning_rate": 3.743137254901961e-05, "loss": 0.734, "mean_token_accuracy": 0.7746635794639587, "num_tokens": 326356766.0, "step": 31460 }, { "entropy": 0.7177590608596802, "epoch": 0.25176, "grad_norm": 3.6422653198242188, "learning_rate": 3.742737094837935e-05, "loss": 0.713, "mean_token_accuracy": 0.7993848383426666, "num_tokens": 326426540.0, "step": 31470 }, { "entropy": 0.705428010225296, "epoch": 0.25184, "grad_norm": 1.417973279953003, "learning_rate": 3.74233693477391e-05, "loss": 0.699, "mean_token_accuracy": 0.7963885605335236, "num_tokens": 326520796.0, "step": 31480 }, { "entropy": 0.6860973417758942, "epoch": 0.25192, "grad_norm": 2.702331304550171, "learning_rate": 3.741936774709884e-05, "loss": 0.6798, "mean_token_accuracy": 0.7932705104351043, "num_tokens": 326659659.0, "step": 31490 }, { "entropy": 0.6551717519760132, "epoch": 0.252, "grad_norm": 5.4489240646362305, "learning_rate": 3.7415366146458584e-05, "loss": 0.6586, "mean_token_accuracy": 0.8220104694366455, "num_tokens": 326698358.0, "step": 31500 }, { "entropy": 0.6494275033473969, "epoch": 0.25208, "grad_norm": 1.3701701164245605, "learning_rate": 3.741136454581833e-05, "loss": 0.647, "mean_token_accuracy": 0.7937164187431336, "num_tokens": 326862198.0, "step": 31510 }, { "entropy": 0.6815127700567245, "epoch": 0.25216, "grad_norm": 3.6757304668426514, "learning_rate": 3.740736294517807e-05, "loss": 0.6768, "mean_token_accuracy": 0.8040644466876984, "num_tokens": 326943367.0, "step": 31520 }, { "entropy": 0.7182527482509613, "epoch": 0.25224, "grad_norm": 2.058807373046875, "learning_rate": 3.7403361344537816e-05, "loss": 0.7206, "mean_token_accuracy": 0.7966147363185883, "num_tokens": 327035332.0, "step": 31530 }, { "entropy": 0.7112121045589447, "epoch": 0.25232, "grad_norm": 2.6124801635742188, "learning_rate": 3.739935974389756e-05, "loss": 0.6999, "mean_token_accuracy": 0.7809869885444641, "num_tokens": 327182844.0, "step": 31540 }, { "entropy": 0.6461658239364624, "epoch": 0.2524, "grad_norm": 4.317339897155762, "learning_rate": 3.73953581432573e-05, "loss": 0.6379, "mean_token_accuracy": 0.8241378128528595, "num_tokens": 327228082.0, "step": 31550 }, { "entropy": 0.6295448333024979, "epoch": 0.25248, "grad_norm": 1.9442243576049805, "learning_rate": 3.7391356542617053e-05, "loss": 0.6353, "mean_token_accuracy": 0.797227656841278, "num_tokens": 327391922.0, "step": 31560 }, { "entropy": 0.6347694247961044, "epoch": 0.25256, "grad_norm": 2.734825372695923, "learning_rate": 3.738735494197679e-05, "loss": 0.6318, "mean_token_accuracy": 0.8090350151062011, "num_tokens": 327491185.0, "step": 31570 }, { "entropy": 0.7152287483215332, "epoch": 0.25264, "grad_norm": 2.046046733856201, "learning_rate": 3.7383353341336534e-05, "loss": 0.7077, "mean_token_accuracy": 0.7980102837085724, "num_tokens": 327588145.0, "step": 31580 }, { "entropy": 0.6776697933673859, "epoch": 0.25272, "grad_norm": 2.3085241317749023, "learning_rate": 3.737935174069628e-05, "loss": 0.6641, "mean_token_accuracy": 0.8007562756538391, "num_tokens": 327721733.0, "step": 31590 }, { "entropy": 0.6320505768060685, "epoch": 0.2528, "grad_norm": 5.020009994506836, "learning_rate": 3.737535014005603e-05, "loss": 0.6385, "mean_token_accuracy": 0.8260193586349487, "num_tokens": 327763186.0, "step": 31600 }, { "entropy": 0.6751487374305725, "epoch": 0.25288, "grad_norm": 2.229644775390625, "learning_rate": 3.7371348539415765e-05, "loss": 0.6745, "mean_token_accuracy": 0.7868160784244538, "num_tokens": 327927026.0, "step": 31610 }, { "entropy": 0.6490286529064179, "epoch": 0.25296, "grad_norm": 3.01476788520813, "learning_rate": 3.736734693877551e-05, "loss": 0.6523, "mean_token_accuracy": 0.8094755828380584, "num_tokens": 328016518.0, "step": 31620 }, { "entropy": 0.7656527757644653, "epoch": 0.25304, "grad_norm": 2.3726842403411865, "learning_rate": 3.736334533813526e-05, "loss": 0.7698, "mean_token_accuracy": 0.7895405769348145, "num_tokens": 328110372.0, "step": 31630 }, { "entropy": 0.7215629875659942, "epoch": 0.25312, "grad_norm": 2.0807688236236572, "learning_rate": 3.7359343737495e-05, "loss": 0.7164, "mean_token_accuracy": 0.7836608231067658, "num_tokens": 328253880.0, "step": 31640 }, { "entropy": 0.6917785346508026, "epoch": 0.2532, "grad_norm": 4.269363880157471, "learning_rate": 3.735534213685474e-05, "loss": 0.6804, "mean_token_accuracy": 0.8183288156986237, "num_tokens": 328292198.0, "step": 31650 }, { "entropy": 0.6232268452644348, "epoch": 0.25328, "grad_norm": 1.7694079875946045, "learning_rate": 3.7351340536214484e-05, "loss": 0.6282, "mean_token_accuracy": 0.799712997674942, "num_tokens": 328456038.0, "step": 31660 }, { "entropy": 0.6777512907981873, "epoch": 0.25336, "grad_norm": 3.1799192428588867, "learning_rate": 3.7347338935574234e-05, "loss": 0.6566, "mean_token_accuracy": 0.8092521667480469, "num_tokens": 328537219.0, "step": 31670 }, { "entropy": 0.7299015283584595, "epoch": 0.25344, "grad_norm": 1.787122368812561, "learning_rate": 3.734333733493398e-05, "loss": 0.7442, "mean_token_accuracy": 0.7891404807567597, "num_tokens": 328629365.0, "step": 31680 }, { "entropy": 0.7138407766819, "epoch": 0.25352, "grad_norm": 2.4223570823669434, "learning_rate": 3.7339335734293715e-05, "loss": 0.7172, "mean_token_accuracy": 0.7801499962806702, "num_tokens": 328779188.0, "step": 31690 }, { "entropy": 0.7291511237621308, "epoch": 0.2536, "grad_norm": 6.305508136749268, "learning_rate": 3.7335334133653465e-05, "loss": 0.714, "mean_token_accuracy": 0.8051181077957154, "num_tokens": 328821786.0, "step": 31700 }, { "entropy": 0.6686290144920349, "epoch": 0.25368, "grad_norm": 1.826375126838684, "learning_rate": 3.733133253301321e-05, "loss": 0.664, "mean_token_accuracy": 0.7895958364009857, "num_tokens": 328985112.0, "step": 31710 }, { "entropy": 0.6462254047393798, "epoch": 0.25376, "grad_norm": 3.713730812072754, "learning_rate": 3.732733093237295e-05, "loss": 0.6453, "mean_token_accuracy": 0.8146406948566437, "num_tokens": 329056841.0, "step": 31720 }, { "entropy": 0.6648178696632385, "epoch": 0.25384, "grad_norm": 1.6205863952636719, "learning_rate": 3.732332933173269e-05, "loss": 0.681, "mean_token_accuracy": 0.8064903318881989, "num_tokens": 329150734.0, "step": 31730 }, { "entropy": 0.7200847625732422, "epoch": 0.25392, "grad_norm": 1.8336495161056519, "learning_rate": 3.731932773109244e-05, "loss": 0.7113, "mean_token_accuracy": 0.7840258777141571, "num_tokens": 329297882.0, "step": 31740 }, { "entropy": 0.6348013937473297, "epoch": 0.254, "grad_norm": 4.971519947052002, "learning_rate": 3.7315326130452184e-05, "loss": 0.6442, "mean_token_accuracy": 0.8200665235519409, "num_tokens": 329343249.0, "step": 31750 }, { "entropy": 0.6767984509468079, "epoch": 0.25408, "grad_norm": 2.2417619228363037, "learning_rate": 3.731132452981193e-05, "loss": 0.6762, "mean_token_accuracy": 0.7911700010299683, "num_tokens": 329507089.0, "step": 31760 }, { "entropy": 0.696458899974823, "epoch": 0.25416, "grad_norm": 3.3090922832489014, "learning_rate": 3.730732292917167e-05, "loss": 0.6864, "mean_token_accuracy": 0.803805011510849, "num_tokens": 329588479.0, "step": 31770 }, { "entropy": 0.7996962130069732, "epoch": 0.25424, "grad_norm": 1.473128318786621, "learning_rate": 3.7303321328531415e-05, "loss": 0.8054, "mean_token_accuracy": 0.7813696801662445, "num_tokens": 329681819.0, "step": 31780 }, { "entropy": 0.6649147748947144, "epoch": 0.25432, "grad_norm": 2.8926334381103516, "learning_rate": 3.729931972789116e-05, "loss": 0.6565, "mean_token_accuracy": 0.7969438135623932, "num_tokens": 329822869.0, "step": 31790 }, { "entropy": 0.7736962080001831, "epoch": 0.2544, "grad_norm": 4.381381511688232, "learning_rate": 3.72953181272509e-05, "loss": 0.7749, "mean_token_accuracy": 0.7949620664119721, "num_tokens": 329864807.0, "step": 31800 }, { "entropy": 0.6061192125082016, "epoch": 0.25448, "grad_norm": 2.0286428928375244, "learning_rate": 3.7291316526610646e-05, "loss": 0.6073, "mean_token_accuracy": 0.8027418196201325, "num_tokens": 330028647.0, "step": 31810 }, { "entropy": 0.7039299815893173, "epoch": 0.25456, "grad_norm": 2.969355344772339, "learning_rate": 3.728731492597039e-05, "loss": 0.6958, "mean_token_accuracy": 0.8033713340759278, "num_tokens": 330120107.0, "step": 31820 }, { "entropy": 0.6646128058433532, "epoch": 0.25464, "grad_norm": 1.7669514417648315, "learning_rate": 3.7283313325330134e-05, "loss": 0.677, "mean_token_accuracy": 0.8009298145771027, "num_tokens": 330214542.0, "step": 31830 }, { "entropy": 0.6895746529102326, "epoch": 0.25472, "grad_norm": 3.127424716949463, "learning_rate": 3.727931172468988e-05, "loss": 0.68, "mean_token_accuracy": 0.7895779490470887, "num_tokens": 330357470.0, "step": 31840 }, { "entropy": 0.7123332351446152, "epoch": 0.2548, "grad_norm": 4.094571590423584, "learning_rate": 3.727531012404962e-05, "loss": 0.7033, "mean_token_accuracy": 0.8143802464008332, "num_tokens": 330396169.0, "step": 31850 }, { "entropy": 0.691479104757309, "epoch": 0.25488, "grad_norm": 1.4619766473770142, "learning_rate": 3.7271308523409365e-05, "loss": 0.6903, "mean_token_accuracy": 0.7858207166194916, "num_tokens": 330560009.0, "step": 31860 }, { "entropy": 0.6595564603805542, "epoch": 0.25496, "grad_norm": 3.0096137523651123, "learning_rate": 3.726730692276911e-05, "loss": 0.6476, "mean_token_accuracy": 0.8112211406230927, "num_tokens": 330654543.0, "step": 31870 }, { "entropy": 0.6726113677024841, "epoch": 0.25504, "grad_norm": 1.6408036947250366, "learning_rate": 3.726330532212885e-05, "loss": 0.669, "mean_token_accuracy": 0.8016937017440796, "num_tokens": 330750559.0, "step": 31880 }, { "entropy": 0.7306706249713898, "epoch": 0.25512, "grad_norm": 3.777956008911133, "learning_rate": 3.7259303721488596e-05, "loss": 0.7187, "mean_token_accuracy": 0.7854531168937683, "num_tokens": 330885775.0, "step": 31890 }, { "entropy": 0.704708781838417, "epoch": 0.2552, "grad_norm": 4.95559024810791, "learning_rate": 3.725530212084834e-05, "loss": 0.7069, "mean_token_accuracy": 0.8120712339878082, "num_tokens": 330925883.0, "step": 31900 }, { "entropy": 0.6683619916439056, "epoch": 0.25528, "grad_norm": 1.6967363357543945, "learning_rate": 3.7251300520208083e-05, "loss": 0.6663, "mean_token_accuracy": 0.7909929156303406, "num_tokens": 331089723.0, "step": 31910 }, { "entropy": 0.7379725396633148, "epoch": 0.25536, "grad_norm": 3.578615665435791, "learning_rate": 3.724729891956783e-05, "loss": 0.7238, "mean_token_accuracy": 0.794802474975586, "num_tokens": 331173123.0, "step": 31920 }, { "entropy": 0.6237025678157806, "epoch": 0.25544, "grad_norm": 1.773651361465454, "learning_rate": 3.724329731892757e-05, "loss": 0.6263, "mean_token_accuracy": 0.8171829044818878, "num_tokens": 331267143.0, "step": 31930 }, { "entropy": 0.7128619432449341, "epoch": 0.25552, "grad_norm": 2.670947551727295, "learning_rate": 3.7239295718287315e-05, "loss": 0.7171, "mean_token_accuracy": 0.7875543713569642, "num_tokens": 331398417.0, "step": 31940 }, { "entropy": 0.7361980140209198, "epoch": 0.2556, "grad_norm": 5.045295238494873, "learning_rate": 3.7235294117647065e-05, "loss": 0.7299, "mean_token_accuracy": 0.8035780549049377, "num_tokens": 331434702.0, "step": 31950 }, { "entropy": 0.6903707265853882, "epoch": 0.25568, "grad_norm": 1.5418200492858887, "learning_rate": 3.72312925170068e-05, "loss": 0.6929, "mean_token_accuracy": 0.7831747353076934, "num_tokens": 331598513.0, "step": 31960 }, { "entropy": 0.6017300963401795, "epoch": 0.25576, "grad_norm": 3.600842237472534, "learning_rate": 3.7227290916366546e-05, "loss": 0.5921, "mean_token_accuracy": 0.8295577347278595, "num_tokens": 331671606.0, "step": 31970 }, { "entropy": 0.6707580089569092, "epoch": 0.25584, "grad_norm": 1.917473316192627, "learning_rate": 3.722328931572629e-05, "loss": 0.6693, "mean_token_accuracy": 0.806057620048523, "num_tokens": 331764698.0, "step": 31980 }, { "entropy": 0.6543224334716797, "epoch": 0.25592, "grad_norm": 2.385591983795166, "learning_rate": 3.721928771508604e-05, "loss": 0.66, "mean_token_accuracy": 0.7974667608737945, "num_tokens": 331898622.0, "step": 31990 }, { "entropy": 0.7198842614889145, "epoch": 0.256, "grad_norm": 4.27111291885376, "learning_rate": 3.721528611444578e-05, "loss": 0.7068, "mean_token_accuracy": 0.8103286266326905, "num_tokens": 331934686.0, "step": 32000 }, { "entropy": 0.6388371378183365, "epoch": 0.25608, "grad_norm": 2.1418824195861816, "learning_rate": 3.721128451380552e-05, "loss": 0.6456, "mean_token_accuracy": 0.795609426498413, "num_tokens": 332098526.0, "step": 32010 }, { "entropy": 0.683032539486885, "epoch": 0.25616, "grad_norm": 2.7064366340637207, "learning_rate": 3.720728291316527e-05, "loss": 0.672, "mean_token_accuracy": 0.8056384027004242, "num_tokens": 332184500.0, "step": 32020 }, { "entropy": 0.6408526420593261, "epoch": 0.25624, "grad_norm": 1.3546907901763916, "learning_rate": 3.7203281312525015e-05, "loss": 0.6342, "mean_token_accuracy": 0.8145363211631775, "num_tokens": 332276736.0, "step": 32030 }, { "entropy": 0.661655068397522, "epoch": 0.25632, "grad_norm": 2.8239893913269043, "learning_rate": 3.719927971188475e-05, "loss": 0.6636, "mean_token_accuracy": 0.7955256223678588, "num_tokens": 332417779.0, "step": 32040 }, { "entropy": 0.6956782042980194, "epoch": 0.2564, "grad_norm": 4.390460014343262, "learning_rate": 3.7195278111244495e-05, "loss": 0.6839, "mean_token_accuracy": 0.811653345823288, "num_tokens": 332460081.0, "step": 32050 }, { "entropy": 0.7023217380046844, "epoch": 0.25648, "grad_norm": 1.8802567720413208, "learning_rate": 3.7191276510604246e-05, "loss": 0.7088, "mean_token_accuracy": 0.780459213256836, "num_tokens": 332623921.0, "step": 32060 }, { "entropy": 0.6075511574745178, "epoch": 0.25656, "grad_norm": 4.237981796264648, "learning_rate": 3.718727490996399e-05, "loss": 0.5977, "mean_token_accuracy": 0.8201306700706482, "num_tokens": 332706183.0, "step": 32070 }, { "entropy": 0.6874142944812774, "epoch": 0.25664, "grad_norm": 1.8424795866012573, "learning_rate": 3.7183273309323727e-05, "loss": 0.6949, "mean_token_accuracy": 0.7998904109001159, "num_tokens": 332797880.0, "step": 32080 }, { "entropy": 0.775694876909256, "epoch": 0.25672, "grad_norm": 2.5804450511932373, "learning_rate": 3.717927170868348e-05, "loss": 0.7655, "mean_token_accuracy": 0.7758914411067963, "num_tokens": 332938249.0, "step": 32090 }, { "entropy": 0.6487040787935257, "epoch": 0.2568, "grad_norm": 4.820244312286377, "learning_rate": 3.717527010804322e-05, "loss": 0.6429, "mean_token_accuracy": 0.8247537612915039, "num_tokens": 332978884.0, "step": 32100 }, { "entropy": 0.7404762208461761, "epoch": 0.25688, "grad_norm": 2.1938650608062744, "learning_rate": 3.7171268507402964e-05, "loss": 0.7389, "mean_token_accuracy": 0.7804897427558899, "num_tokens": 333142724.0, "step": 32110 }, { "entropy": 0.689547723531723, "epoch": 0.25696, "grad_norm": 3.460434913635254, "learning_rate": 3.71672669067627e-05, "loss": 0.6783, "mean_token_accuracy": 0.8064441621303559, "num_tokens": 333230924.0, "step": 32120 }, { "entropy": 0.72629354596138, "epoch": 0.25704, "grad_norm": 1.7980526685714722, "learning_rate": 3.716326530612245e-05, "loss": 0.7257, "mean_token_accuracy": 0.7931035280227661, "num_tokens": 333325035.0, "step": 32130 }, { "entropy": 0.7103115737438201, "epoch": 0.25712, "grad_norm": 2.492116928100586, "learning_rate": 3.7159263705482196e-05, "loss": 0.7111, "mean_token_accuracy": 0.7835209727287292, "num_tokens": 333464380.0, "step": 32140 }, { "entropy": 0.6710884362459183, "epoch": 0.2572, "grad_norm": 6.624636650085449, "learning_rate": 3.715526210484194e-05, "loss": 0.6623, "mean_token_accuracy": 0.8216086685657501, "num_tokens": 333504046.0, "step": 32150 }, { "entropy": 0.6727196514606476, "epoch": 0.25728, "grad_norm": 1.5546637773513794, "learning_rate": 3.715126050420168e-05, "loss": 0.668, "mean_token_accuracy": 0.7878847181797027, "num_tokens": 333667886.0, "step": 32160 }, { "entropy": 0.636841744184494, "epoch": 0.25736, "grad_norm": 3.192629814147949, "learning_rate": 3.714725890356143e-05, "loss": 0.6337, "mean_token_accuracy": 0.8138393819332123, "num_tokens": 333748964.0, "step": 32170 }, { "entropy": 0.6563754320144654, "epoch": 0.25744, "grad_norm": 2.1433866024017334, "learning_rate": 3.714325730292117e-05, "loss": 0.6596, "mean_token_accuracy": 0.8064315676689148, "num_tokens": 333841667.0, "step": 32180 }, { "entropy": 0.7426337480545044, "epoch": 0.25752, "grad_norm": 2.546751022338867, "learning_rate": 3.7139255702280914e-05, "loss": 0.7247, "mean_token_accuracy": 0.7815365850925445, "num_tokens": 333975549.0, "step": 32190 }, { "entropy": 0.6475804656744003, "epoch": 0.2576, "grad_norm": 4.908761978149414, "learning_rate": 3.713525410164066e-05, "loss": 0.6487, "mean_token_accuracy": 0.8228693783283234, "num_tokens": 334016710.0, "step": 32200 }, { "entropy": 0.6756360709667206, "epoch": 0.25768, "grad_norm": 1.543364405632019, "learning_rate": 3.71312525010004e-05, "loss": 0.6811, "mean_token_accuracy": 0.785875678062439, "num_tokens": 334180550.0, "step": 32210 }, { "entropy": 0.6501977503299713, "epoch": 0.25776, "grad_norm": 3.3127987384796143, "learning_rate": 3.7127250900360145e-05, "loss": 0.6492, "mean_token_accuracy": 0.8088283479213715, "num_tokens": 334263023.0, "step": 32220 }, { "entropy": 0.6813743770122528, "epoch": 0.25784, "grad_norm": 2.27596378326416, "learning_rate": 3.712324929971989e-05, "loss": 0.6652, "mean_token_accuracy": 0.8058581411838531, "num_tokens": 334356439.0, "step": 32230 }, { "entropy": 0.7208330810070038, "epoch": 0.25792, "grad_norm": 2.7931952476501465, "learning_rate": 3.711924769907963e-05, "loss": 0.7208, "mean_token_accuracy": 0.7868095815181733, "num_tokens": 334485578.0, "step": 32240 }, { "entropy": 0.7195711851119995, "epoch": 0.258, "grad_norm": 4.784846305847168, "learning_rate": 3.7115246098439376e-05, "loss": 0.7146, "mean_token_accuracy": 0.8083656191825866, "num_tokens": 334519296.0, "step": 32250 }, { "entropy": 0.6469059765338898, "epoch": 0.25808, "grad_norm": 1.867344856262207, "learning_rate": 3.711124449779912e-05, "loss": 0.6418, "mean_token_accuracy": 0.7953834891319275, "num_tokens": 334683136.0, "step": 32260 }, { "entropy": 0.7372127294540405, "epoch": 0.25816, "grad_norm": 3.0912413597106934, "learning_rate": 3.7107242897158864e-05, "loss": 0.7339, "mean_token_accuracy": 0.7955348014831543, "num_tokens": 334762578.0, "step": 32270 }, { "entropy": 0.7251943469047546, "epoch": 0.25824, "grad_norm": 1.6434110403060913, "learning_rate": 3.710324129651861e-05, "loss": 0.722, "mean_token_accuracy": 0.7914376080036163, "num_tokens": 334856018.0, "step": 32280 }, { "entropy": 0.7197401583194732, "epoch": 0.25832, "grad_norm": 3.753599166870117, "learning_rate": 3.709923969587835e-05, "loss": 0.7092, "mean_token_accuracy": 0.7892275750637054, "num_tokens": 334991304.0, "step": 32290 }, { "entropy": 0.5661188393831253, "epoch": 0.2584, "grad_norm": 3.786781072616577, "learning_rate": 3.70952380952381e-05, "loss": 0.5705, "mean_token_accuracy": 0.841360330581665, "num_tokens": 335032277.0, "step": 32300 }, { "entropy": 0.6620908200740814, "epoch": 0.25848, "grad_norm": 1.9188227653503418, "learning_rate": 3.709123649459784e-05, "loss": 0.6617, "mean_token_accuracy": 0.7927393734455108, "num_tokens": 335196117.0, "step": 32310 }, { "entropy": 0.7417608439922333, "epoch": 0.25856, "grad_norm": 3.276724338531494, "learning_rate": 3.708723489395758e-05, "loss": 0.7456, "mean_token_accuracy": 0.7929269194602966, "num_tokens": 335283292.0, "step": 32320 }, { "entropy": 0.8433230340480804, "epoch": 0.25864, "grad_norm": 1.6513478755950928, "learning_rate": 3.7083233293317326e-05, "loss": 0.8385, "mean_token_accuracy": 0.7706797063350678, "num_tokens": 335378978.0, "step": 32330 }, { "entropy": 0.7179188787937164, "epoch": 0.25872, "grad_norm": 2.1365489959716797, "learning_rate": 3.707923169267708e-05, "loss": 0.7207, "mean_token_accuracy": 0.780931168794632, "num_tokens": 335518826.0, "step": 32340 }, { "entropy": 0.6926670074462891, "epoch": 0.2588, "grad_norm": 5.6356658935546875, "learning_rate": 3.7075230092036814e-05, "loss": 0.6689, "mean_token_accuracy": 0.8206695914268494, "num_tokens": 335556478.0, "step": 32350 }, { "entropy": 0.7771078705787658, "epoch": 0.25888, "grad_norm": 2.0115296840667725, "learning_rate": 3.707122849139656e-05, "loss": 0.7797, "mean_token_accuracy": 0.7692782163619996, "num_tokens": 335720318.0, "step": 32360 }, { "entropy": 0.6109399616718292, "epoch": 0.25896, "grad_norm": 2.5546793937683105, "learning_rate": 3.706722689075631e-05, "loss": 0.6037, "mean_token_accuracy": 0.8257455646991729, "num_tokens": 335815158.0, "step": 32370 }, { "entropy": 0.7097907900810242, "epoch": 0.25904, "grad_norm": 1.7551230192184448, "learning_rate": 3.706322529011605e-05, "loss": 0.7115, "mean_token_accuracy": 0.7937045395374298, "num_tokens": 335911464.0, "step": 32380 }, { "entropy": 0.6430157005786896, "epoch": 0.25912, "grad_norm": 2.410532236099243, "learning_rate": 3.705922368947579e-05, "loss": 0.6384, "mean_token_accuracy": 0.8035290658473968, "num_tokens": 336039900.0, "step": 32390 }, { "entropy": 0.6982715249061584, "epoch": 0.2592, "grad_norm": 5.073999404907227, "learning_rate": 3.705522208883553e-05, "loss": 0.6833, "mean_token_accuracy": 0.8141562163829803, "num_tokens": 336074593.0, "step": 32400 }, { "entropy": 0.6497740864753723, "epoch": 0.25928, "grad_norm": 1.912560224533081, "learning_rate": 3.705122048819528e-05, "loss": 0.6536, "mean_token_accuracy": 0.7934538424015045, "num_tokens": 336238433.0, "step": 32410 }, { "entropy": 0.666366595029831, "epoch": 0.25936, "grad_norm": 3.3376669883728027, "learning_rate": 3.7047218887555026e-05, "loss": 0.6583, "mean_token_accuracy": 0.8112664520740509, "num_tokens": 336318331.0, "step": 32420 }, { "entropy": 0.6935680210590363, "epoch": 0.25944, "grad_norm": 2.1322386264801025, "learning_rate": 3.704321728691476e-05, "loss": 0.6964, "mean_token_accuracy": 0.7991634249687195, "num_tokens": 336411516.0, "step": 32430 }, { "entropy": 0.7111084461212158, "epoch": 0.25952, "grad_norm": 3.1777141094207764, "learning_rate": 3.7039215686274514e-05, "loss": 0.7088, "mean_token_accuracy": 0.7924131214618683, "num_tokens": 336542499.0, "step": 32440 }, { "entropy": 0.7115533649921417, "epoch": 0.2596, "grad_norm": 5.059079647064209, "learning_rate": 3.703521408563426e-05, "loss": 0.6997, "mean_token_accuracy": 0.8127498090267181, "num_tokens": 336582160.0, "step": 32450 }, { "entropy": 0.6678130090236664, "epoch": 0.25968, "grad_norm": 1.6752212047576904, "learning_rate": 3.7031212484994e-05, "loss": 0.6681, "mean_token_accuracy": 0.7886846601963043, "num_tokens": 336746000.0, "step": 32460 }, { "entropy": 0.6484197795391082, "epoch": 0.25976, "grad_norm": 2.997648239135742, "learning_rate": 3.702721088435374e-05, "loss": 0.6425, "mean_token_accuracy": 0.8119482278823853, "num_tokens": 336834335.0, "step": 32470 }, { "entropy": 0.6888585448265075, "epoch": 0.25984, "grad_norm": 1.5391446352005005, "learning_rate": 3.702320928371349e-05, "loss": 0.6833, "mean_token_accuracy": 0.8061542570590973, "num_tokens": 336927648.0, "step": 32480 }, { "entropy": 0.648072612285614, "epoch": 0.25992, "grad_norm": 2.226468324661255, "learning_rate": 3.701920768307323e-05, "loss": 0.6437, "mean_token_accuracy": 0.7995006263256073, "num_tokens": 337080500.0, "step": 32490 }, { "entropy": 0.657991573214531, "epoch": 0.26, "grad_norm": 5.0676589012146, "learning_rate": 3.7015206082432976e-05, "loss": 0.652, "mean_token_accuracy": 0.8217778384685517, "num_tokens": 337121746.0, "step": 32500 }, { "entropy": 0.6801497280597687, "epoch": 0.26008, "grad_norm": 2.4723830223083496, "learning_rate": 3.701120448179271e-05, "loss": 0.6759, "mean_token_accuracy": 0.7875549674034119, "num_tokens": 337285586.0, "step": 32510 }, { "entropy": 0.6414292097091675, "epoch": 0.26016, "grad_norm": 2.9868788719177246, "learning_rate": 3.7007202881152463e-05, "loss": 0.6376, "mean_token_accuracy": 0.8162694096565246, "num_tokens": 337384570.0, "step": 32520 }, { "entropy": 0.7001668214797974, "epoch": 0.26024, "grad_norm": 2.2730042934417725, "learning_rate": 3.700320128051221e-05, "loss": 0.7133, "mean_token_accuracy": 0.795335566997528, "num_tokens": 337479090.0, "step": 32530 }, { "entropy": 0.7246174156665802, "epoch": 0.26032, "grad_norm": 3.5589911937713623, "learning_rate": 3.699919967987195e-05, "loss": 0.7184, "mean_token_accuracy": 0.7848373472690582, "num_tokens": 337614943.0, "step": 32540 }, { "entropy": 0.6321562170982361, "epoch": 0.2604, "grad_norm": 5.310423374176025, "learning_rate": 3.6995198079231695e-05, "loss": 0.6087, "mean_token_accuracy": 0.8309654891490936, "num_tokens": 337648156.0, "step": 32550 }, { "entropy": 0.6167169153690338, "epoch": 0.26048, "grad_norm": 1.3471895456314087, "learning_rate": 3.699119647859144e-05, "loss": 0.618, "mean_token_accuracy": 0.8019601941108704, "num_tokens": 337811996.0, "step": 32560 }, { "entropy": 0.6676767766475677, "epoch": 0.26056, "grad_norm": 3.899521827697754, "learning_rate": 3.698719487795118e-05, "loss": 0.6644, "mean_token_accuracy": 0.8050518929958344, "num_tokens": 337900650.0, "step": 32570 }, { "entropy": 0.7004475712776184, "epoch": 0.26064, "grad_norm": 1.3588173389434814, "learning_rate": 3.6983193277310926e-05, "loss": 0.7174, "mean_token_accuracy": 0.7966309428215027, "num_tokens": 337994803.0, "step": 32580 }, { "entropy": 0.7573413670063018, "epoch": 0.26072, "grad_norm": 2.3631348609924316, "learning_rate": 3.697919167667067e-05, "loss": 0.7436, "mean_token_accuracy": 0.7747149288654327, "num_tokens": 338139559.0, "step": 32590 }, { "entropy": 0.6798717081546783, "epoch": 0.2608, "grad_norm": 4.264556407928467, "learning_rate": 3.697519007603041e-05, "loss": 0.6682, "mean_token_accuracy": 0.8185315072536469, "num_tokens": 338182861.0, "step": 32600 }, { "entropy": 0.6678816437721252, "epoch": 0.26088, "grad_norm": 2.419328451156616, "learning_rate": 3.697118847539016e-05, "loss": 0.6694, "mean_token_accuracy": 0.7886236071586609, "num_tokens": 338346701.0, "step": 32610 }, { "entropy": 0.6907366871833801, "epoch": 0.26096, "grad_norm": 3.1736552715301514, "learning_rate": 3.69671868747499e-05, "loss": 0.6786, "mean_token_accuracy": 0.8069070458412171, "num_tokens": 338426064.0, "step": 32620 }, { "entropy": 0.6947637021541595, "epoch": 0.26104, "grad_norm": 2.1869819164276123, "learning_rate": 3.6963185274109644e-05, "loss": 0.7082, "mean_token_accuracy": 0.7954801023006439, "num_tokens": 338518603.0, "step": 32630 }, { "entropy": 0.6682408213615417, "epoch": 0.26112, "grad_norm": 2.744565010070801, "learning_rate": 3.695918367346939e-05, "loss": 0.658, "mean_token_accuracy": 0.7951499402523041, "num_tokens": 338662504.0, "step": 32640 }, { "entropy": 0.6771468400955201, "epoch": 0.2612, "grad_norm": 5.534759521484375, "learning_rate": 3.695518207282913e-05, "loss": 0.6806, "mean_token_accuracy": 0.8164308071136475, "num_tokens": 338696321.0, "step": 32650 }, { "entropy": 0.6405999004840851, "epoch": 0.26128, "grad_norm": 1.3514472246170044, "learning_rate": 3.6951180472188875e-05, "loss": 0.6433, "mean_token_accuracy": 0.7973009288311005, "num_tokens": 338860161.0, "step": 32660 }, { "entropy": 0.7059321880340577, "epoch": 0.26136, "grad_norm": 3.48808217048645, "learning_rate": 3.694717887154862e-05, "loss": 0.7035, "mean_token_accuracy": 0.8031979858875274, "num_tokens": 338939157.0, "step": 32670 }, { "entropy": 0.6933968544006348, "epoch": 0.26144, "grad_norm": 1.5104230642318726, "learning_rate": 3.694317727090836e-05, "loss": 0.7161, "mean_token_accuracy": 0.794128006696701, "num_tokens": 339032420.0, "step": 32680 }, { "entropy": 0.6971171915531158, "epoch": 0.26152, "grad_norm": 2.7584099769592285, "learning_rate": 3.6939175670268113e-05, "loss": 0.6818, "mean_token_accuracy": 0.7912437856197357, "num_tokens": 339170394.0, "step": 32690 }, { "entropy": 0.6193905174732208, "epoch": 0.2616, "grad_norm": 5.068017482757568, "learning_rate": 3.693517406962785e-05, "loss": 0.6064, "mean_token_accuracy": 0.8343339562416077, "num_tokens": 339209601.0, "step": 32700 }, { "entropy": 0.634787380695343, "epoch": 0.26168, "grad_norm": 1.4019393920898438, "learning_rate": 3.6931172468987594e-05, "loss": 0.6451, "mean_token_accuracy": 0.7952613651752471, "num_tokens": 339373441.0, "step": 32710 }, { "entropy": 0.7182550609111786, "epoch": 0.26176, "grad_norm": 4.255120754241943, "learning_rate": 3.692717086834734e-05, "loss": 0.7027, "mean_token_accuracy": 0.8041011095046997, "num_tokens": 339462002.0, "step": 32720 }, { "entropy": 0.67275949716568, "epoch": 0.26184, "grad_norm": 1.9907491207122803, "learning_rate": 3.692316926770709e-05, "loss": 0.6723, "mean_token_accuracy": 0.806588762998581, "num_tokens": 339555283.0, "step": 32730 }, { "entropy": 0.685362133383751, "epoch": 0.26192, "grad_norm": 3.350454092025757, "learning_rate": 3.6919167667066825e-05, "loss": 0.6843, "mean_token_accuracy": 0.7932580530643463, "num_tokens": 339696770.0, "step": 32740 }, { "entropy": 0.7544173985719681, "epoch": 0.262, "grad_norm": 3.6806704998016357, "learning_rate": 3.691516606642657e-05, "loss": 0.7376, "mean_token_accuracy": 0.8061377882957459, "num_tokens": 339729750.0, "step": 32750 }, { "entropy": 0.6451445639133453, "epoch": 0.26208, "grad_norm": 1.478803277015686, "learning_rate": 3.691116446578632e-05, "loss": 0.6548, "mean_token_accuracy": 0.7958109438419342, "num_tokens": 339893590.0, "step": 32760 }, { "entropy": 0.6672034621238708, "epoch": 0.26216, "grad_norm": 4.037502288818359, "learning_rate": 3.690716286514606e-05, "loss": 0.6599, "mean_token_accuracy": 0.812209403514862, "num_tokens": 339971462.0, "step": 32770 }, { "entropy": 0.7100963532924652, "epoch": 0.26224, "grad_norm": 3.003401041030884, "learning_rate": 3.69031612645058e-05, "loss": 0.7198, "mean_token_accuracy": 0.7958086848258972, "num_tokens": 340063832.0, "step": 32780 }, { "entropy": 0.682402566075325, "epoch": 0.26232, "grad_norm": 2.4854893684387207, "learning_rate": 3.6899159663865544e-05, "loss": 0.6804, "mean_token_accuracy": 0.7967705965042114, "num_tokens": 340188522.0, "step": 32790 }, { "entropy": 0.7265658110380173, "epoch": 0.2624, "grad_norm": 4.4313225746154785, "learning_rate": 3.6895158063225294e-05, "loss": 0.7059, "mean_token_accuracy": 0.8128103792667389, "num_tokens": 340222513.0, "step": 32800 }, { "entropy": 0.616725817322731, "epoch": 0.26248, "grad_norm": 1.6078462600708008, "learning_rate": 3.689115646258504e-05, "loss": 0.6178, "mean_token_accuracy": 0.8014833271503449, "num_tokens": 340386148.0, "step": 32810 }, { "entropy": 0.664924630522728, "epoch": 0.26256, "grad_norm": 3.9062247276306152, "learning_rate": 3.6887154861944775e-05, "loss": 0.6631, "mean_token_accuracy": 0.8079020619392395, "num_tokens": 340469655.0, "step": 32820 }, { "entropy": 0.6496977418661117, "epoch": 0.26264, "grad_norm": 1.2563437223434448, "learning_rate": 3.6883153261304525e-05, "loss": 0.6378, "mean_token_accuracy": 0.8135453283786773, "num_tokens": 340563955.0, "step": 32830 }, { "entropy": 0.7594086289405823, "epoch": 0.26272, "grad_norm": 3.0345630645751953, "learning_rate": 3.687915166066427e-05, "loss": 0.7613, "mean_token_accuracy": 0.7780053615570068, "num_tokens": 340693190.0, "step": 32840 }, { "entropy": 0.7562122225761414, "epoch": 0.2628, "grad_norm": 6.536373615264893, "learning_rate": 3.687515006002401e-05, "loss": 0.7468, "mean_token_accuracy": 0.8014226078987121, "num_tokens": 340725903.0, "step": 32850 }, { "entropy": 0.6738062620162963, "epoch": 0.26288, "grad_norm": 2.8864431381225586, "learning_rate": 3.687114845938375e-05, "loss": 0.6764, "mean_token_accuracy": 0.7887113034725189, "num_tokens": 340889409.0, "step": 32860 }, { "entropy": 0.7460431516170501, "epoch": 0.26296, "grad_norm": 3.193321943283081, "learning_rate": 3.68671468587435e-05, "loss": 0.7428, "mean_token_accuracy": 0.790705406665802, "num_tokens": 340967231.0, "step": 32870 }, { "entropy": 0.7760040640830994, "epoch": 0.26304, "grad_norm": 1.7305210828781128, "learning_rate": 3.6863145258103244e-05, "loss": 0.7703, "mean_token_accuracy": 0.7779346466064453, "num_tokens": 341060881.0, "step": 32880 }, { "entropy": 0.6943585097789764, "epoch": 0.26312, "grad_norm": 2.063619375228882, "learning_rate": 3.685914365746299e-05, "loss": 0.6868, "mean_token_accuracy": 0.7885244727134705, "num_tokens": 341201434.0, "step": 32890 }, { "entropy": 0.640406534075737, "epoch": 0.2632, "grad_norm": 4.526434421539307, "learning_rate": 3.685514205682273e-05, "loss": 0.646, "mean_token_accuracy": 0.8249530017375946, "num_tokens": 341242999.0, "step": 32900 }, { "entropy": 0.6577087938785553, "epoch": 0.26328, "grad_norm": 1.4286491870880127, "learning_rate": 3.6851140456182475e-05, "loss": 0.6537, "mean_token_accuracy": 0.7902723610401153, "num_tokens": 341406839.0, "step": 32910 }, { "entropy": 0.6903402447700501, "epoch": 0.26336, "grad_norm": 2.8097450733184814, "learning_rate": 3.684713885554222e-05, "loss": 0.679, "mean_token_accuracy": 0.8038034856319427, "num_tokens": 341501133.0, "step": 32920 }, { "entropy": 0.7367301404476165, "epoch": 0.26344, "grad_norm": 1.7115554809570312, "learning_rate": 3.684313725490196e-05, "loss": 0.7464, "mean_token_accuracy": 0.7889816164970398, "num_tokens": 341595914.0, "step": 32930 }, { "entropy": 0.6676860868930816, "epoch": 0.26352, "grad_norm": 3.2273237705230713, "learning_rate": 3.6839135654261706e-05, "loss": 0.6576, "mean_token_accuracy": 0.8007477521896362, "num_tokens": 341732481.0, "step": 32940 }, { "entropy": 0.6671037048101425, "epoch": 0.2636, "grad_norm": 5.377406597137451, "learning_rate": 3.683513405362145e-05, "loss": 0.6738, "mean_token_accuracy": 0.8229326605796814, "num_tokens": 341770656.0, "step": 32950 }, { "entropy": 0.6622932851314545, "epoch": 0.26368, "grad_norm": 1.969236969947815, "learning_rate": 3.6831132452981194e-05, "loss": 0.6631, "mean_token_accuracy": 0.7915364027023315, "num_tokens": 341934496.0, "step": 32960 }, { "entropy": 0.6691442728042603, "epoch": 0.26376, "grad_norm": 2.988004684448242, "learning_rate": 3.6827130852340944e-05, "loss": 0.6664, "mean_token_accuracy": 0.8044501423835755, "num_tokens": 342025468.0, "step": 32970 }, { "entropy": 0.6314160346984863, "epoch": 0.26384, "grad_norm": 2.4183924198150635, "learning_rate": 3.682312925170068e-05, "loss": 0.6265, "mean_token_accuracy": 0.8137501835823059, "num_tokens": 342119362.0, "step": 32980 }, { "entropy": 0.7022895336151123, "epoch": 0.26392, "grad_norm": 1.9528900384902954, "learning_rate": 3.6819127651060425e-05, "loss": 0.6926, "mean_token_accuracy": 0.7902791678905488, "num_tokens": 342260440.0, "step": 32990 }, { "entropy": 0.7349655717611313, "epoch": 0.264, "grad_norm": 4.902855396270752, "learning_rate": 3.681512605042017e-05, "loss": 0.7412, "mean_token_accuracy": 0.8023376703262329, "num_tokens": 342302710.0, "step": 33000 }, { "entropy": 0.6518924295902252, "epoch": 0.26408, "grad_norm": 2.2199044227600098, "learning_rate": 3.681112444977992e-05, "loss": 0.6483, "mean_token_accuracy": 0.7953834712505341, "num_tokens": 342465529.0, "step": 33010 }, { "entropy": 0.7287621796131134, "epoch": 0.26416, "grad_norm": 4.190641403198242, "learning_rate": 3.6807122849139656e-05, "loss": 0.7245, "mean_token_accuracy": 0.7979584693908691, "num_tokens": 342536789.0, "step": 33020 }, { "entropy": 0.73758744597435, "epoch": 0.26424, "grad_norm": 1.7032628059387207, "learning_rate": 3.68031212484994e-05, "loss": 0.7565, "mean_token_accuracy": 0.7894982635974884, "num_tokens": 342629055.0, "step": 33030 }, { "entropy": 0.7093162626028061, "epoch": 0.26432, "grad_norm": 1.8481009006500244, "learning_rate": 3.679911964785914e-05, "loss": 0.705, "mean_token_accuracy": 0.7841177940368652, "num_tokens": 342775617.0, "step": 33040 }, { "entropy": 0.6971050858497619, "epoch": 0.2644, "grad_norm": 4.140869617462158, "learning_rate": 3.6795118047218894e-05, "loss": 0.6805, "mean_token_accuracy": 0.8231576979160309, "num_tokens": 342820479.0, "step": 33050 }, { "entropy": 0.6149725019931793, "epoch": 0.26448, "grad_norm": 1.4105523824691772, "learning_rate": 3.679111644657863e-05, "loss": 0.6117, "mean_token_accuracy": 0.8018685877323151, "num_tokens": 342984319.0, "step": 33060 }, { "entropy": 0.6892523288726806, "epoch": 0.26456, "grad_norm": 3.0205307006835938, "learning_rate": 3.6787114845938375e-05, "loss": 0.6624, "mean_token_accuracy": 0.8062424600124359, "num_tokens": 343065498.0, "step": 33070 }, { "entropy": 0.7647667586803436, "epoch": 0.26464, "grad_norm": 2.043976306915283, "learning_rate": 3.6783113245298125e-05, "loss": 0.7771, "mean_token_accuracy": 0.7817632675170898, "num_tokens": 343160604.0, "step": 33080 }, { "entropy": 0.6946893543004989, "epoch": 0.26472, "grad_norm": 2.3113443851470947, "learning_rate": 3.677911164465787e-05, "loss": 0.6944, "mean_token_accuracy": 0.7883393049240113, "num_tokens": 343296314.0, "step": 33090 }, { "entropy": 0.607954940199852, "epoch": 0.2648, "grad_norm": 4.58261775970459, "learning_rate": 3.6775110044017606e-05, "loss": 0.6014, "mean_token_accuracy": 0.8332225143909454, "num_tokens": 343335046.0, "step": 33100 }, { "entropy": 0.655914443731308, "epoch": 0.26488, "grad_norm": 1.6961945295333862, "learning_rate": 3.677110844337735e-05, "loss": 0.6534, "mean_token_accuracy": 0.792238998413086, "num_tokens": 343498866.0, "step": 33110 }, { "entropy": 0.6869303345680237, "epoch": 0.26496, "grad_norm": 3.932115077972412, "learning_rate": 3.67671068427371e-05, "loss": 0.6746, "mean_token_accuracy": 0.8075485110282898, "num_tokens": 343584549.0, "step": 33120 }, { "entropy": 0.7539354383945465, "epoch": 0.26504, "grad_norm": 1.918845534324646, "learning_rate": 3.6763105242096844e-05, "loss": 0.7431, "mean_token_accuracy": 0.7885040462017059, "num_tokens": 343679441.0, "step": 33130 }, { "entropy": 0.762727028131485, "epoch": 0.26512, "grad_norm": 1.9847145080566406, "learning_rate": 3.675910364145658e-05, "loss": 0.7548, "mean_token_accuracy": 0.772531247138977, "num_tokens": 343830506.0, "step": 33140 }, { "entropy": 0.7292027831077575, "epoch": 0.2652, "grad_norm": 5.180929660797119, "learning_rate": 3.675510204081633e-05, "loss": 0.7327, "mean_token_accuracy": 0.7996035993099213, "num_tokens": 343874869.0, "step": 33150 }, { "entropy": 0.6463144451379776, "epoch": 0.26528, "grad_norm": 2.040747880935669, "learning_rate": 3.6751100440176075e-05, "loss": 0.6457, "mean_token_accuracy": 0.7938141167163849, "num_tokens": 344038709.0, "step": 33160 }, { "entropy": 0.7358407080173492, "epoch": 0.26536, "grad_norm": 3.3183255195617676, "learning_rate": 3.674709883953582e-05, "loss": 0.7287, "mean_token_accuracy": 0.7969510138034821, "num_tokens": 344119196.0, "step": 33170 }, { "entropy": 0.7414560854434967, "epoch": 0.26544, "grad_norm": 2.726109266281128, "learning_rate": 3.6743097238895555e-05, "loss": 0.7414, "mean_token_accuracy": 0.7900816857814789, "num_tokens": 344212548.0, "step": 33180 }, { "entropy": 0.6612305760383606, "epoch": 0.26552, "grad_norm": 2.2790794372558594, "learning_rate": 3.6739095638255306e-05, "loss": 0.6608, "mean_token_accuracy": 0.8003126382827759, "num_tokens": 344349372.0, "step": 33190 }, { "entropy": 0.7071099519729614, "epoch": 0.2656, "grad_norm": 4.444447040557861, "learning_rate": 3.673509403761505e-05, "loss": 0.6867, "mean_token_accuracy": 0.8184532701969147, "num_tokens": 344384529.0, "step": 33200 }, { "entropy": 0.6554708570241928, "epoch": 0.26568, "grad_norm": 1.54305899143219, "learning_rate": 3.673109243697479e-05, "loss": 0.6587, "mean_token_accuracy": 0.7956289768218994, "num_tokens": 344546533.0, "step": 33210 }, { "entropy": 0.6837885767221451, "epoch": 0.26576, "grad_norm": 3.4441254138946533, "learning_rate": 3.672709083633454e-05, "loss": 0.6827, "mean_token_accuracy": 0.8119972348213196, "num_tokens": 344621519.0, "step": 33220 }, { "entropy": 0.6848122656345368, "epoch": 0.26584, "grad_norm": 2.32273006439209, "learning_rate": 3.672308923569428e-05, "loss": 0.6817, "mean_token_accuracy": 0.8064129054546356, "num_tokens": 344714101.0, "step": 33230 }, { "entropy": 0.7284399807453156, "epoch": 0.26592, "grad_norm": 3.0838425159454346, "learning_rate": 3.6719087635054024e-05, "loss": 0.7179, "mean_token_accuracy": 0.7812524735927582, "num_tokens": 344857236.0, "step": 33240 }, { "entropy": 0.6427169263362884, "epoch": 0.266, "grad_norm": 5.10303258895874, "learning_rate": 3.671508603441377e-05, "loss": 0.6433, "mean_token_accuracy": 0.8219030976295472, "num_tokens": 344898381.0, "step": 33250 }, { "entropy": 0.6764435946941376, "epoch": 0.26608, "grad_norm": 2.1803016662597656, "learning_rate": 3.671108443377351e-05, "loss": 0.6752, "mean_token_accuracy": 0.7890512228012085, "num_tokens": 345060913.0, "step": 33260 }, { "entropy": 0.710009291768074, "epoch": 0.26616, "grad_norm": 3.377032518386841, "learning_rate": 3.6707082833133256e-05, "loss": 0.7015, "mean_token_accuracy": 0.8019222438335418, "num_tokens": 345131741.0, "step": 33270 }, { "entropy": 0.6733140528202057, "epoch": 0.26624, "grad_norm": 2.075653076171875, "learning_rate": 3.6703081232493e-05, "loss": 0.6784, "mean_token_accuracy": 0.8076699078083038, "num_tokens": 345224229.0, "step": 33280 }, { "entropy": 0.7201551735401154, "epoch": 0.26632, "grad_norm": 2.1233339309692383, "learning_rate": 3.669907963185274e-05, "loss": 0.7211, "mean_token_accuracy": 0.7841022491455079, "num_tokens": 345367464.0, "step": 33290 }, { "entropy": 0.6332358539104461, "epoch": 0.2664, "grad_norm": 4.812973976135254, "learning_rate": 3.669507803121249e-05, "loss": 0.6175, "mean_token_accuracy": 0.8291017830371856, "num_tokens": 345408992.0, "step": 33300 }, { "entropy": 0.6725643634796142, "epoch": 0.26648, "grad_norm": 2.132345676422119, "learning_rate": 3.669107643057223e-05, "loss": 0.6734, "mean_token_accuracy": 0.7875061154365539, "num_tokens": 345572832.0, "step": 33310 }, { "entropy": 0.6502987712621688, "epoch": 0.26656, "grad_norm": 3.0437121391296387, "learning_rate": 3.6687074829931974e-05, "loss": 0.6464, "mean_token_accuracy": 0.8098945260047913, "num_tokens": 345661385.0, "step": 33320 }, { "entropy": 0.6556538045406342, "epoch": 0.26664, "grad_norm": 2.1077349185943604, "learning_rate": 3.668307322929172e-05, "loss": 0.6625, "mean_token_accuracy": 0.8027957141399383, "num_tokens": 345754576.0, "step": 33330 }, { "entropy": 0.701808226108551, "epoch": 0.26672, "grad_norm": 2.5247960090637207, "learning_rate": 3.667907162865146e-05, "loss": 0.6982, "mean_token_accuracy": 0.7856244146823883, "num_tokens": 345890601.0, "step": 33340 }, { "entropy": 0.7290285944938659, "epoch": 0.2668, "grad_norm": 5.466452598571777, "learning_rate": 3.6675070028011205e-05, "loss": 0.7317, "mean_token_accuracy": 0.8059246242046356, "num_tokens": 345926423.0, "step": 33350 }, { "entropy": 0.6253626257181167, "epoch": 0.26688, "grad_norm": 1.7493730783462524, "learning_rate": 3.6671068427370956e-05, "loss": 0.6243, "mean_token_accuracy": 0.798115748167038, "num_tokens": 346090122.0, "step": 33360 }, { "entropy": 0.7121848195791245, "epoch": 0.26696, "grad_norm": 3.3216631412506104, "learning_rate": 3.666706682673069e-05, "loss": 0.7029, "mean_token_accuracy": 0.7987848103046418, "num_tokens": 346174719.0, "step": 33370 }, { "entropy": 0.7172899961471557, "epoch": 0.26704, "grad_norm": 1.8434447050094604, "learning_rate": 3.6663065226090436e-05, "loss": 0.7028, "mean_token_accuracy": 0.7945362448692321, "num_tokens": 346269227.0, "step": 33380 }, { "entropy": 0.7265821039676666, "epoch": 0.26712, "grad_norm": 3.3790817260742188, "learning_rate": 3.665906362545018e-05, "loss": 0.7214, "mean_token_accuracy": 0.7866584360599518, "num_tokens": 346390794.0, "step": 33390 }, { "entropy": 0.7127594202756882, "epoch": 0.2672, "grad_norm": 6.181921482086182, "learning_rate": 3.665506202480993e-05, "loss": 0.726, "mean_token_accuracy": 0.811946016550064, "num_tokens": 346426000.0, "step": 33400 }, { "entropy": 0.6855173766613006, "epoch": 0.26728, "grad_norm": 1.385802149772644, "learning_rate": 3.665106042416967e-05, "loss": 0.6845, "mean_token_accuracy": 0.7876343548297882, "num_tokens": 346589840.0, "step": 33410 }, { "entropy": 0.6118706732988357, "epoch": 0.26736, "grad_norm": 3.4715492725372314, "learning_rate": 3.664705882352941e-05, "loss": 0.6125, "mean_token_accuracy": 0.8176046013832092, "num_tokens": 346675466.0, "step": 33420 }, { "entropy": 0.767383462190628, "epoch": 0.26744, "grad_norm": 1.445390224456787, "learning_rate": 3.664305722288916e-05, "loss": 0.7651, "mean_token_accuracy": 0.7838338553905487, "num_tokens": 346769034.0, "step": 33430 }, { "entropy": 0.6934061527252198, "epoch": 0.26752, "grad_norm": 3.7788193225860596, "learning_rate": 3.6639055622248905e-05, "loss": 0.6859, "mean_token_accuracy": 0.7931586146354676, "num_tokens": 346902152.0, "step": 33440 }, { "entropy": 0.7061992228031159, "epoch": 0.2676, "grad_norm": 4.494108200073242, "learning_rate": 3.663505402160864e-05, "loss": 0.7008, "mean_token_accuracy": 0.8114591598510742, "num_tokens": 346938862.0, "step": 33450 }, { "entropy": 0.6511054635047913, "epoch": 0.26768, "grad_norm": 2.640162944793701, "learning_rate": 3.6631052420968386e-05, "loss": 0.6549, "mean_token_accuracy": 0.7948851227760315, "num_tokens": 347101884.0, "step": 33460 }, { "entropy": 0.6678672462701798, "epoch": 0.26776, "grad_norm": 3.7923240661621094, "learning_rate": 3.6627050820328137e-05, "loss": 0.66, "mean_token_accuracy": 0.812489676475525, "num_tokens": 347182434.0, "step": 33470 }, { "entropy": 0.7225765645503998, "epoch": 0.26784, "grad_norm": 2.3706822395324707, "learning_rate": 3.662304921968788e-05, "loss": 0.7058, "mean_token_accuracy": 0.798636394739151, "num_tokens": 347278741.0, "step": 33480 }, { "entropy": 0.7575514376163482, "epoch": 0.26792, "grad_norm": 3.520284414291382, "learning_rate": 3.661904761904762e-05, "loss": 0.7469, "mean_token_accuracy": 0.7804635643959046, "num_tokens": 347410816.0, "step": 33490 }, { "entropy": 0.7305040478706359, "epoch": 0.268, "grad_norm": 5.739903926849365, "learning_rate": 3.661504601840737e-05, "loss": 0.7275, "mean_token_accuracy": 0.805477398633957, "num_tokens": 347449773.0, "step": 33500 }, { "entropy": 0.6931849539279937, "epoch": 0.26808, "grad_norm": 2.1873013973236084, "learning_rate": 3.661104441776711e-05, "loss": 0.7012, "mean_token_accuracy": 0.7843347132205963, "num_tokens": 347610736.0, "step": 33510 }, { "entropy": 0.667041277885437, "epoch": 0.26816, "grad_norm": 3.2395777702331543, "learning_rate": 3.6607042817126855e-05, "loss": 0.6498, "mean_token_accuracy": 0.8137198686599731, "num_tokens": 347682704.0, "step": 33520 }, { "entropy": 0.763819295167923, "epoch": 0.26824, "grad_norm": 2.303605079650879, "learning_rate": 3.660304121648659e-05, "loss": 0.7756, "mean_token_accuracy": 0.7852422833442688, "num_tokens": 347774903.0, "step": 33530 }, { "entropy": 0.7124989986419678, "epoch": 0.26832, "grad_norm": 2.3232665061950684, "learning_rate": 3.659903961584634e-05, "loss": 0.7085, "mean_token_accuracy": 0.7879848837852478, "num_tokens": 347904034.0, "step": 33540 }, { "entropy": 0.7369004964828492, "epoch": 0.2684, "grad_norm": 4.350417137145996, "learning_rate": 3.6595038015206086e-05, "loss": 0.7422, "mean_token_accuracy": 0.8083331465721131, "num_tokens": 347941537.0, "step": 33550 }, { "entropy": 0.6313310503959656, "epoch": 0.26848, "grad_norm": 2.2571842670440674, "learning_rate": 3.659103641456583e-05, "loss": 0.6381, "mean_token_accuracy": 0.798076456785202, "num_tokens": 348105377.0, "step": 33560 }, { "entropy": 0.6960122287273407, "epoch": 0.26856, "grad_norm": 4.748898506164551, "learning_rate": 3.6587034813925574e-05, "loss": 0.6872, "mean_token_accuracy": 0.8000573873519897, "num_tokens": 348193664.0, "step": 33570 }, { "entropy": 0.671344131231308, "epoch": 0.26864, "grad_norm": 2.2358109951019287, "learning_rate": 3.658303321328532e-05, "loss": 0.6544, "mean_token_accuracy": 0.8078294515609741, "num_tokens": 348287130.0, "step": 33580 }, { "entropy": 0.7073175072669983, "epoch": 0.26872, "grad_norm": 2.1297552585601807, "learning_rate": 3.657903161264506e-05, "loss": 0.7064, "mean_token_accuracy": 0.783602386713028, "num_tokens": 348431280.0, "step": 33590 }, { "entropy": 0.6798636734485626, "epoch": 0.2688, "grad_norm": 4.577816009521484, "learning_rate": 3.6575030012004805e-05, "loss": 0.6731, "mean_token_accuracy": 0.8163486361503601, "num_tokens": 348478052.0, "step": 33600 }, { "entropy": 0.6121896386146546, "epoch": 0.26888, "grad_norm": 1.3386650085449219, "learning_rate": 3.657102841136455e-05, "loss": 0.6098, "mean_token_accuracy": 0.8037249565124511, "num_tokens": 348641892.0, "step": 33610 }, { "entropy": 0.7384300172328949, "epoch": 0.26896, "grad_norm": 2.5730113983154297, "learning_rate": 3.656702681072429e-05, "loss": 0.7333, "mean_token_accuracy": 0.7902854740619659, "num_tokens": 348732621.0, "step": 33620 }, { "entropy": 0.6942206084728241, "epoch": 0.26904, "grad_norm": 1.9748036861419678, "learning_rate": 3.6563025210084036e-05, "loss": 0.6902, "mean_token_accuracy": 0.8026382744312286, "num_tokens": 348827171.0, "step": 33630 }, { "entropy": 0.6210346400737763, "epoch": 0.26912, "grad_norm": 2.0216166973114014, "learning_rate": 3.655902360944378e-05, "loss": 0.6207, "mean_token_accuracy": 0.8093623459339142, "num_tokens": 348966629.0, "step": 33640 }, { "entropy": 0.7332232356071472, "epoch": 0.2692, "grad_norm": 5.177121639251709, "learning_rate": 3.6555022008803523e-05, "loss": 0.731, "mean_token_accuracy": 0.8103385269641876, "num_tokens": 349004279.0, "step": 33650 }, { "entropy": 0.6615281164646148, "epoch": 0.26928, "grad_norm": 2.965123176574707, "learning_rate": 3.655102040816327e-05, "loss": 0.662, "mean_token_accuracy": 0.7931851506233215, "num_tokens": 349168119.0, "step": 33660 }, { "entropy": 0.6162289321422577, "epoch": 0.26936, "grad_norm": 4.773803234100342, "learning_rate": 3.654701880752301e-05, "loss": 0.6108, "mean_token_accuracy": 0.8202298164367676, "num_tokens": 349261317.0, "step": 33670 }, { "entropy": 0.6514840066432953, "epoch": 0.26944, "grad_norm": 1.7008777856826782, "learning_rate": 3.6543017206882755e-05, "loss": 0.6489, "mean_token_accuracy": 0.8130769371986389, "num_tokens": 349355439.0, "step": 33680 }, { "entropy": 0.7270500779151916, "epoch": 0.26952, "grad_norm": 2.344510793685913, "learning_rate": 3.65390156062425e-05, "loss": 0.7199, "mean_token_accuracy": 0.7813433825969696, "num_tokens": 349479371.0, "step": 33690 }, { "entropy": 0.6875588774681092, "epoch": 0.2696, "grad_norm": 4.566030979156494, "learning_rate": 3.653501400560224e-05, "loss": 0.6785, "mean_token_accuracy": 0.8144361972808838, "num_tokens": 349516273.0, "step": 33700 }, { "entropy": 0.6592470109462738, "epoch": 0.26968, "grad_norm": 1.3562294244766235, "learning_rate": 3.6531012404961986e-05, "loss": 0.6545, "mean_token_accuracy": 0.7915486216545105, "num_tokens": 349680113.0, "step": 33710 }, { "entropy": 0.6333114951848984, "epoch": 0.26976, "grad_norm": 2.4488143920898438, "learning_rate": 3.652701080432173e-05, "loss": 0.6291, "mean_token_accuracy": 0.8167032957077026, "num_tokens": 349776541.0, "step": 33720 }, { "entropy": 0.7257685363292694, "epoch": 0.26984, "grad_norm": 2.619191884994507, "learning_rate": 3.652300920368147e-05, "loss": 0.722, "mean_token_accuracy": 0.7936597228050232, "num_tokens": 349872554.0, "step": 33730 }, { "entropy": 0.630198672413826, "epoch": 0.26992, "grad_norm": 2.4313313961029053, "learning_rate": 3.651900760304122e-05, "loss": 0.6331, "mean_token_accuracy": 0.803229832649231, "num_tokens": 350002011.0, "step": 33740 }, { "entropy": 0.7239945441484451, "epoch": 0.27, "grad_norm": 4.6299638748168945, "learning_rate": 3.651500600240097e-05, "loss": 0.7236, "mean_token_accuracy": 0.80870481133461, "num_tokens": 350037578.0, "step": 33750 }, { "entropy": 0.6760689318180084, "epoch": 0.27008, "grad_norm": 1.76910400390625, "learning_rate": 3.6511004401760704e-05, "loss": 0.6725, "mean_token_accuracy": 0.7871519327163696, "num_tokens": 350201418.0, "step": 33760 }, { "entropy": 0.6492465078830719, "epoch": 0.27016, "grad_norm": 3.5600433349609375, "learning_rate": 3.650700280112045e-05, "loss": 0.6449, "mean_token_accuracy": 0.8111008524894714, "num_tokens": 350285888.0, "step": 33770 }, { "entropy": 0.7266257166862488, "epoch": 0.27024, "grad_norm": 1.845380425453186, "learning_rate": 3.650300120048019e-05, "loss": 0.7248, "mean_token_accuracy": 0.791782546043396, "num_tokens": 350378716.0, "step": 33780 }, { "entropy": 0.7241268754005432, "epoch": 0.27032, "grad_norm": 2.6828181743621826, "learning_rate": 3.649899959983994e-05, "loss": 0.7281, "mean_token_accuracy": 0.7822962939739228, "num_tokens": 350511739.0, "step": 33790 }, { "entropy": 0.7371690154075623, "epoch": 0.2704, "grad_norm": 4.517561912536621, "learning_rate": 3.649499799919968e-05, "loss": 0.7125, "mean_token_accuracy": 0.8074596405029297, "num_tokens": 350547867.0, "step": 33800 }, { "entropy": 0.6387527883052826, "epoch": 0.27048, "grad_norm": 1.534494400024414, "learning_rate": 3.649099639855942e-05, "loss": 0.6446, "mean_token_accuracy": 0.7941560864448547, "num_tokens": 350711707.0, "step": 33810 }, { "entropy": 0.66025450527668, "epoch": 0.27056, "grad_norm": 3.5820086002349854, "learning_rate": 3.648699479791917e-05, "loss": 0.6621, "mean_token_accuracy": 0.8083414316177369, "num_tokens": 350800495.0, "step": 33820 }, { "entropy": 0.7479409635066986, "epoch": 0.27064, "grad_norm": 2.0495660305023193, "learning_rate": 3.648299319727892e-05, "loss": 0.7348, "mean_token_accuracy": 0.7920852839946747, "num_tokens": 350894052.0, "step": 33830 }, { "entropy": 0.7137787938117981, "epoch": 0.27072, "grad_norm": 2.538778066635132, "learning_rate": 3.6478991596638654e-05, "loss": 0.7195, "mean_token_accuracy": 0.7845139026641845, "num_tokens": 351026200.0, "step": 33840 }, { "entropy": 0.7466339468955994, "epoch": 0.2708, "grad_norm": 3.9396724700927734, "learning_rate": 3.64749899959984e-05, "loss": 0.7526, "mean_token_accuracy": 0.802480947971344, "num_tokens": 351061299.0, "step": 33850 }, { "entropy": 0.7102341473102569, "epoch": 0.27088, "grad_norm": 1.4849681854248047, "learning_rate": 3.647098839535815e-05, "loss": 0.7114, "mean_token_accuracy": 0.7825912773609162, "num_tokens": 351224455.0, "step": 33860 }, { "entropy": 0.71891950070858, "epoch": 0.27096, "grad_norm": 3.461751699447632, "learning_rate": 3.646698679471789e-05, "loss": 0.6927, "mean_token_accuracy": 0.7993570268154144, "num_tokens": 351297528.0, "step": 33870 }, { "entropy": 0.6514715552330017, "epoch": 0.27104, "grad_norm": 1.605654239654541, "learning_rate": 3.646298519407763e-05, "loss": 0.6541, "mean_token_accuracy": 0.8114885151386261, "num_tokens": 351390470.0, "step": 33880 }, { "entropy": 0.6818810343742371, "epoch": 0.27112, "grad_norm": 2.4401779174804688, "learning_rate": 3.645898359343738e-05, "loss": 0.6827, "mean_token_accuracy": 0.7933464765548706, "num_tokens": 351528392.0, "step": 33890 }, { "entropy": 0.6946196675300598, "epoch": 0.2712, "grad_norm": 5.100098133087158, "learning_rate": 3.645498199279712e-05, "loss": 0.6975, "mean_token_accuracy": 0.8141722977161407, "num_tokens": 351570734.0, "step": 33900 }, { "entropy": 0.6849810183048248, "epoch": 0.27128, "grad_norm": 2.2358241081237793, "learning_rate": 3.645098039215687e-05, "loss": 0.6792, "mean_token_accuracy": 0.7902757525444031, "num_tokens": 351734164.0, "step": 33910 }, { "entropy": 0.6249611884355545, "epoch": 0.27136, "grad_norm": 3.440620183944702, "learning_rate": 3.6446978791516604e-05, "loss": 0.6152, "mean_token_accuracy": 0.8249225556850434, "num_tokens": 351811069.0, "step": 33920 }, { "entropy": 0.7073958277702331, "epoch": 0.27144, "grad_norm": 1.6536269187927246, "learning_rate": 3.6442977190876354e-05, "loss": 0.6988, "mean_token_accuracy": 0.8015464544296265, "num_tokens": 351903762.0, "step": 33930 }, { "entropy": 0.7216625332832336, "epoch": 0.27152, "grad_norm": 2.150146484375, "learning_rate": 3.64389755902361e-05, "loss": 0.7172, "mean_token_accuracy": 0.788016963005066, "num_tokens": 352046752.0, "step": 33940 }, { "entropy": 0.6550917744636535, "epoch": 0.2716, "grad_norm": 4.835210800170898, "learning_rate": 3.643497398959584e-05, "loss": 0.6507, "mean_token_accuracy": 0.8202793717384338, "num_tokens": 352087688.0, "step": 33950 }, { "entropy": 0.6551970541477203, "epoch": 0.27168, "grad_norm": 1.630646824836731, "learning_rate": 3.6430972388955585e-05, "loss": 0.6528, "mean_token_accuracy": 0.7960552096366882, "num_tokens": 352251528.0, "step": 33960 }, { "entropy": 0.6608335763216019, "epoch": 0.27176, "grad_norm": 4.72820520401001, "learning_rate": 3.642697078831533e-05, "loss": 0.6561, "mean_token_accuracy": 0.8104414284229279, "num_tokens": 352340289.0, "step": 33970 }, { "entropy": 0.7210727274417877, "epoch": 0.27184, "grad_norm": 2.3490419387817383, "learning_rate": 3.642296918767507e-05, "loss": 0.7212, "mean_token_accuracy": 0.7929614841938019, "num_tokens": 352434381.0, "step": 33980 }, { "entropy": 0.6958214998245239, "epoch": 0.27192, "grad_norm": 3.491304874420166, "learning_rate": 3.6418967587034816e-05, "loss": 0.6844, "mean_token_accuracy": 0.7907333612442017, "num_tokens": 352567018.0, "step": 33990 }, { "entropy": 0.645607304573059, "epoch": 0.272, "grad_norm": 4.160228729248047, "learning_rate": 3.641496598639456e-05, "loss": 0.6427, "mean_token_accuracy": 0.8226007103919983, "num_tokens": 352603596.0, "step": 34000 }, { "entropy": 0.655224335193634, "epoch": 0.27208, "grad_norm": 1.649152398109436, "learning_rate": 3.6410964385754304e-05, "loss": 0.6585, "mean_token_accuracy": 0.7911272704601288, "num_tokens": 352767436.0, "step": 34010 }, { "entropy": 0.7253628730773926, "epoch": 0.27216, "grad_norm": 3.227020502090454, "learning_rate": 3.640696278511405e-05, "loss": 0.7241, "mean_token_accuracy": 0.793191397190094, "num_tokens": 352864612.0, "step": 34020 }, { "entropy": 0.7778690218925476, "epoch": 0.27224, "grad_norm": 2.934138298034668, "learning_rate": 3.640296118447379e-05, "loss": 0.7727, "mean_token_accuracy": 0.7817770957946777, "num_tokens": 352958705.0, "step": 34030 }, { "entropy": 0.7449288547039032, "epoch": 0.27232, "grad_norm": 2.438206434249878, "learning_rate": 3.6398959583833535e-05, "loss": 0.7515, "mean_token_accuracy": 0.7817762911319732, "num_tokens": 353087846.0, "step": 34040 }, { "entropy": 0.7458377480506897, "epoch": 0.2724, "grad_norm": 4.292140960693359, "learning_rate": 3.639495798319328e-05, "loss": 0.7403, "mean_token_accuracy": 0.8093687772750855, "num_tokens": 353124493.0, "step": 34050 }, { "entropy": 0.7175472021102905, "epoch": 0.27248, "grad_norm": 2.2378616333007812, "learning_rate": 3.639095638255302e-05, "loss": 0.7094, "mean_token_accuracy": 0.7856191992759705, "num_tokens": 353288333.0, "step": 34060 }, { "entropy": 0.6950196474790573, "epoch": 0.27256, "grad_norm": 3.7680225372314453, "learning_rate": 3.6386954781912766e-05, "loss": 0.6858, "mean_token_accuracy": 0.8014541149139405, "num_tokens": 353371854.0, "step": 34070 }, { "entropy": 0.7416689991950989, "epoch": 0.27264, "grad_norm": 1.857533574104309, "learning_rate": 3.638295318127251e-05, "loss": 0.744, "mean_token_accuracy": 0.7960905969142914, "num_tokens": 353465862.0, "step": 34080 }, { "entropy": 0.6866984456777573, "epoch": 0.27272, "grad_norm": 2.4182238578796387, "learning_rate": 3.6378951580632254e-05, "loss": 0.6847, "mean_token_accuracy": 0.792459374666214, "num_tokens": 353600853.0, "step": 34090 }, { "entropy": 0.6921685576438904, "epoch": 0.2728, "grad_norm": 4.643912315368652, "learning_rate": 3.6374949979992004e-05, "loss": 0.6804, "mean_token_accuracy": 0.8164550960063934, "num_tokens": 353640909.0, "step": 34100 }, { "entropy": 0.6955572903156281, "epoch": 0.27288, "grad_norm": 1.677168369293213, "learning_rate": 3.637094837935174e-05, "loss": 0.703, "mean_token_accuracy": 0.7818785071372986, "num_tokens": 353802328.0, "step": 34110 }, { "entropy": 0.6718800961971283, "epoch": 0.27296, "grad_norm": 3.2620034217834473, "learning_rate": 3.6366946778711485e-05, "loss": 0.6613, "mean_token_accuracy": 0.8114057064056397, "num_tokens": 353879839.0, "step": 34120 }, { "entropy": 0.6644150793552399, "epoch": 0.27304, "grad_norm": 1.4801595211029053, "learning_rate": 3.636294517807123e-05, "loss": 0.6641, "mean_token_accuracy": 0.8065618872642517, "num_tokens": 353973988.0, "step": 34130 }, { "entropy": 0.6591017305850982, "epoch": 0.27312, "grad_norm": 3.333665370941162, "learning_rate": 3.635894357743098e-05, "loss": 0.6612, "mean_token_accuracy": 0.7981265008449554, "num_tokens": 354101021.0, "step": 34140 }, { "entropy": 0.784100741147995, "epoch": 0.2732, "grad_norm": 4.334733486175537, "learning_rate": 3.6354941976790716e-05, "loss": 0.781, "mean_token_accuracy": 0.795858371257782, "num_tokens": 354134676.0, "step": 34150 }, { "entropy": 0.7082462191581727, "epoch": 0.27328, "grad_norm": 2.4521830081939697, "learning_rate": 3.635094037615046e-05, "loss": 0.7045, "mean_token_accuracy": 0.783133864402771, "num_tokens": 354298516.0, "step": 34160 }, { "entropy": 0.6889138996601105, "epoch": 0.27336, "grad_norm": 2.8494346141815186, "learning_rate": 3.63469387755102e-05, "loss": 0.6906, "mean_token_accuracy": 0.800429391860962, "num_tokens": 354392775.0, "step": 34170 }, { "entropy": 0.6331130504608155, "epoch": 0.27344, "grad_norm": 1.5266941785812378, "learning_rate": 3.6342937174869954e-05, "loss": 0.6414, "mean_token_accuracy": 0.8112362921237946, "num_tokens": 354487503.0, "step": 34180 }, { "entropy": 0.7151282727718353, "epoch": 0.27352, "grad_norm": 2.5851762294769287, "learning_rate": 3.633893557422969e-05, "loss": 0.7002, "mean_token_accuracy": 0.7898857891559601, "num_tokens": 354630062.0, "step": 34190 }, { "entropy": 0.6473446428775788, "epoch": 0.2736, "grad_norm": 4.661618232727051, "learning_rate": 3.6334933973589434e-05, "loss": 0.635, "mean_token_accuracy": 0.8271297335624694, "num_tokens": 354671575.0, "step": 34200 }, { "entropy": 0.6234721004962921, "epoch": 0.27368, "grad_norm": 1.4560410976409912, "learning_rate": 3.6330932372949185e-05, "loss": 0.629, "mean_token_accuracy": 0.7997429192066192, "num_tokens": 354834856.0, "step": 34210 }, { "entropy": 0.6754721134901047, "epoch": 0.27376, "grad_norm": 3.541595458984375, "learning_rate": 3.632693077230893e-05, "loss": 0.6605, "mean_token_accuracy": 0.8117192566394806, "num_tokens": 354910274.0, "step": 34220 }, { "entropy": 0.6827936947345734, "epoch": 0.27384, "grad_norm": 1.6373560428619385, "learning_rate": 3.6322929171668666e-05, "loss": 0.6809, "mean_token_accuracy": 0.8023592352867126, "num_tokens": 355005006.0, "step": 34230 }, { "entropy": 0.6922662258148193, "epoch": 0.27392, "grad_norm": 3.4780149459838867, "learning_rate": 3.631892757102841e-05, "loss": 0.6978, "mean_token_accuracy": 0.7866712927818298, "num_tokens": 355131935.0, "step": 34240 }, { "entropy": 0.8091672956943512, "epoch": 0.274, "grad_norm": 5.887338638305664, "learning_rate": 3.631492597038816e-05, "loss": 0.8072, "mean_token_accuracy": 0.7946574568748475, "num_tokens": 355170766.0, "step": 34250 }, { "entropy": 0.675929707288742, "epoch": 0.27408, "grad_norm": 2.1452417373657227, "learning_rate": 3.6310924369747904e-05, "loss": 0.6732, "mean_token_accuracy": 0.790724229812622, "num_tokens": 355334606.0, "step": 34260 }, { "entropy": 0.6187348276376724, "epoch": 0.27416, "grad_norm": 3.0120513439178467, "learning_rate": 3.630692276910764e-05, "loss": 0.618, "mean_token_accuracy": 0.8184073567390442, "num_tokens": 355425094.0, "step": 34270 }, { "entropy": 0.7118103265762329, "epoch": 0.27424, "grad_norm": 1.5785270929336548, "learning_rate": 3.630292116846739e-05, "loss": 0.7138, "mean_token_accuracy": 0.7966173708438873, "num_tokens": 355520790.0, "step": 34280 }, { "entropy": 0.7015772521495819, "epoch": 0.27432, "grad_norm": 2.0469086170196533, "learning_rate": 3.6298919567827135e-05, "loss": 0.6927, "mean_token_accuracy": 0.7887726783752441, "num_tokens": 355662317.0, "step": 34290 }, { "entropy": 0.7190587997436524, "epoch": 0.2744, "grad_norm": 3.5661089420318604, "learning_rate": 3.629491796718688e-05, "loss": 0.7093, "mean_token_accuracy": 0.8108061850070953, "num_tokens": 355705327.0, "step": 34300 }, { "entropy": 0.6295597314834595, "epoch": 0.27448, "grad_norm": 2.0502521991729736, "learning_rate": 3.6290916366546615e-05, "loss": 0.6304, "mean_token_accuracy": 0.7995053648948669, "num_tokens": 355869167.0, "step": 34310 }, { "entropy": 0.6365151047706604, "epoch": 0.27456, "grad_norm": 3.439326524734497, "learning_rate": 3.6286914765906366e-05, "loss": 0.6263, "mean_token_accuracy": 0.8101549506187439, "num_tokens": 355973030.0, "step": 34320 }, { "entropy": 0.7337982952594757, "epoch": 0.27464, "grad_norm": 1.9601610898971558, "learning_rate": 3.628291316526611e-05, "loss": 0.731, "mean_token_accuracy": 0.797193729877472, "num_tokens": 356066644.0, "step": 34330 }, { "entropy": 0.6837211608886719, "epoch": 0.27472, "grad_norm": 2.87056827545166, "learning_rate": 3.627891156462585e-05, "loss": 0.6801, "mean_token_accuracy": 0.79256951212883, "num_tokens": 356206234.0, "step": 34340 }, { "entropy": 0.6444379717111588, "epoch": 0.2748, "grad_norm": 4.358171463012695, "learning_rate": 3.62749099639856e-05, "loss": 0.6366, "mean_token_accuracy": 0.8298060297966003, "num_tokens": 356243002.0, "step": 34350 }, { "entropy": 0.6977206289768219, "epoch": 0.27488, "grad_norm": 1.8135493993759155, "learning_rate": 3.627090836334534e-05, "loss": 0.695, "mean_token_accuracy": 0.7833075165748596, "num_tokens": 356406732.0, "step": 34360 }, { "entropy": 0.6273634225130081, "epoch": 0.27496, "grad_norm": 3.057716131210327, "learning_rate": 3.6266906762705084e-05, "loss": 0.6321, "mean_token_accuracy": 0.8132675230503082, "num_tokens": 356496460.0, "step": 34370 }, { "entropy": 0.6984886765480042, "epoch": 0.27504, "grad_norm": 1.2711647748947144, "learning_rate": 3.626290516206483e-05, "loss": 0.7122, "mean_token_accuracy": 0.7949736177921295, "num_tokens": 356591050.0, "step": 34380 }, { "entropy": 0.6883631467819213, "epoch": 0.27512, "grad_norm": 2.9768176078796387, "learning_rate": 3.625890356142457e-05, "loss": 0.684, "mean_token_accuracy": 0.7866048395633698, "num_tokens": 356738227.0, "step": 34390 }, { "entropy": 0.6044253647327423, "epoch": 0.2752, "grad_norm": 4.7537665367126465, "learning_rate": 3.6254901960784316e-05, "loss": 0.5949, "mean_token_accuracy": 0.8399396300315857, "num_tokens": 356781050.0, "step": 34400 }, { "entropy": 0.6453481793403626, "epoch": 0.27528, "grad_norm": 2.1602072715759277, "learning_rate": 3.625090036014406e-05, "loss": 0.6439, "mean_token_accuracy": 0.791908347606659, "num_tokens": 356943595.0, "step": 34410 }, { "entropy": 0.7293302804231644, "epoch": 0.27536, "grad_norm": 3.695237636566162, "learning_rate": 3.62468987595038e-05, "loss": 0.7101, "mean_token_accuracy": 0.8016081392765045, "num_tokens": 357019356.0, "step": 34420 }, { "entropy": 0.7240820705890656, "epoch": 0.27544, "grad_norm": 2.4905221462249756, "learning_rate": 3.624289715886355e-05, "loss": 0.7383, "mean_token_accuracy": 0.7941634953022003, "num_tokens": 357111792.0, "step": 34430 }, { "entropy": 0.6230082213878632, "epoch": 0.27552, "grad_norm": 2.2751080989837646, "learning_rate": 3.623889555822329e-05, "loss": 0.621, "mean_token_accuracy": 0.8053069651126862, "num_tokens": 357253628.0, "step": 34440 }, { "entropy": 0.6445761263370514, "epoch": 0.2756, "grad_norm": 5.939539909362793, "learning_rate": 3.6234893957583034e-05, "loss": 0.6287, "mean_token_accuracy": 0.8271118879318238, "num_tokens": 357293886.0, "step": 34450 }, { "entropy": 0.664597111940384, "epoch": 0.27568, "grad_norm": 1.6531615257263184, "learning_rate": 3.623089235694278e-05, "loss": 0.6736, "mean_token_accuracy": 0.7899652540683746, "num_tokens": 357452301.0, "step": 34460 }, { "entropy": 0.6681137204170227, "epoch": 0.27576, "grad_norm": 2.8781015872955322, "learning_rate": 3.622689075630252e-05, "loss": 0.6605, "mean_token_accuracy": 0.8133291304111481, "num_tokens": 357530153.0, "step": 34470 }, { "entropy": 0.6999089896678925, "epoch": 0.27584, "grad_norm": 1.4608676433563232, "learning_rate": 3.6222889155662265e-05, "loss": 0.6878, "mean_token_accuracy": 0.8004487872123718, "num_tokens": 357624896.0, "step": 34480 }, { "entropy": 0.6478185325860977, "epoch": 0.27592, "grad_norm": 1.9174144268035889, "learning_rate": 3.6218887555022016e-05, "loss": 0.6562, "mean_token_accuracy": 0.7954195857048034, "num_tokens": 357771379.0, "step": 34490 }, { "entropy": 0.6922843664884567, "epoch": 0.276, "grad_norm": 5.763391017913818, "learning_rate": 3.621488595438175e-05, "loss": 0.6753, "mean_token_accuracy": 0.8177542805671691, "num_tokens": 357813886.0, "step": 34500 }, { "entropy": 0.6641625046730042, "epoch": 0.27608, "grad_norm": 1.6215773820877075, "learning_rate": 3.6210884353741496e-05, "loss": 0.6711, "mean_token_accuracy": 0.7892159223556519, "num_tokens": 357977726.0, "step": 34510 }, { "entropy": 0.7465814232826233, "epoch": 0.27616, "grad_norm": 2.9046082496643066, "learning_rate": 3.620688275310124e-05, "loss": 0.7329, "mean_token_accuracy": 0.7957541465759277, "num_tokens": 358071761.0, "step": 34520 }, { "entropy": 0.7636492729187012, "epoch": 0.27624, "grad_norm": 1.744908332824707, "learning_rate": 3.620288115246099e-05, "loss": 0.7561, "mean_token_accuracy": 0.7806949853897095, "num_tokens": 358167160.0, "step": 34530 }, { "entropy": 0.6986987471580506, "epoch": 0.27632, "grad_norm": 2.055143356323242, "learning_rate": 3.619887955182073e-05, "loss": 0.7083, "mean_token_accuracy": 0.7837063431739807, "num_tokens": 358315367.0, "step": 34540 }, { "entropy": 0.6742980003356933, "epoch": 0.2764, "grad_norm": 3.56404709815979, "learning_rate": 3.619487795118047e-05, "loss": 0.6594, "mean_token_accuracy": 0.8228295624256134, "num_tokens": 358355645.0, "step": 34550 }, { "entropy": 0.660798579454422, "epoch": 0.27648, "grad_norm": 1.4952048063278198, "learning_rate": 3.619087635054022e-05, "loss": 0.6626, "mean_token_accuracy": 0.790449446439743, "num_tokens": 358519485.0, "step": 34560 }, { "entropy": 0.6820666074752808, "epoch": 0.27656, "grad_norm": 3.3860106468200684, "learning_rate": 3.6186874749899965e-05, "loss": 0.6782, "mean_token_accuracy": 0.8044248998165131, "num_tokens": 358612375.0, "step": 34570 }, { "entropy": 0.7465920805931091, "epoch": 0.27664, "grad_norm": 1.589476227760315, "learning_rate": 3.61828731492597e-05, "loss": 0.7504, "mean_token_accuracy": 0.7880515635013581, "num_tokens": 358707543.0, "step": 34580 }, { "entropy": 0.7109765827655792, "epoch": 0.27672, "grad_norm": 2.1722874641418457, "learning_rate": 3.6178871548619446e-05, "loss": 0.7068, "mean_token_accuracy": 0.7844734489917755, "num_tokens": 358853373.0, "step": 34590 }, { "entropy": 0.7419956952333451, "epoch": 0.2768, "grad_norm": 4.080550670623779, "learning_rate": 3.6174869947979197e-05, "loss": 0.7291, "mean_token_accuracy": 0.8088881433010101, "num_tokens": 358897979.0, "step": 34600 }, { "entropy": 0.637049549818039, "epoch": 0.27688, "grad_norm": 1.4422529935836792, "learning_rate": 3.617086834733894e-05, "loss": 0.6391, "mean_token_accuracy": 0.7969040155410767, "num_tokens": 359061819.0, "step": 34610 }, { "entropy": 0.6083596587181092, "epoch": 0.27696, "grad_norm": 3.1683621406555176, "learning_rate": 3.616686674669868e-05, "loss": 0.603, "mean_token_accuracy": 0.8230327069759369, "num_tokens": 359148739.0, "step": 34620 }, { "entropy": 0.6795733332633972, "epoch": 0.27704, "grad_norm": 1.4640923738479614, "learning_rate": 3.616286514605843e-05, "loss": 0.6883, "mean_token_accuracy": 0.8056971549987793, "num_tokens": 359242055.0, "step": 34630 }, { "entropy": 0.7023892104625702, "epoch": 0.27712, "grad_norm": 2.06038236618042, "learning_rate": 3.615886354541817e-05, "loss": 0.696, "mean_token_accuracy": 0.7890862464904785, "num_tokens": 359383912.0, "step": 34640 }, { "entropy": 0.6531992554664612, "epoch": 0.2772, "grad_norm": 4.828322887420654, "learning_rate": 3.6154861944777915e-05, "loss": 0.6511, "mean_token_accuracy": 0.8245819866657257, "num_tokens": 359422029.0, "step": 34650 }, { "entropy": 0.7210364460945129, "epoch": 0.27728, "grad_norm": 1.9074143171310425, "learning_rate": 3.615086034413765e-05, "loss": 0.7208, "mean_token_accuracy": 0.7805257022380829, "num_tokens": 359581760.0, "step": 34660 }, { "entropy": 0.691352903842926, "epoch": 0.27736, "grad_norm": 3.407313108444214, "learning_rate": 3.61468587434974e-05, "loss": 0.6809, "mean_token_accuracy": 0.8081143260002136, "num_tokens": 359647553.0, "step": 34670 }, { "entropy": 0.656954312324524, "epoch": 0.27744, "grad_norm": 1.9003095626831055, "learning_rate": 3.6142857142857146e-05, "loss": 0.6599, "mean_token_accuracy": 0.8100811183452606, "num_tokens": 359741647.0, "step": 34680 }, { "entropy": 0.6908973693847656, "epoch": 0.27752, "grad_norm": 2.976045608520508, "learning_rate": 3.613885554221689e-05, "loss": 0.6956, "mean_token_accuracy": 0.7869316339492798, "num_tokens": 359894388.0, "step": 34690 }, { "entropy": 0.6238763064146042, "epoch": 0.2776, "grad_norm": 4.190536022186279, "learning_rate": 3.613485394157663e-05, "loss": 0.6228, "mean_token_accuracy": 0.8299234092235566, "num_tokens": 359938678.0, "step": 34700 }, { "entropy": 0.6380058228969574, "epoch": 0.27768, "grad_norm": 1.6532474756240845, "learning_rate": 3.613085234093638e-05, "loss": 0.6358, "mean_token_accuracy": 0.7973803222179413, "num_tokens": 360102518.0, "step": 34710 }, { "entropy": 0.6561073929071426, "epoch": 0.27776, "grad_norm": 3.9063050746917725, "learning_rate": 3.612685074029612e-05, "loss": 0.6458, "mean_token_accuracy": 0.8092326998710633, "num_tokens": 360190712.0, "step": 34720 }, { "entropy": 0.6953338503837585, "epoch": 0.27784, "grad_norm": 2.147874355316162, "learning_rate": 3.6122849139655865e-05, "loss": 0.6937, "mean_token_accuracy": 0.7991531312465667, "num_tokens": 360284716.0, "step": 34730 }, { "entropy": 0.720032274723053, "epoch": 0.27792, "grad_norm": 3.0696616172790527, "learning_rate": 3.611884753901561e-05, "loss": 0.7172, "mean_token_accuracy": 0.7879740595817566, "num_tokens": 360422899.0, "step": 34740 }, { "entropy": 0.675969910621643, "epoch": 0.278, "grad_norm": 4.188307762145996, "learning_rate": 3.611484593837535e-05, "loss": 0.6621, "mean_token_accuracy": 0.817390650510788, "num_tokens": 360460588.0, "step": 34750 }, { "entropy": 0.662059074640274, "epoch": 0.27808, "grad_norm": 1.4313427209854126, "learning_rate": 3.6110844337735096e-05, "loss": 0.6632, "mean_token_accuracy": 0.7893869042396545, "num_tokens": 360624428.0, "step": 34760 }, { "entropy": 0.6468392252922058, "epoch": 0.27816, "grad_norm": 3.3704187870025635, "learning_rate": 3.610684273709484e-05, "loss": 0.6433, "mean_token_accuracy": 0.8166795372962952, "num_tokens": 360706203.0, "step": 34770 }, { "entropy": 0.6739467263221741, "epoch": 0.27824, "grad_norm": 2.2365503311157227, "learning_rate": 3.6102841136454583e-05, "loss": 0.6734, "mean_token_accuracy": 0.8067776620388031, "num_tokens": 360799802.0, "step": 34780 }, { "entropy": 0.6860574007034301, "epoch": 0.27832, "grad_norm": 3.1711039543151855, "learning_rate": 3.609883953581433e-05, "loss": 0.6904, "mean_token_accuracy": 0.7876031398773193, "num_tokens": 360949113.0, "step": 34790 }, { "entropy": 0.6827875375747681, "epoch": 0.2784, "grad_norm": 5.044741630554199, "learning_rate": 3.609483793517407e-05, "loss": 0.6729, "mean_token_accuracy": 0.8179840624332428, "num_tokens": 360992733.0, "step": 34800 }, { "entropy": 0.6639491975307464, "epoch": 0.27848, "grad_norm": 1.9968266487121582, "learning_rate": 3.6090836334533815e-05, "loss": 0.6545, "mean_token_accuracy": 0.7939240396022796, "num_tokens": 361156573.0, "step": 34810 }, { "entropy": 0.6791399836540222, "epoch": 0.27856, "grad_norm": 4.288295745849609, "learning_rate": 3.608683473389356e-05, "loss": 0.6846, "mean_token_accuracy": 0.8040995180606842, "num_tokens": 361234277.0, "step": 34820 }, { "entropy": 0.6844485521316528, "epoch": 0.27864, "grad_norm": 1.3775734901428223, "learning_rate": 3.60828331332533e-05, "loss": 0.6869, "mean_token_accuracy": 0.8009019315242767, "num_tokens": 361326777.0, "step": 34830 }, { "entropy": 0.6660020828247071, "epoch": 0.27872, "grad_norm": 2.754594564437866, "learning_rate": 3.6078831532613046e-05, "loss": 0.6557, "mean_token_accuracy": 0.802704244852066, "num_tokens": 361453274.0, "step": 34840 }, { "entropy": 0.6864256739616394, "epoch": 0.2788, "grad_norm": 5.44666051864624, "learning_rate": 3.607482993197279e-05, "loss": 0.6944, "mean_token_accuracy": 0.8169527649879456, "num_tokens": 361489637.0, "step": 34850 }, { "entropy": 0.6587628185749054, "epoch": 0.27888, "grad_norm": 1.5354069471359253, "learning_rate": 3.607082833133253e-05, "loss": 0.6574, "mean_token_accuracy": 0.791035670042038, "num_tokens": 361653477.0, "step": 34860 }, { "entropy": 0.7013162791728973, "epoch": 0.27896, "grad_norm": 4.929481506347656, "learning_rate": 3.606682673069228e-05, "loss": 0.6985, "mean_token_accuracy": 0.8016179323196411, "num_tokens": 361738693.0, "step": 34870 }, { "entropy": 0.7024687170982361, "epoch": 0.27904, "grad_norm": 1.837951898574829, "learning_rate": 3.606282513005203e-05, "loss": 0.7003, "mean_token_accuracy": 0.8015885531902314, "num_tokens": 361831438.0, "step": 34880 }, { "entropy": 0.6929817140102387, "epoch": 0.27912, "grad_norm": 2.470979928970337, "learning_rate": 3.6058823529411764e-05, "loss": 0.6872, "mean_token_accuracy": 0.7894944250583649, "num_tokens": 361977562.0, "step": 34890 }, { "entropy": 0.7037529617547988, "epoch": 0.2792, "grad_norm": 4.905517101287842, "learning_rate": 3.605482192877151e-05, "loss": 0.7051, "mean_token_accuracy": 0.8076752960681916, "num_tokens": 362018097.0, "step": 34900 }, { "entropy": 0.6773226141929627, "epoch": 0.27928, "grad_norm": 2.047638177871704, "learning_rate": 3.605082032813125e-05, "loss": 0.6823, "mean_token_accuracy": 0.7941637337207794, "num_tokens": 362181185.0, "step": 34910 }, { "entropy": 0.6894678622484207, "epoch": 0.27936, "grad_norm": 3.027834415435791, "learning_rate": 3.6046818727491e-05, "loss": 0.6727, "mean_token_accuracy": 0.8071701288223266, "num_tokens": 362262652.0, "step": 34920 }, { "entropy": 0.7291107833385467, "epoch": 0.27944, "grad_norm": 1.7135087251663208, "learning_rate": 3.604281712685074e-05, "loss": 0.7295, "mean_token_accuracy": 0.7934471368789673, "num_tokens": 362355746.0, "step": 34930 }, { "entropy": 0.6792014002799988, "epoch": 0.27952, "grad_norm": 2.5341079235076904, "learning_rate": 3.603881552621048e-05, "loss": 0.675, "mean_token_accuracy": 0.7938810765743256, "num_tokens": 362498905.0, "step": 34940 }, { "entropy": 0.7344201326370239, "epoch": 0.2796, "grad_norm": 4.859073638916016, "learning_rate": 3.603481392557023e-05, "loss": 0.7361, "mean_token_accuracy": 0.801488608121872, "num_tokens": 362542975.0, "step": 34950 }, { "entropy": 0.6702920854091644, "epoch": 0.27968, "grad_norm": 3.0555992126464844, "learning_rate": 3.603081232492998e-05, "loss": 0.6716, "mean_token_accuracy": 0.7899322211742401, "num_tokens": 362706543.0, "step": 34960 }, { "entropy": 0.7007283568382263, "epoch": 0.27976, "grad_norm": 3.3211636543273926, "learning_rate": 3.6026810724289714e-05, "loss": 0.696, "mean_token_accuracy": 0.8066449522972107, "num_tokens": 362779063.0, "step": 34970 }, { "entropy": 0.695000845193863, "epoch": 0.27984, "grad_norm": 1.2478805780410767, "learning_rate": 3.602280912364946e-05, "loss": 0.6988, "mean_token_accuracy": 0.7986537456512451, "num_tokens": 362872419.0, "step": 34980 }, { "entropy": 0.6896821618080139, "epoch": 0.27992, "grad_norm": 1.8680002689361572, "learning_rate": 3.601880752300921e-05, "loss": 0.6928, "mean_token_accuracy": 0.7926239490509033, "num_tokens": 363025274.0, "step": 34990 }, { "entropy": 0.7255462139844895, "epoch": 0.28, "grad_norm": 5.832179546356201, "learning_rate": 3.601480592236895e-05, "loss": 0.722, "mean_token_accuracy": 0.803101098537445, "num_tokens": 363072080.0, "step": 35000 }, { "entropy": 0.6221553146839142, "epoch": 0.28008, "grad_norm": 1.4758468866348267, "learning_rate": 3.601080432172869e-05, "loss": 0.6207, "mean_token_accuracy": 0.7998961925506591, "num_tokens": 363235920.0, "step": 35010 }, { "entropy": 0.6411400377750397, "epoch": 0.28016, "grad_norm": 2.5370218753814697, "learning_rate": 3.600680272108844e-05, "loss": 0.6288, "mean_token_accuracy": 0.81977499127388, "num_tokens": 363321067.0, "step": 35020 }, { "entropy": 0.7609111130237579, "epoch": 0.28024, "grad_norm": 2.5127670764923096, "learning_rate": 3.600280112044818e-05, "loss": 0.7624, "mean_token_accuracy": 0.7859686076641083, "num_tokens": 363415634.0, "step": 35030 }, { "entropy": 0.6784458637237549, "epoch": 0.28032, "grad_norm": 3.6137797832489014, "learning_rate": 3.599879951980793e-05, "loss": 0.6701, "mean_token_accuracy": 0.7987538933753967, "num_tokens": 363546845.0, "step": 35040 }, { "entropy": 0.6691643804311752, "epoch": 0.2804, "grad_norm": 4.597382545471191, "learning_rate": 3.5994797919167664e-05, "loss": 0.6579, "mean_token_accuracy": 0.8211012959480286, "num_tokens": 363588006.0, "step": 35050 }, { "entropy": 0.6880087614059448, "epoch": 0.28048, "grad_norm": 1.4523032903671265, "learning_rate": 3.5990796318527414e-05, "loss": 0.6939, "mean_token_accuracy": 0.7872862696647644, "num_tokens": 363751846.0, "step": 35060 }, { "entropy": 0.7290651738643646, "epoch": 0.28056, "grad_norm": 3.0334832668304443, "learning_rate": 3.598679471788716e-05, "loss": 0.7322, "mean_token_accuracy": 0.7935158967971802, "num_tokens": 363834900.0, "step": 35070 }, { "entropy": 0.6758025109767913, "epoch": 0.28064, "grad_norm": 2.17488694190979, "learning_rate": 3.59827931172469e-05, "loss": 0.6747, "mean_token_accuracy": 0.8071822822093964, "num_tokens": 363928050.0, "step": 35080 }, { "entropy": 0.66812105178833, "epoch": 0.28072, "grad_norm": 3.3869502544403076, "learning_rate": 3.5978791516606645e-05, "loss": 0.6605, "mean_token_accuracy": 0.7990141212940216, "num_tokens": 364060982.0, "step": 35090 }, { "entropy": 0.6914672076702117, "epoch": 0.2808, "grad_norm": 4.301660060882568, "learning_rate": 3.597478991596639e-05, "loss": 0.6883, "mean_token_accuracy": 0.8170712172985077, "num_tokens": 364096295.0, "step": 35100 }, { "entropy": 0.6419363677501678, "epoch": 0.28088, "grad_norm": 1.6989741325378418, "learning_rate": 3.597078831532613e-05, "loss": 0.6429, "mean_token_accuracy": 0.7946962773799896, "num_tokens": 364259590.0, "step": 35110 }, { "entropy": 0.67065469622612, "epoch": 0.28096, "grad_norm": 3.565885066986084, "learning_rate": 3.5966786714685876e-05, "loss": 0.665, "mean_token_accuracy": 0.8091191530227662, "num_tokens": 364337005.0, "step": 35120 }, { "entropy": 0.7367430090904236, "epoch": 0.28104, "grad_norm": 1.7261028289794922, "learning_rate": 3.596278511404562e-05, "loss": 0.7356, "mean_token_accuracy": 0.7908133029937744, "num_tokens": 364430152.0, "step": 35130 }, { "entropy": 0.6924340307712555, "epoch": 0.28112, "grad_norm": 2.755587100982666, "learning_rate": 3.5958783513405364e-05, "loss": 0.6881, "mean_token_accuracy": 0.7913350105285645, "num_tokens": 364564948.0, "step": 35140 }, { "entropy": 0.6903338670730591, "epoch": 0.2812, "grad_norm": 5.112057685852051, "learning_rate": 3.595478191276511e-05, "loss": 0.6973, "mean_token_accuracy": 0.8138068675994873, "num_tokens": 364606118.0, "step": 35150 }, { "entropy": 0.6706886351108551, "epoch": 0.28128, "grad_norm": 1.32657790184021, "learning_rate": 3.595078031212485e-05, "loss": 0.675, "mean_token_accuracy": 0.7904738664627076, "num_tokens": 364769958.0, "step": 35160 }, { "entropy": 0.718728244304657, "epoch": 0.28136, "grad_norm": 2.742422580718994, "learning_rate": 3.5946778711484595e-05, "loss": 0.7102, "mean_token_accuracy": 0.7971929907798767, "num_tokens": 364869052.0, "step": 35170 }, { "entropy": 0.7339357495307922, "epoch": 0.28144, "grad_norm": 1.5727043151855469, "learning_rate": 3.594277711084434e-05, "loss": 0.7349, "mean_token_accuracy": 0.7872964680194855, "num_tokens": 364963026.0, "step": 35180 }, { "entropy": 0.6802158534526825, "epoch": 0.28152, "grad_norm": 2.8155903816223145, "learning_rate": 3.593877551020408e-05, "loss": 0.682, "mean_token_accuracy": 0.7936185777187348, "num_tokens": 365093368.0, "step": 35190 }, { "entropy": 0.7791071176528931, "epoch": 0.2816, "grad_norm": 4.598688125610352, "learning_rate": 3.5934773909563826e-05, "loss": 0.7751, "mean_token_accuracy": 0.8035103380680084, "num_tokens": 365127698.0, "step": 35200 }, { "entropy": 0.6391238659620285, "epoch": 0.28168, "grad_norm": 1.519743800163269, "learning_rate": 3.593077230892357e-05, "loss": 0.638, "mean_token_accuracy": 0.7981741547584533, "num_tokens": 365291538.0, "step": 35210 }, { "entropy": 0.6735705524682999, "epoch": 0.28176, "grad_norm": 3.045814275741577, "learning_rate": 3.5926770708283314e-05, "loss": 0.6653, "mean_token_accuracy": 0.8087550640106201, "num_tokens": 365375036.0, "step": 35220 }, { "entropy": 0.7149687051773072, "epoch": 0.28184, "grad_norm": 2.5868427753448486, "learning_rate": 3.5922769107643064e-05, "loss": 0.7119, "mean_token_accuracy": 0.7942354738712311, "num_tokens": 365471053.0, "step": 35230 }, { "entropy": 0.6746323525905609, "epoch": 0.28192, "grad_norm": 3.6276514530181885, "learning_rate": 3.59187675070028e-05, "loss": 0.676, "mean_token_accuracy": 0.7925377547740936, "num_tokens": 365616970.0, "step": 35240 }, { "entropy": 0.6773490458726883, "epoch": 0.282, "grad_norm": 4.450044631958008, "learning_rate": 3.5914765906362545e-05, "loss": 0.6802, "mean_token_accuracy": 0.8132771849632263, "num_tokens": 365656330.0, "step": 35250 }, { "entropy": 0.6997089505195617, "epoch": 0.28208, "grad_norm": 1.9628219604492188, "learning_rate": 3.591076430572229e-05, "loss": 0.7009, "mean_token_accuracy": 0.7819186687469483, "num_tokens": 365820170.0, "step": 35260 }, { "entropy": 0.6971266627311706, "epoch": 0.28216, "grad_norm": 2.9075584411621094, "learning_rate": 3.590676270508204e-05, "loss": 0.6829, "mean_token_accuracy": 0.8016007959842681, "num_tokens": 365926862.0, "step": 35270 }, { "entropy": 0.6847038388252258, "epoch": 0.28224, "grad_norm": 2.527996778488159, "learning_rate": 3.5902761104441776e-05, "loss": 0.6885, "mean_token_accuracy": 0.8001006364822387, "num_tokens": 366023394.0, "step": 35280 }, { "entropy": 0.7516319751739502, "epoch": 0.28232, "grad_norm": 2.0208864212036133, "learning_rate": 3.589875950380152e-05, "loss": 0.7496, "mean_token_accuracy": 0.7697063267230988, "num_tokens": 366173879.0, "step": 35290 }, { "entropy": 0.6045530706644058, "epoch": 0.2824, "grad_norm": 4.52311897277832, "learning_rate": 3.589475790316126e-05, "loss": 0.6065, "mean_token_accuracy": 0.8318007290363312, "num_tokens": 366217466.0, "step": 35300 }, { "entropy": 0.6243908166885376, "epoch": 0.28248, "grad_norm": 1.386630892753601, "learning_rate": 3.5890756302521014e-05, "loss": 0.6233, "mean_token_accuracy": 0.7990046501159668, "num_tokens": 366381306.0, "step": 35310 }, { "entropy": 0.6183651894330978, "epoch": 0.28256, "grad_norm": 2.7811670303344727, "learning_rate": 3.588675470188075e-05, "loss": 0.6013, "mean_token_accuracy": 0.8195754528045655, "num_tokens": 366476611.0, "step": 35320 }, { "entropy": 0.6035670399665832, "epoch": 0.28264, "grad_norm": 1.992924451828003, "learning_rate": 3.5882753101240494e-05, "loss": 0.611, "mean_token_accuracy": 0.8170474708080292, "num_tokens": 366571106.0, "step": 35330 }, { "entropy": 0.6751166999340057, "epoch": 0.28272, "grad_norm": 2.3213002681732178, "learning_rate": 3.5878751500600245e-05, "loss": 0.6696, "mean_token_accuracy": 0.7938340246677399, "num_tokens": 366715965.0, "step": 35340 }, { "entropy": 0.6588767945766449, "epoch": 0.2828, "grad_norm": 5.098400592803955, "learning_rate": 3.587474989995999e-05, "loss": 0.6513, "mean_token_accuracy": 0.8245533287525177, "num_tokens": 366755810.0, "step": 35350 }, { "entropy": 0.6934363067150116, "epoch": 0.28288, "grad_norm": 2.0883920192718506, "learning_rate": 3.5870748299319726e-05, "loss": 0.688, "mean_token_accuracy": 0.78727405667305, "num_tokens": 366919650.0, "step": 35360 }, { "entropy": 0.6675321161746979, "epoch": 0.28296, "grad_norm": 3.1605167388916016, "learning_rate": 3.586674669867947e-05, "loss": 0.6723, "mean_token_accuracy": 0.8050067722797394, "num_tokens": 367009311.0, "step": 35370 }, { "entropy": 0.680218243598938, "epoch": 0.28304, "grad_norm": 1.6062928438186646, "learning_rate": 3.586274509803922e-05, "loss": 0.6753, "mean_token_accuracy": 0.8038256347179413, "num_tokens": 367104257.0, "step": 35380 }, { "entropy": 0.7134048938751221, "epoch": 0.28312, "grad_norm": 2.283043384552002, "learning_rate": 3.5858743497398963e-05, "loss": 0.7112, "mean_token_accuracy": 0.7876749157905578, "num_tokens": 367240841.0, "step": 35390 }, { "entropy": 0.705756026506424, "epoch": 0.2832, "grad_norm": 5.069129467010498, "learning_rate": 3.58547418967587e-05, "loss": 0.7021, "mean_token_accuracy": 0.8137281954288482, "num_tokens": 367283963.0, "step": 35400 }, { "entropy": 0.6686882317066193, "epoch": 0.28328, "grad_norm": 1.6876978874206543, "learning_rate": 3.585074029611845e-05, "loss": 0.6728, "mean_token_accuracy": 0.7913348734378814, "num_tokens": 367447803.0, "step": 35410 }, { "entropy": 0.5882737576961518, "epoch": 0.28336, "grad_norm": 2.641145944595337, "learning_rate": 3.5846738695478195e-05, "loss": 0.5652, "mean_token_accuracy": 0.830255800485611, "num_tokens": 367535997.0, "step": 35420 }, { "entropy": 0.7020791888236999, "epoch": 0.28344, "grad_norm": 1.8709136247634888, "learning_rate": 3.584273709483794e-05, "loss": 0.708, "mean_token_accuracy": 0.7982983112335205, "num_tokens": 367630277.0, "step": 35430 }, { "entropy": 0.7169642627239228, "epoch": 0.28352, "grad_norm": 2.7512762546539307, "learning_rate": 3.5838735494197675e-05, "loss": 0.7206, "mean_token_accuracy": 0.7824209749698638, "num_tokens": 367768286.0, "step": 35440 }, { "entropy": 0.6482817351818084, "epoch": 0.2836, "grad_norm": 4.510401725769043, "learning_rate": 3.5834733893557426e-05, "loss": 0.6428, "mean_token_accuracy": 0.8248153507709504, "num_tokens": 367810557.0, "step": 35450 }, { "entropy": 0.6896522998809814, "epoch": 0.28368, "grad_norm": 1.7516998052597046, "learning_rate": 3.583073229291717e-05, "loss": 0.6841, "mean_token_accuracy": 0.786872124671936, "num_tokens": 367973918.0, "step": 35460 }, { "entropy": 0.6695570230484009, "epoch": 0.28376, "grad_norm": 3.817641019821167, "learning_rate": 3.582673069227691e-05, "loss": 0.6665, "mean_token_accuracy": 0.8102793455123901, "num_tokens": 368052746.0, "step": 35470 }, { "entropy": 0.7329314589500427, "epoch": 0.28384, "grad_norm": 1.5538108348846436, "learning_rate": 3.582272909163666e-05, "loss": 0.7528, "mean_token_accuracy": 0.7843957364559173, "num_tokens": 368145336.0, "step": 35480 }, { "entropy": 0.6910847842693328, "epoch": 0.28392, "grad_norm": 1.9982093572616577, "learning_rate": 3.58187274909964e-05, "loss": 0.6804, "mean_token_accuracy": 0.7880272746086121, "num_tokens": 368295672.0, "step": 35490 }, { "entropy": 0.7250650137662887, "epoch": 0.284, "grad_norm": 5.694874286651611, "learning_rate": 3.5814725890356144e-05, "loss": 0.711, "mean_token_accuracy": 0.8108340620994567, "num_tokens": 368338354.0, "step": 35500 }, { "entropy": 0.6586406111717225, "epoch": 0.28408, "grad_norm": 1.9552395343780518, "learning_rate": 3.581072428971589e-05, "loss": 0.6637, "mean_token_accuracy": 0.7890815913677216, "num_tokens": 368502194.0, "step": 35510 }, { "entropy": 0.6863220393657684, "epoch": 0.28416, "grad_norm": 2.8901782035827637, "learning_rate": 3.580672268907563e-05, "loss": 0.6626, "mean_token_accuracy": 0.8041879177093506, "num_tokens": 368584332.0, "step": 35520 }, { "entropy": 0.6966505646705627, "epoch": 0.28424, "grad_norm": 1.6950081586837769, "learning_rate": 3.5802721088435375e-05, "loss": 0.7016, "mean_token_accuracy": 0.8021006226539612, "num_tokens": 368678303.0, "step": 35530 }, { "entropy": 0.6353561341762543, "epoch": 0.28432, "grad_norm": 3.1700944900512695, "learning_rate": 3.579871948779512e-05, "loss": 0.6343, "mean_token_accuracy": 0.8042982578277588, "num_tokens": 368813826.0, "step": 35540 }, { "entropy": 0.7066162765026093, "epoch": 0.2844, "grad_norm": 5.070940017700195, "learning_rate": 3.579471788715486e-05, "loss": 0.6957, "mean_token_accuracy": 0.8103427767753602, "num_tokens": 368847994.0, "step": 35550 }, { "entropy": 0.7066129148006439, "epoch": 0.28448, "grad_norm": 2.8071742057800293, "learning_rate": 3.579071628651461e-05, "loss": 0.7039, "mean_token_accuracy": 0.7836259067058563, "num_tokens": 369011131.0, "step": 35560 }, { "entropy": 0.5964990735054017, "epoch": 0.28456, "grad_norm": 3.4480462074279785, "learning_rate": 3.578671468587435e-05, "loss": 0.5877, "mean_token_accuracy": 0.8258561670780182, "num_tokens": 369089786.0, "step": 35570 }, { "entropy": 0.7299088716506958, "epoch": 0.28464, "grad_norm": 1.773677945137024, "learning_rate": 3.5782713085234094e-05, "loss": 0.7383, "mean_token_accuracy": 0.7949960589408874, "num_tokens": 369181814.0, "step": 35580 }, { "entropy": 0.7058695554733276, "epoch": 0.28472, "grad_norm": 2.6813526153564453, "learning_rate": 3.577871148459384e-05, "loss": 0.7064, "mean_token_accuracy": 0.7881397306919098, "num_tokens": 369317050.0, "step": 35590 }, { "entropy": 0.7569279372692108, "epoch": 0.2848, "grad_norm": 4.457726001739502, "learning_rate": 3.577470988395358e-05, "loss": 0.7385, "mean_token_accuracy": 0.805418598651886, "num_tokens": 369354275.0, "step": 35600 }, { "entropy": 0.6896574199199677, "epoch": 0.28488, "grad_norm": 2.367922306060791, "learning_rate": 3.5770708283313325e-05, "loss": 0.6955, "mean_token_accuracy": 0.7886297047138214, "num_tokens": 369518115.0, "step": 35610 }, { "entropy": 0.6886617839336395, "epoch": 0.28496, "grad_norm": 3.3384177684783936, "learning_rate": 3.5766706682673076e-05, "loss": 0.6805, "mean_token_accuracy": 0.8085120260715485, "num_tokens": 369597973.0, "step": 35620 }, { "entropy": 0.7109158962965012, "epoch": 0.28504, "grad_norm": 2.514406442642212, "learning_rate": 3.576270508203281e-05, "loss": 0.7128, "mean_token_accuracy": 0.797601068019867, "num_tokens": 369693136.0, "step": 35630 }, { "entropy": 0.6603644609451294, "epoch": 0.28512, "grad_norm": 2.7919132709503174, "learning_rate": 3.5758703481392556e-05, "loss": 0.6548, "mean_token_accuracy": 0.7988789081573486, "num_tokens": 369825903.0, "step": 35640 }, { "entropy": 0.7308733642101288, "epoch": 0.2852, "grad_norm": 5.308852195739746, "learning_rate": 3.57547018807523e-05, "loss": 0.7411, "mean_token_accuracy": 0.8021253168582916, "num_tokens": 369862004.0, "step": 35650 }, { "entropy": 0.6027698367834091, "epoch": 0.28528, "grad_norm": 1.985392451286316, "learning_rate": 3.575070028011205e-05, "loss": 0.6008, "mean_token_accuracy": 0.8091674447059631, "num_tokens": 370025776.0, "step": 35660 }, { "entropy": 0.6536019146442413, "epoch": 0.28536, "grad_norm": 5.014222145080566, "learning_rate": 3.574669867947179e-05, "loss": 0.6436, "mean_token_accuracy": 0.809870046377182, "num_tokens": 370111791.0, "step": 35670 }, { "entropy": 0.7391193687915802, "epoch": 0.28544, "grad_norm": 2.178619384765625, "learning_rate": 3.574269707883153e-05, "loss": 0.7366, "mean_token_accuracy": 0.7875950753688812, "num_tokens": 370206693.0, "step": 35680 }, { "entropy": 0.7219445407390594, "epoch": 0.28552, "grad_norm": 1.9687036275863647, "learning_rate": 3.573869547819128e-05, "loss": 0.7143, "mean_token_accuracy": 0.7779560923576355, "num_tokens": 370356192.0, "step": 35690 }, { "entropy": 0.6708629697561264, "epoch": 0.2856, "grad_norm": 4.575404167175293, "learning_rate": 3.5734693877551025e-05, "loss": 0.6735, "mean_token_accuracy": 0.8140276610851288, "num_tokens": 370406228.0, "step": 35700 }, { "entropy": 0.6387245059013367, "epoch": 0.28568, "grad_norm": 1.9402509927749634, "learning_rate": 3.573069227691076e-05, "loss": 0.638, "mean_token_accuracy": 0.7990657091140747, "num_tokens": 370570068.0, "step": 35710 }, { "entropy": 0.7115154147148133, "epoch": 0.28576, "grad_norm": 3.0701284408569336, "learning_rate": 3.5726690676270506e-05, "loss": 0.7046, "mean_token_accuracy": 0.7991094589233398, "num_tokens": 370655359.0, "step": 35720 }, { "entropy": 0.687364649772644, "epoch": 0.28584, "grad_norm": 1.952388882637024, "learning_rate": 3.5722689075630257e-05, "loss": 0.6961, "mean_token_accuracy": 0.7979823172092437, "num_tokens": 370751202.0, "step": 35730 }, { "entropy": 0.6556975662708282, "epoch": 0.28592, "grad_norm": 2.7951672077178955, "learning_rate": 3.571868747499e-05, "loss": 0.6391, "mean_token_accuracy": 0.8045458495616913, "num_tokens": 370875868.0, "step": 35740 }, { "entropy": 0.743638014793396, "epoch": 0.286, "grad_norm": 4.955875396728516, "learning_rate": 3.571468587434974e-05, "loss": 0.7433, "mean_token_accuracy": 0.8042830765247345, "num_tokens": 370910328.0, "step": 35750 }, { "entropy": 0.6682747900485992, "epoch": 0.28608, "grad_norm": 1.6336430311203003, "learning_rate": 3.571068427370949e-05, "loss": 0.672, "mean_token_accuracy": 0.7885025680065155, "num_tokens": 371072230.0, "step": 35760 }, { "entropy": 0.5358928233385086, "epoch": 0.28616, "grad_norm": 3.130784273147583, "learning_rate": 3.570668267306923e-05, "loss": 0.5182, "mean_token_accuracy": 0.8436139404773713, "num_tokens": 371143688.0, "step": 35770 }, { "entropy": 0.7199786722660064, "epoch": 0.28624, "grad_norm": 1.7537919282913208, "learning_rate": 3.5702681072428975e-05, "loss": 0.7296, "mean_token_accuracy": 0.792664647102356, "num_tokens": 371238320.0, "step": 35780 }, { "entropy": 0.6492435008287429, "epoch": 0.28632, "grad_norm": 3.0022222995758057, "learning_rate": 3.569867947178871e-05, "loss": 0.6468, "mean_token_accuracy": 0.8004576206207276, "num_tokens": 371368489.0, "step": 35790 }, { "entropy": 0.6955648183822631, "epoch": 0.2864, "grad_norm": 3.71942138671875, "learning_rate": 3.569467787114846e-05, "loss": 0.6956, "mean_token_accuracy": 0.8153061211109162, "num_tokens": 371404905.0, "step": 35800 }, { "entropy": 0.6337154924869537, "epoch": 0.28648, "grad_norm": 1.8255914449691772, "learning_rate": 3.5690676270508206e-05, "loss": 0.6283, "mean_token_accuracy": 0.8012866318225861, "num_tokens": 371568716.0, "step": 35810 }, { "entropy": 0.6047454565763474, "epoch": 0.28656, "grad_norm": 3.4558310508728027, "learning_rate": 3.568667466986795e-05, "loss": 0.6042, "mean_token_accuracy": 0.8242192983627319, "num_tokens": 371649818.0, "step": 35820 }, { "entropy": 0.675779390335083, "epoch": 0.28664, "grad_norm": 1.8848927021026611, "learning_rate": 3.568267306922769e-05, "loss": 0.6657, "mean_token_accuracy": 0.8070303499698639, "num_tokens": 371744293.0, "step": 35830 }, { "entropy": 0.7267378509044647, "epoch": 0.28672, "grad_norm": 2.730228900909424, "learning_rate": 3.567867146858744e-05, "loss": 0.7308, "mean_token_accuracy": 0.7798525273799897, "num_tokens": 371886905.0, "step": 35840 }, { "entropy": 0.7369227319955826, "epoch": 0.2868, "grad_norm": 6.871865749359131, "learning_rate": 3.567466986794718e-05, "loss": 0.7255, "mean_token_accuracy": 0.8091054558753967, "num_tokens": 371925823.0, "step": 35850 }, { "entropy": 0.6752192437648773, "epoch": 0.28688, "grad_norm": 1.6258751153945923, "learning_rate": 3.5670668267306925e-05, "loss": 0.6788, "mean_token_accuracy": 0.7864741206169128, "num_tokens": 372089663.0, "step": 35860 }, { "entropy": 0.7107825070619583, "epoch": 0.28696, "grad_norm": 3.5986227989196777, "learning_rate": 3.566666666666667e-05, "loss": 0.6983, "mean_token_accuracy": 0.7974171936511993, "num_tokens": 372179086.0, "step": 35870 }, { "entropy": 0.6914624452590943, "epoch": 0.28704, "grad_norm": 1.836802363395691, "learning_rate": 3.566266506602641e-05, "loss": 0.6849, "mean_token_accuracy": 0.7992295622825623, "num_tokens": 372272485.0, "step": 35880 }, { "entropy": 0.6949404716491699, "epoch": 0.28712, "grad_norm": 2.0830941200256348, "learning_rate": 3.5658663465386156e-05, "loss": 0.6996, "mean_token_accuracy": 0.7925841808319092, "num_tokens": 372400529.0, "step": 35890 }, { "entropy": 0.7126184940338135, "epoch": 0.2872, "grad_norm": 4.588127136230469, "learning_rate": 3.56546618647459e-05, "loss": 0.7073, "mean_token_accuracy": 0.806968504190445, "num_tokens": 372438245.0, "step": 35900 }, { "entropy": 0.7269179463386536, "epoch": 0.28728, "grad_norm": 1.7901487350463867, "learning_rate": 3.565066026410564e-05, "loss": 0.7254, "mean_token_accuracy": 0.7762498557567596, "num_tokens": 372601129.0, "step": 35910 }, { "entropy": 0.7098449051380158, "epoch": 0.28736, "grad_norm": 5.330254077911377, "learning_rate": 3.564665866346539e-05, "loss": 0.7101, "mean_token_accuracy": 0.7981628060340882, "num_tokens": 372681473.0, "step": 35920 }, { "entropy": 0.756537652015686, "epoch": 0.28744, "grad_norm": 2.1503026485443115, "learning_rate": 3.564265706282513e-05, "loss": 0.7577, "mean_token_accuracy": 0.7864811003208161, "num_tokens": 372774854.0, "step": 35930 }, { "entropy": 0.6954641878604889, "epoch": 0.28752, "grad_norm": 3.6229214668273926, "learning_rate": 3.5638655462184875e-05, "loss": 0.696, "mean_token_accuracy": 0.7920300781726837, "num_tokens": 372907889.0, "step": 35940 }, { "entropy": 0.7040481090545654, "epoch": 0.2876, "grad_norm": 4.612942218780518, "learning_rate": 3.563465386154462e-05, "loss": 0.6845, "mean_token_accuracy": 0.8138590276241302, "num_tokens": 372948575.0, "step": 35950 }, { "entropy": 0.6498461663722992, "epoch": 0.28768, "grad_norm": 1.9015532732009888, "learning_rate": 3.563065226090436e-05, "loss": 0.6482, "mean_token_accuracy": 0.792659991979599, "num_tokens": 373112415.0, "step": 35960 }, { "entropy": 0.719274514913559, "epoch": 0.28776, "grad_norm": 3.208639621734619, "learning_rate": 3.5626650660264106e-05, "loss": 0.7117, "mean_token_accuracy": 0.7969782233238221, "num_tokens": 373201591.0, "step": 35970 }, { "entropy": 0.6546317934989929, "epoch": 0.28784, "grad_norm": 2.1026570796966553, "learning_rate": 3.562264905962385e-05, "loss": 0.6639, "mean_token_accuracy": 0.8096394777297974, "num_tokens": 373294942.0, "step": 35980 }, { "entropy": 0.6461835205554962, "epoch": 0.28792, "grad_norm": 2.154398202896118, "learning_rate": 3.561864745898359e-05, "loss": 0.6392, "mean_token_accuracy": 0.7991821646690369, "num_tokens": 373444355.0, "step": 35990 }, { "entropy": 0.7218305766582489, "epoch": 0.288, "grad_norm": 5.136142730712891, "learning_rate": 3.561464585834334e-05, "loss": 0.725, "mean_token_accuracy": 0.8089985966682434, "num_tokens": 373485362.0, "step": 36000 }, { "entropy": 0.6946505308151245, "epoch": 0.28808, "grad_norm": 1.8170242309570312, "learning_rate": 3.561064425770309e-05, "loss": 0.6901, "mean_token_accuracy": 0.786358094215393, "num_tokens": 373649202.0, "step": 36010 }, { "entropy": 0.7478019952774048, "epoch": 0.28816, "grad_norm": 4.090123176574707, "learning_rate": 3.5606642657062824e-05, "loss": 0.7386, "mean_token_accuracy": 0.7959008693695069, "num_tokens": 373732884.0, "step": 36020 }, { "entropy": 0.6954157173633575, "epoch": 0.28824, "grad_norm": 1.5194168090820312, "learning_rate": 3.560264105642257e-05, "loss": 0.6952, "mean_token_accuracy": 0.7993364870548249, "num_tokens": 373826719.0, "step": 36030 }, { "entropy": 0.6657898366451264, "epoch": 0.28832, "grad_norm": 2.1374309062957764, "learning_rate": 3.559863945578231e-05, "loss": 0.6662, "mean_token_accuracy": 0.7966009676456451, "num_tokens": 373974708.0, "step": 36040 }, { "entropy": 0.7071797758340835, "epoch": 0.2884, "grad_norm": 5.158850193023682, "learning_rate": 3.559463785514206e-05, "loss": 0.686, "mean_token_accuracy": 0.8115092277526855, "num_tokens": 374014569.0, "step": 36050 }, { "entropy": 0.6401275634765625, "epoch": 0.28848, "grad_norm": 1.7470004558563232, "learning_rate": 3.55906362545018e-05, "loss": 0.6394, "mean_token_accuracy": 0.7966475367546082, "num_tokens": 374178409.0, "step": 36060 }, { "entropy": 0.6895491659641266, "epoch": 0.28856, "grad_norm": 3.0927326679229736, "learning_rate": 3.558663465386154e-05, "loss": 0.6929, "mean_token_accuracy": 0.8038084506988525, "num_tokens": 374265863.0, "step": 36070 }, { "entropy": 0.70427685379982, "epoch": 0.28864, "grad_norm": 2.6375129222869873, "learning_rate": 3.558263305322129e-05, "loss": 0.6954, "mean_token_accuracy": 0.7982909679412842, "num_tokens": 374360817.0, "step": 36080 }, { "entropy": 0.7039732694625854, "epoch": 0.28872, "grad_norm": 2.1509780883789062, "learning_rate": 3.557863145258104e-05, "loss": 0.6965, "mean_token_accuracy": 0.7891928136348725, "num_tokens": 374500895.0, "step": 36090 }, { "entropy": 0.6276291191577912, "epoch": 0.2888, "grad_norm": 5.282353401184082, "learning_rate": 3.5574629851940774e-05, "loss": 0.6327, "mean_token_accuracy": 0.8283812046051026, "num_tokens": 374540075.0, "step": 36100 }, { "entropy": 0.6404716730117798, "epoch": 0.28888, "grad_norm": 1.971622109413147, "learning_rate": 3.557062825130052e-05, "loss": 0.6474, "mean_token_accuracy": 0.794607961177826, "num_tokens": 374703915.0, "step": 36110 }, { "entropy": 0.6942058324813842, "epoch": 0.28896, "grad_norm": 3.1197571754455566, "learning_rate": 3.556662665066027e-05, "loss": 0.6805, "mean_token_accuracy": 0.8066939890384675, "num_tokens": 374777011.0, "step": 36120 }, { "entropy": 0.6399404406547546, "epoch": 0.28904, "grad_norm": 1.7476311922073364, "learning_rate": 3.556262505002001e-05, "loss": 0.6372, "mean_token_accuracy": 0.8186839103698731, "num_tokens": 374869664.0, "step": 36130 }, { "entropy": 0.716188645362854, "epoch": 0.28912, "grad_norm": 3.2550222873687744, "learning_rate": 3.555862344937975e-05, "loss": 0.7164, "mean_token_accuracy": 0.7824483215808868, "num_tokens": 375007621.0, "step": 36140 }, { "entropy": 0.7832793474197388, "epoch": 0.2892, "grad_norm": 5.468112468719482, "learning_rate": 3.55546218487395e-05, "loss": 0.7769, "mean_token_accuracy": 0.7955198228359223, "num_tokens": 375047008.0, "step": 36150 }, { "entropy": 0.655264413356781, "epoch": 0.28928, "grad_norm": 2.446632146835327, "learning_rate": 3.555062024809924e-05, "loss": 0.6551, "mean_token_accuracy": 0.7937680244445801, "num_tokens": 375210250.0, "step": 36160 }, { "entropy": 0.7291269838809967, "epoch": 0.28936, "grad_norm": 3.5504648685455322, "learning_rate": 3.554661864745899e-05, "loss": 0.727, "mean_token_accuracy": 0.7958620846271515, "num_tokens": 375294645.0, "step": 36170 }, { "entropy": 0.7013085424900055, "epoch": 0.28944, "grad_norm": 1.5677372217178345, "learning_rate": 3.5542617046818724e-05, "loss": 0.6954, "mean_token_accuracy": 0.8049417436122894, "num_tokens": 375388455.0, "step": 36180 }, { "entropy": 0.7406546294689178, "epoch": 0.28952, "grad_norm": 2.3680715560913086, "learning_rate": 3.5538615446178474e-05, "loss": 0.739, "mean_token_accuracy": 0.7767703354358673, "num_tokens": 375534336.0, "step": 36190 }, { "entropy": 0.5984640538692474, "epoch": 0.2896, "grad_norm": 7.115092754364014, "learning_rate": 3.553461384553822e-05, "loss": 0.5919, "mean_token_accuracy": 0.8378125011920929, "num_tokens": 375578208.0, "step": 36200 }, { "entropy": 0.6894302666187286, "epoch": 0.28968, "grad_norm": 2.0732648372650146, "learning_rate": 3.553061224489796e-05, "loss": 0.6948, "mean_token_accuracy": 0.7896006345748902, "num_tokens": 375742048.0, "step": 36210 }, { "entropy": 0.6527284324169159, "epoch": 0.28976, "grad_norm": 3.0159828662872314, "learning_rate": 3.5526610644257705e-05, "loss": 0.6314, "mean_token_accuracy": 0.8162875354290009, "num_tokens": 375831313.0, "step": 36220 }, { "entropy": 0.671146422624588, "epoch": 0.28984, "grad_norm": 2.254509449005127, "learning_rate": 3.552260904361745e-05, "loss": 0.6745, "mean_token_accuracy": 0.8027320325374603, "num_tokens": 375925474.0, "step": 36230 }, { "entropy": 0.6775806725025177, "epoch": 0.28992, "grad_norm": 3.508948564529419, "learning_rate": 3.551860744297719e-05, "loss": 0.673, "mean_token_accuracy": 0.7956272423267364, "num_tokens": 376062857.0, "step": 36240 }, { "entropy": 0.6829256474971771, "epoch": 0.29, "grad_norm": 4.273438930511475, "learning_rate": 3.5514605842336936e-05, "loss": 0.67, "mean_token_accuracy": 0.8204146325588226, "num_tokens": 376099163.0, "step": 36250 }, { "entropy": 0.6826465129852295, "epoch": 0.29008, "grad_norm": 2.221810817718506, "learning_rate": 3.551060424169668e-05, "loss": 0.6867, "mean_token_accuracy": 0.7867489159107208, "num_tokens": 376263003.0, "step": 36260 }, { "entropy": 0.6476034879684448, "epoch": 0.29016, "grad_norm": 3.29852032661438, "learning_rate": 3.5506602641056424e-05, "loss": 0.6438, "mean_token_accuracy": 0.8152552843093872, "num_tokens": 376339564.0, "step": 36270 }, { "entropy": 0.7004345417022705, "epoch": 0.29024, "grad_norm": 1.6585980653762817, "learning_rate": 3.550260104041617e-05, "loss": 0.699, "mean_token_accuracy": 0.7996417224407196, "num_tokens": 376431063.0, "step": 36280 }, { "entropy": 0.6798316776752472, "epoch": 0.29032, "grad_norm": 2.4280948638916016, "learning_rate": 3.549859943977592e-05, "loss": 0.6774, "mean_token_accuracy": 0.7928250432014465, "num_tokens": 376577642.0, "step": 36290 }, { "entropy": 0.6674289345741272, "epoch": 0.2904, "grad_norm": 5.581754207611084, "learning_rate": 3.5494597839135655e-05, "loss": 0.6652, "mean_token_accuracy": 0.8187413275241852, "num_tokens": 376620714.0, "step": 36300 }, { "entropy": 0.6190842002630234, "epoch": 0.29048, "grad_norm": 1.949578046798706, "learning_rate": 3.54905962384954e-05, "loss": 0.6179, "mean_token_accuracy": 0.8012640416622162, "num_tokens": 376784554.0, "step": 36310 }, { "entropy": 0.7333730250597, "epoch": 0.29056, "grad_norm": 3.042177677154541, "learning_rate": 3.548659463785514e-05, "loss": 0.735, "mean_token_accuracy": 0.7928633630275727, "num_tokens": 376869736.0, "step": 36320 }, { "entropy": 0.7145168125629425, "epoch": 0.29064, "grad_norm": 1.381277322769165, "learning_rate": 3.548259303721489e-05, "loss": 0.7137, "mean_token_accuracy": 0.7998096346855164, "num_tokens": 376962861.0, "step": 36330 }, { "entropy": 0.710681802034378, "epoch": 0.29072, "grad_norm": 3.2918636798858643, "learning_rate": 3.547859143657463e-05, "loss": 0.7024, "mean_token_accuracy": 0.7903021931648254, "num_tokens": 377092413.0, "step": 36340 }, { "entropy": 0.634982430934906, "epoch": 0.2908, "grad_norm": 4.152463912963867, "learning_rate": 3.5474589835934374e-05, "loss": 0.6252, "mean_token_accuracy": 0.8283346593379974, "num_tokens": 377127306.0, "step": 36350 }, { "entropy": 0.6896333158016205, "epoch": 0.29088, "grad_norm": 1.881300449371338, "learning_rate": 3.5470588235294124e-05, "loss": 0.6915, "mean_token_accuracy": 0.784747737646103, "num_tokens": 377287893.0, "step": 36360 }, { "entropy": 0.6395633786916732, "epoch": 0.29096, "grad_norm": 3.702529191970825, "learning_rate": 3.546658663465387e-05, "loss": 0.6307, "mean_token_accuracy": 0.821854704618454, "num_tokens": 377354165.0, "step": 36370 }, { "entropy": 0.7018932044506073, "epoch": 0.29104, "grad_norm": 2.0003440380096436, "learning_rate": 3.5462585034013605e-05, "loss": 0.6976, "mean_token_accuracy": 0.7981386125087738, "num_tokens": 377447462.0, "step": 36380 }, { "entropy": 0.6750319063663482, "epoch": 0.29112, "grad_norm": 2.323162794113159, "learning_rate": 3.545858343337335e-05, "loss": 0.6692, "mean_token_accuracy": 0.7922355353832244, "num_tokens": 377592432.0, "step": 36390 }, { "entropy": 0.673187130689621, "epoch": 0.2912, "grad_norm": 5.053718090057373, "learning_rate": 3.54545818327331e-05, "loss": 0.6799, "mean_token_accuracy": 0.8211903393268585, "num_tokens": 377634845.0, "step": 36400 }, { "entropy": 0.6277360618114471, "epoch": 0.29128, "grad_norm": 1.6385624408721924, "learning_rate": 3.545058023209284e-05, "loss": 0.6251, "mean_token_accuracy": 0.798088663816452, "num_tokens": 377798685.0, "step": 36410 }, { "entropy": 0.61540607213974, "epoch": 0.29136, "grad_norm": 3.1553447246551514, "learning_rate": 3.544657863145258e-05, "loss": 0.6091, "mean_token_accuracy": 0.818941056728363, "num_tokens": 377887021.0, "step": 36420 }, { "entropy": 0.6688518583774566, "epoch": 0.29144, "grad_norm": 2.1541976928710938, "learning_rate": 3.544257703081232e-05, "loss": 0.6674, "mean_token_accuracy": 0.8046567618846894, "num_tokens": 377982352.0, "step": 36430 }, { "entropy": 0.7268306463956833, "epoch": 0.29152, "grad_norm": 3.783360481262207, "learning_rate": 3.5438575430172074e-05, "loss": 0.722, "mean_token_accuracy": 0.785972636938095, "num_tokens": 378111649.0, "step": 36440 }, { "entropy": 0.6786951899528504, "epoch": 0.2916, "grad_norm": 4.453583717346191, "learning_rate": 3.543457382953182e-05, "loss": 0.6904, "mean_token_accuracy": 0.8182362496852875, "num_tokens": 378146367.0, "step": 36450 }, { "entropy": 0.6361218214035034, "epoch": 0.29168, "grad_norm": 2.2053468227386475, "learning_rate": 3.5430572228891554e-05, "loss": 0.635, "mean_token_accuracy": 0.7959880352020263, "num_tokens": 378310207.0, "step": 36460 }, { "entropy": 0.6180319488048553, "epoch": 0.29176, "grad_norm": 3.4380202293395996, "learning_rate": 3.5426570628251305e-05, "loss": 0.6088, "mean_token_accuracy": 0.8243021130561828, "num_tokens": 378384989.0, "step": 36470 }, { "entropy": 0.7613404095172882, "epoch": 0.29184, "grad_norm": 1.5674620866775513, "learning_rate": 3.542256902761105e-05, "loss": 0.7594, "mean_token_accuracy": 0.7885156691074371, "num_tokens": 378475617.0, "step": 36480 }, { "entropy": 0.6818004548549652, "epoch": 0.29192, "grad_norm": 3.4135994911193848, "learning_rate": 3.541856742697079e-05, "loss": 0.6793, "mean_token_accuracy": 0.791047215461731, "num_tokens": 378611062.0, "step": 36490 }, { "entropy": 0.6435872703790665, "epoch": 0.292, "grad_norm": 4.301786422729492, "learning_rate": 3.541456582633053e-05, "loss": 0.6414, "mean_token_accuracy": 0.8265539407730103, "num_tokens": 378651916.0, "step": 36500 }, { "entropy": 0.628289645910263, "epoch": 0.29208, "grad_norm": 1.6338090896606445, "learning_rate": 3.541056422569028e-05, "loss": 0.6278, "mean_token_accuracy": 0.8009159803390503, "num_tokens": 378815756.0, "step": 36510 }, { "entropy": 0.6872751474380493, "epoch": 0.29216, "grad_norm": 3.4547276496887207, "learning_rate": 3.5406562625050023e-05, "loss": 0.6754, "mean_token_accuracy": 0.8077704012393951, "num_tokens": 378905561.0, "step": 36520 }, { "entropy": 0.6743013799190521, "epoch": 0.29224, "grad_norm": 1.8221461772918701, "learning_rate": 3.540256102440977e-05, "loss": 0.6767, "mean_token_accuracy": 0.8029794335365296, "num_tokens": 378999567.0, "step": 36530 }, { "entropy": 0.6973649621009826, "epoch": 0.29232, "grad_norm": 2.6520276069641113, "learning_rate": 3.539855942376951e-05, "loss": 0.6815, "mean_token_accuracy": 0.7925847291946411, "num_tokens": 379142320.0, "step": 36540 }, { "entropy": 0.6540397018194198, "epoch": 0.2924, "grad_norm": 4.730840682983398, "learning_rate": 3.5394557823129255e-05, "loss": 0.6712, "mean_token_accuracy": 0.8196411192417145, "num_tokens": 379180993.0, "step": 36550 }, { "entropy": 0.6427767634391784, "epoch": 0.29248, "grad_norm": 2.0982038974761963, "learning_rate": 3.5390556222489e-05, "loss": 0.6358, "mean_token_accuracy": 0.7985100209712982, "num_tokens": 379344833.0, "step": 36560 }, { "entropy": 0.5781637132167816, "epoch": 0.29256, "grad_norm": 3.549564838409424, "learning_rate": 3.538655462184874e-05, "loss": 0.5592, "mean_token_accuracy": 0.8316616237163543, "num_tokens": 379426264.0, "step": 36570 }, { "entropy": 0.6661825001239776, "epoch": 0.29264, "grad_norm": 2.5979397296905518, "learning_rate": 3.5382553021208486e-05, "loss": 0.6828, "mean_token_accuracy": 0.8009943962097168, "num_tokens": 379520695.0, "step": 36580 }, { "entropy": 0.7769182980060577, "epoch": 0.29272, "grad_norm": 2.487640142440796, "learning_rate": 3.537855142056823e-05, "loss": 0.7634, "mean_token_accuracy": 0.7761143624782563, "num_tokens": 379659876.0, "step": 36590 }, { "entropy": 0.6228470742702484, "epoch": 0.2928, "grad_norm": 4.252300262451172, "learning_rate": 3.537454981992797e-05, "loss": 0.6097, "mean_token_accuracy": 0.8323217093944549, "num_tokens": 379700338.0, "step": 36600 }, { "entropy": 0.6591989696025848, "epoch": 0.29288, "grad_norm": 1.3637616634368896, "learning_rate": 3.537054821928772e-05, "loss": 0.6668, "mean_token_accuracy": 0.7862603843212128, "num_tokens": 379864178.0, "step": 36610 }, { "entropy": 0.6800535082817077, "epoch": 0.29296, "grad_norm": 3.942953109741211, "learning_rate": 3.536654661864746e-05, "loss": 0.6803, "mean_token_accuracy": 0.8065142393112182, "num_tokens": 379956340.0, "step": 36620 }, { "entropy": 0.6891721844673157, "epoch": 0.29304, "grad_norm": 1.9265778064727783, "learning_rate": 3.5362545018007204e-05, "loss": 0.6814, "mean_token_accuracy": 0.8065204441547393, "num_tokens": 380051685.0, "step": 36630 }, { "entropy": 0.6716832101345063, "epoch": 0.29312, "grad_norm": 4.130398273468018, "learning_rate": 3.535854341736695e-05, "loss": 0.6659, "mean_token_accuracy": 0.7945649683475494, "num_tokens": 380180048.0, "step": 36640 }, { "entropy": 0.7894153654575348, "epoch": 0.2932, "grad_norm": 4.572340488433838, "learning_rate": 3.535454181672669e-05, "loss": 0.7863, "mean_token_accuracy": 0.799797797203064, "num_tokens": 380213727.0, "step": 36650 }, { "entropy": 0.6511576354503632, "epoch": 0.29328, "grad_norm": 1.7992454767227173, "learning_rate": 3.5350540216086435e-05, "loss": 0.6498, "mean_token_accuracy": 0.7933744490146637, "num_tokens": 380377567.0, "step": 36660 }, { "entropy": 0.7364093840122223, "epoch": 0.29336, "grad_norm": 3.024653434753418, "learning_rate": 3.534653861544618e-05, "loss": 0.7289, "mean_token_accuracy": 0.7945333659648895, "num_tokens": 380459268.0, "step": 36670 }, { "entropy": 0.7199224948883056, "epoch": 0.29344, "grad_norm": 1.912965178489685, "learning_rate": 3.534253701480593e-05, "loss": 0.7277, "mean_token_accuracy": 0.7947246193885803, "num_tokens": 380553531.0, "step": 36680 }, { "entropy": 0.6765833914279937, "epoch": 0.29352, "grad_norm": 1.937890648841858, "learning_rate": 3.5338535414165667e-05, "loss": 0.6768, "mean_token_accuracy": 0.7954942226409912, "num_tokens": 380685490.0, "step": 36690 }, { "entropy": 0.6946112453937531, "epoch": 0.2936, "grad_norm": 4.055283546447754, "learning_rate": 3.533453381352541e-05, "loss": 0.6829, "mean_token_accuracy": 0.818247503042221, "num_tokens": 380721655.0, "step": 36700 }, { "entropy": 0.6869126260280609, "epoch": 0.29368, "grad_norm": 2.6177146434783936, "learning_rate": 3.5330532212885154e-05, "loss": 0.6873, "mean_token_accuracy": 0.7805996716022492, "num_tokens": 380885495.0, "step": 36710 }, { "entropy": 0.6408200472593307, "epoch": 0.29376, "grad_norm": 4.556397438049316, "learning_rate": 3.5326530612244904e-05, "loss": 0.6396, "mean_token_accuracy": 0.8138554573059082, "num_tokens": 380958250.0, "step": 36720 }, { "entropy": 0.7214763283729553, "epoch": 0.29384, "grad_norm": 1.8408401012420654, "learning_rate": 3.532252901160464e-05, "loss": 0.7089, "mean_token_accuracy": 0.7957374453544617, "num_tokens": 381050339.0, "step": 36730 }, { "entropy": 0.7339127242565155, "epoch": 0.29392, "grad_norm": 2.086857795715332, "learning_rate": 3.5318527410964385e-05, "loss": 0.7332, "mean_token_accuracy": 0.7776268064975739, "num_tokens": 381193808.0, "step": 36740 }, { "entropy": 0.6634845167398453, "epoch": 0.294, "grad_norm": 5.468991279602051, "learning_rate": 3.5314525810324136e-05, "loss": 0.6632, "mean_token_accuracy": 0.8231385588645935, "num_tokens": 381231321.0, "step": 36750 }, { "entropy": 0.709958678483963, "epoch": 0.29408, "grad_norm": 1.8820641040802002, "learning_rate": 3.531052420968388e-05, "loss": 0.7103, "mean_token_accuracy": 0.781911301612854, "num_tokens": 381395124.0, "step": 36760 }, { "entropy": 0.673516321182251, "epoch": 0.29416, "grad_norm": 3.2322845458984375, "learning_rate": 3.5306522609043616e-05, "loss": 0.6671, "mean_token_accuracy": 0.807893818616867, "num_tokens": 381476515.0, "step": 36770 }, { "entropy": 0.7076229631900788, "epoch": 0.29424, "grad_norm": 1.8037439584732056, "learning_rate": 3.530252100840336e-05, "loss": 0.7094, "mean_token_accuracy": 0.7958472669124603, "num_tokens": 381568520.0, "step": 36780 }, { "entropy": 0.7004903256893158, "epoch": 0.29432, "grad_norm": 2.9372458457946777, "learning_rate": 3.529851940776311e-05, "loss": 0.6978, "mean_token_accuracy": 0.7846579372882843, "num_tokens": 381706970.0, "step": 36790 }, { "entropy": 0.7223540663719177, "epoch": 0.2944, "grad_norm": 4.954202175140381, "learning_rate": 3.5294517807122854e-05, "loss": 0.7259, "mean_token_accuracy": 0.8092132627964019, "num_tokens": 381750198.0, "step": 36800 }, { "entropy": 0.6650327265262603, "epoch": 0.29448, "grad_norm": 1.7962827682495117, "learning_rate": 3.529051620648259e-05, "loss": 0.6608, "mean_token_accuracy": 0.7895884275436401, "num_tokens": 381914038.0, "step": 36810 }, { "entropy": 0.6030425578355789, "epoch": 0.29456, "grad_norm": 2.9545886516571045, "learning_rate": 3.528651460584234e-05, "loss": 0.5902, "mean_token_accuracy": 0.8215523421764374, "num_tokens": 381997323.0, "step": 36820 }, { "entropy": 0.778455626964569, "epoch": 0.29464, "grad_norm": 2.188748359680176, "learning_rate": 3.5282513005202085e-05, "loss": 0.7671, "mean_token_accuracy": 0.7840877771377563, "num_tokens": 382089997.0, "step": 36830 }, { "entropy": 0.6917395710945129, "epoch": 0.29472, "grad_norm": 3.232264995574951, "learning_rate": 3.527851140456183e-05, "loss": 0.6916, "mean_token_accuracy": 0.7897515237331391, "num_tokens": 382234796.0, "step": 36840 }, { "entropy": 0.7384214222431182, "epoch": 0.2948, "grad_norm": 4.417449474334717, "learning_rate": 3.5274509803921566e-05, "loss": 0.7195, "mean_token_accuracy": 0.8049258530139923, "num_tokens": 382275382.0, "step": 36850 }, { "entropy": 0.6841981112957001, "epoch": 0.29488, "grad_norm": 1.969901442527771, "learning_rate": 3.5270508203281316e-05, "loss": 0.6894, "mean_token_accuracy": 0.7860124170780182, "num_tokens": 382438906.0, "step": 36860 }, { "entropy": 0.5690189719200134, "epoch": 0.29496, "grad_norm": 3.2783219814300537, "learning_rate": 3.526650660264106e-05, "loss": 0.5653, "mean_token_accuracy": 0.8332594931125641, "num_tokens": 382514629.0, "step": 36870 }, { "entropy": 0.727502989768982, "epoch": 0.29504, "grad_norm": 2.0434296131134033, "learning_rate": 3.5262505002000804e-05, "loss": 0.729, "mean_token_accuracy": 0.7915780961513519, "num_tokens": 382607992.0, "step": 36880 }, { "entropy": 0.6446196854114532, "epoch": 0.29512, "grad_norm": 2.20867657661438, "learning_rate": 3.525850340136055e-05, "loss": 0.6381, "mean_token_accuracy": 0.8041613578796387, "num_tokens": 382734728.0, "step": 36890 }, { "entropy": 0.6713730812072753, "epoch": 0.2952, "grad_norm": 5.091909408569336, "learning_rate": 3.525450180072029e-05, "loss": 0.6763, "mean_token_accuracy": 0.8226538479328156, "num_tokens": 382766580.0, "step": 36900 }, { "entropy": 0.6350071132183075, "epoch": 0.29528, "grad_norm": 1.9258922338485718, "learning_rate": 3.5250500200080035e-05, "loss": 0.6371, "mean_token_accuracy": 0.7980370879173279, "num_tokens": 382927286.0, "step": 36910 }, { "entropy": 0.6961120665073395, "epoch": 0.29536, "grad_norm": 3.016833782196045, "learning_rate": 3.524649859943978e-05, "loss": 0.6922, "mean_token_accuracy": 0.802030885219574, "num_tokens": 382999599.0, "step": 36920 }, { "entropy": 0.6992147326469421, "epoch": 0.29544, "grad_norm": 1.9628793001174927, "learning_rate": 3.524249699879952e-05, "loss": 0.6966, "mean_token_accuracy": 0.7988811373710633, "num_tokens": 383092534.0, "step": 36930 }, { "entropy": 0.6689881026744843, "epoch": 0.29552, "grad_norm": 2.3282816410064697, "learning_rate": 3.5238495398159266e-05, "loss": 0.6674, "mean_token_accuracy": 0.7928081274032592, "num_tokens": 383229165.0, "step": 36940 }, { "entropy": 0.6096726000308991, "epoch": 0.2956, "grad_norm": 4.480708599090576, "learning_rate": 3.523449379751901e-05, "loss": 0.6117, "mean_token_accuracy": 0.8312718152999878, "num_tokens": 383266177.0, "step": 36950 }, { "entropy": 0.6671097278594971, "epoch": 0.29568, "grad_norm": 1.8482904434204102, "learning_rate": 3.5230492196878754e-05, "loss": 0.6631, "mean_token_accuracy": 0.7897288799285889, "num_tokens": 383430017.0, "step": 36960 }, { "entropy": 0.6464181065559387, "epoch": 0.29576, "grad_norm": 2.669443130493164, "learning_rate": 3.52264905962385e-05, "loss": 0.6409, "mean_token_accuracy": 0.811165452003479, "num_tokens": 383522343.0, "step": 36970 }, { "entropy": 0.7135056257247925, "epoch": 0.29584, "grad_norm": 1.680393099784851, "learning_rate": 3.522248899559824e-05, "loss": 0.7081, "mean_token_accuracy": 0.7969723701477051, "num_tokens": 383617617.0, "step": 36980 }, { "entropy": 0.7062711954116822, "epoch": 0.29592, "grad_norm": 2.230586051940918, "learning_rate": 3.5218487394957985e-05, "loss": 0.6994, "mean_token_accuracy": 0.7868960559368133, "num_tokens": 383760071.0, "step": 36990 }, { "entropy": 0.6677505016326905, "epoch": 0.296, "grad_norm": 4.73370885848999, "learning_rate": 3.521448579431773e-05, "loss": 0.6643, "mean_token_accuracy": 0.8175143122673034, "num_tokens": 383802466.0, "step": 37000 }, { "entropy": 0.6702621340751648, "epoch": 0.29608, "grad_norm": 1.9849789142608643, "learning_rate": 3.521048419367747e-05, "loss": 0.6695, "mean_token_accuracy": 0.7890383541584015, "num_tokens": 383965558.0, "step": 37010 }, { "entropy": 0.7378854990005493, "epoch": 0.29616, "grad_norm": 3.253277540206909, "learning_rate": 3.5206482593037216e-05, "loss": 0.7441, "mean_token_accuracy": 0.793089359998703, "num_tokens": 384033550.0, "step": 37020 }, { "entropy": 0.6754201710224151, "epoch": 0.29624, "grad_norm": 2.0053117275238037, "learning_rate": 3.520248099239696e-05, "loss": 0.6652, "mean_token_accuracy": 0.8038303554058075, "num_tokens": 384126113.0, "step": 37030 }, { "entropy": 0.7396449208259582, "epoch": 0.29632, "grad_norm": 3.051117181777954, "learning_rate": 3.51984793917567e-05, "loss": 0.7373, "mean_token_accuracy": 0.7790970504283905, "num_tokens": 384258245.0, "step": 37040 }, { "entropy": 0.6796013355255127, "epoch": 0.2964, "grad_norm": 4.951813697814941, "learning_rate": 3.519447779111645e-05, "loss": 0.6862, "mean_token_accuracy": 0.81296808719635, "num_tokens": 384297761.0, "step": 37050 }, { "entropy": 0.6720604836940766, "epoch": 0.29648, "grad_norm": 1.7535187005996704, "learning_rate": 3.519047619047619e-05, "loss": 0.6656, "mean_token_accuracy": 0.7907656788825989, "num_tokens": 384461425.0, "step": 37060 }, { "entropy": 0.7130161941051483, "epoch": 0.29656, "grad_norm": 3.5679943561553955, "learning_rate": 3.518647458983594e-05, "loss": 0.7086, "mean_token_accuracy": 0.7985764145851135, "num_tokens": 384543148.0, "step": 37070 }, { "entropy": 0.7031194269657135, "epoch": 0.29664, "grad_norm": 1.7393901348114014, "learning_rate": 3.518247298919568e-05, "loss": 0.7002, "mean_token_accuracy": 0.801368260383606, "num_tokens": 384634956.0, "step": 37080 }, { "entropy": 0.7987996459007263, "epoch": 0.29672, "grad_norm": 2.1919970512390137, "learning_rate": 3.517847138855542e-05, "loss": 0.8013, "mean_token_accuracy": 0.7667636573314667, "num_tokens": 384774524.0, "step": 37090 }, { "entropy": 0.6547859907150269, "epoch": 0.2968, "grad_norm": 4.134953022003174, "learning_rate": 3.5174469787915166e-05, "loss": 0.6449, "mean_token_accuracy": 0.8267544090747834, "num_tokens": 384814227.0, "step": 37100 }, { "entropy": 0.632650825381279, "epoch": 0.29688, "grad_norm": 1.3198270797729492, "learning_rate": 3.5170468187274916e-05, "loss": 0.6337, "mean_token_accuracy": 0.7998534381389618, "num_tokens": 384978067.0, "step": 37110 }, { "entropy": 0.6858431339263916, "epoch": 0.29696, "grad_norm": 3.0587668418884277, "learning_rate": 3.516646658663465e-05, "loss": 0.6819, "mean_token_accuracy": 0.8027980029582977, "num_tokens": 385073398.0, "step": 37120 }, { "entropy": 0.7488427996635437, "epoch": 0.29704, "grad_norm": 1.4778904914855957, "learning_rate": 3.51624649859944e-05, "loss": 0.7349, "mean_token_accuracy": 0.7857530534267425, "num_tokens": 385168829.0, "step": 37130 }, { "entropy": 0.692369920015335, "epoch": 0.29712, "grad_norm": 3.409893035888672, "learning_rate": 3.515846338535415e-05, "loss": 0.6891, "mean_token_accuracy": 0.7891331255435944, "num_tokens": 385309644.0, "step": 37140 }, { "entropy": 0.6736250132322311, "epoch": 0.2972, "grad_norm": 5.9103593826293945, "learning_rate": 3.515446178471389e-05, "loss": 0.6814, "mean_token_accuracy": 0.8153634071350098, "num_tokens": 385348779.0, "step": 37150 }, { "entropy": 0.6454717934131622, "epoch": 0.29728, "grad_norm": 2.1631739139556885, "learning_rate": 3.515046018407363e-05, "loss": 0.6436, "mean_token_accuracy": 0.7980642437934875, "num_tokens": 385512619.0, "step": 37160 }, { "entropy": 0.6504899621009826, "epoch": 0.29736, "grad_norm": 3.3091914653778076, "learning_rate": 3.514645858343337e-05, "loss": 0.638, "mean_token_accuracy": 0.8137158036231995, "num_tokens": 385595692.0, "step": 37170 }, { "entropy": 0.6613021790981293, "epoch": 0.29744, "grad_norm": 1.6979608535766602, "learning_rate": 3.514245698279312e-05, "loss": 0.6717, "mean_token_accuracy": 0.805379056930542, "num_tokens": 385689110.0, "step": 37180 }, { "entropy": 0.7302812099456787, "epoch": 0.29752, "grad_norm": 3.915754556655884, "learning_rate": 3.5138455382152866e-05, "loss": 0.7126, "mean_token_accuracy": 0.7796899437904358, "num_tokens": 385837957.0, "step": 37190 }, { "entropy": 0.6923150062561035, "epoch": 0.2976, "grad_norm": 4.535330295562744, "learning_rate": 3.51344537815126e-05, "loss": 0.6919, "mean_token_accuracy": 0.8168317139148712, "num_tokens": 385880080.0, "step": 37200 }, { "entropy": 0.7022875368595123, "epoch": 0.29768, "grad_norm": 2.7969229221343994, "learning_rate": 3.513045218087235e-05, "loss": 0.7059, "mean_token_accuracy": 0.7799218416213989, "num_tokens": 386043920.0, "step": 37210 }, { "entropy": 0.7651858150959014, "epoch": 0.29776, "grad_norm": 3.2463557720184326, "learning_rate": 3.51264505802321e-05, "loss": 0.7574, "mean_token_accuracy": 0.7869675755500793, "num_tokens": 386131447.0, "step": 37220 }, { "entropy": 0.714882081747055, "epoch": 0.29784, "grad_norm": 2.0610506534576416, "learning_rate": 3.512244897959184e-05, "loss": 0.7242, "mean_token_accuracy": 0.7892498850822449, "num_tokens": 386225306.0, "step": 37230 }, { "entropy": 0.6863507211208344, "epoch": 0.29792, "grad_norm": 2.6686840057373047, "learning_rate": 3.511844737895158e-05, "loss": 0.6754, "mean_token_accuracy": 0.7907460451126098, "num_tokens": 386361866.0, "step": 37240 }, { "entropy": 0.6130269527435303, "epoch": 0.298, "grad_norm": 4.582677841186523, "learning_rate": 3.511444577831133e-05, "loss": 0.5962, "mean_token_accuracy": 0.8320788681507111, "num_tokens": 386398499.0, "step": 37250 }, { "entropy": 0.6235967844724655, "epoch": 0.29808, "grad_norm": 2.3904621601104736, "learning_rate": 3.511044417767107e-05, "loss": 0.6281, "mean_token_accuracy": 0.8014106035232544, "num_tokens": 386562339.0, "step": 37260 }, { "entropy": 0.7132952749729157, "epoch": 0.29816, "grad_norm": 3.0208473205566406, "learning_rate": 3.5106442577030816e-05, "loss": 0.7035, "mean_token_accuracy": 0.7972862005233765, "num_tokens": 386654841.0, "step": 37270 }, { "entropy": 0.6688333958387375, "epoch": 0.29824, "grad_norm": 1.7395716905593872, "learning_rate": 3.510244097639056e-05, "loss": 0.6682, "mean_token_accuracy": 0.8048729717731475, "num_tokens": 386748970.0, "step": 37280 }, { "entropy": 0.6878900289535522, "epoch": 0.29832, "grad_norm": 2.232278347015381, "learning_rate": 3.50984393757503e-05, "loss": 0.6846, "mean_token_accuracy": 0.7923440337181091, "num_tokens": 386891196.0, "step": 37290 }, { "entropy": 0.7366504430770874, "epoch": 0.2984, "grad_norm": 5.518690586090088, "learning_rate": 3.509443777511005e-05, "loss": 0.7314, "mean_token_accuracy": 0.8076850533485412, "num_tokens": 386935441.0, "step": 37300 }, { "entropy": 0.7006134152412414, "epoch": 0.29848, "grad_norm": 1.9262109994888306, "learning_rate": 3.509043617446979e-05, "loss": 0.7024, "mean_token_accuracy": 0.7825156569480896, "num_tokens": 387099225.0, "step": 37310 }, { "entropy": 0.6554401576519012, "epoch": 0.29856, "grad_norm": 3.447761058807373, "learning_rate": 3.5086434573829534e-05, "loss": 0.6442, "mean_token_accuracy": 0.8167496681213379, "num_tokens": 387187854.0, "step": 37320 }, { "entropy": 0.6499742090702056, "epoch": 0.29864, "grad_norm": 2.0592005252838135, "learning_rate": 3.508243297318928e-05, "loss": 0.6471, "mean_token_accuracy": 0.8117820084095001, "num_tokens": 387282412.0, "step": 37330 }, { "entropy": 0.6699997365474701, "epoch": 0.29872, "grad_norm": 2.96199893951416, "learning_rate": 3.507843137254902e-05, "loss": 0.664, "mean_token_accuracy": 0.7944060027599334, "num_tokens": 387432684.0, "step": 37340 }, { "entropy": 0.6706370115280151, "epoch": 0.2988, "grad_norm": 4.933314323425293, "learning_rate": 3.5074429771908765e-05, "loss": 0.6665, "mean_token_accuracy": 0.8172117829322815, "num_tokens": 387475566.0, "step": 37350 }, { "entropy": 0.6546236157417298, "epoch": 0.29888, "grad_norm": 1.642652153968811, "learning_rate": 3.507042817126851e-05, "loss": 0.6528, "mean_token_accuracy": 0.7951244533061981, "num_tokens": 387638174.0, "step": 37360 }, { "entropy": 0.612000024318695, "epoch": 0.29896, "grad_norm": 3.7678096294403076, "learning_rate": 3.506642657062825e-05, "loss": 0.6074, "mean_token_accuracy": 0.8271937370300293, "num_tokens": 387706447.0, "step": 37370 }, { "entropy": 0.7338310301303863, "epoch": 0.29904, "grad_norm": 1.613342046737671, "learning_rate": 3.5062424969987996e-05, "loss": 0.7488, "mean_token_accuracy": 0.7921507954597473, "num_tokens": 387798214.0, "step": 37380 }, { "entropy": 0.6619681775569916, "epoch": 0.29912, "grad_norm": 2.2483417987823486, "learning_rate": 3.505842336934774e-05, "loss": 0.6531, "mean_token_accuracy": 0.8003167271614074, "num_tokens": 387942375.0, "step": 37390 }, { "entropy": 0.7076200008392334, "epoch": 0.2992, "grad_norm": 5.485179901123047, "learning_rate": 3.5054421768707484e-05, "loss": 0.7088, "mean_token_accuracy": 0.8086591482162475, "num_tokens": 387982497.0, "step": 37400 }, { "entropy": 0.6886706948280334, "epoch": 0.29928, "grad_norm": 2.614880323410034, "learning_rate": 3.505042016806723e-05, "loss": 0.6896, "mean_token_accuracy": 0.784154862165451, "num_tokens": 388144130.0, "step": 37410 }, { "entropy": 0.6355902761220932, "epoch": 0.29936, "grad_norm": 3.1048622131347656, "learning_rate": 3.504641856742698e-05, "loss": 0.6298, "mean_token_accuracy": 0.8184501886367798, "num_tokens": 388211372.0, "step": 37420 }, { "entropy": 0.7495626747608185, "epoch": 0.29944, "grad_norm": 1.7579306364059448, "learning_rate": 3.5042416966786715e-05, "loss": 0.7431, "mean_token_accuracy": 0.7882186233997345, "num_tokens": 388304893.0, "step": 37430 }, { "entropy": 0.7132737100124359, "epoch": 0.29952, "grad_norm": 2.8660924434661865, "learning_rate": 3.503841536614646e-05, "loss": 0.708, "mean_token_accuracy": 0.7836978137493134, "num_tokens": 388453276.0, "step": 37440 }, { "entropy": 0.7216740548610687, "epoch": 0.2996, "grad_norm": 4.917655944824219, "learning_rate": 3.50344137655062e-05, "loss": 0.7171, "mean_token_accuracy": 0.8088875770568847, "num_tokens": 388496338.0, "step": 37450 }, { "entropy": 0.6442401647567749, "epoch": 0.29968, "grad_norm": 2.140576124191284, "learning_rate": 3.503041216486595e-05, "loss": 0.6346, "mean_token_accuracy": 0.7963483214378357, "num_tokens": 388660178.0, "step": 37460 }, { "entropy": 0.6402432709932327, "epoch": 0.29976, "grad_norm": 4.457278251647949, "learning_rate": 3.502641056422569e-05, "loss": 0.6395, "mean_token_accuracy": 0.8124992728233338, "num_tokens": 388755288.0, "step": 37470 }, { "entropy": 0.7240014195442199, "epoch": 0.29984, "grad_norm": 2.2960572242736816, "learning_rate": 3.5022408963585433e-05, "loss": 0.7268, "mean_token_accuracy": 0.7968043625354767, "num_tokens": 388848077.0, "step": 37480 }, { "entropy": 0.6522975027561188, "epoch": 0.29992, "grad_norm": 2.3855860233306885, "learning_rate": 3.501840736294518e-05, "loss": 0.6545, "mean_token_accuracy": 0.7981668055057526, "num_tokens": 388996756.0, "step": 37490 }, { "entropy": 0.7267703831195831, "epoch": 0.3, "grad_norm": 4.377167224884033, "learning_rate": 3.501440576230493e-05, "loss": 0.7184, "mean_token_accuracy": 0.8092951238155365, "num_tokens": 389039099.0, "step": 37500 }, { "entropy": 0.6185397684574128, "epoch": 0.30008, "grad_norm": 1.881282925605774, "learning_rate": 3.5010404161664665e-05, "loss": 0.6208, "mean_token_accuracy": 0.8003480732440948, "num_tokens": 389202939.0, "step": 37510 }, { "entropy": 0.672033628821373, "epoch": 0.30016, "grad_norm": 4.217187404632568, "learning_rate": 3.500640256102441e-05, "loss": 0.6584, "mean_token_accuracy": 0.8094385206699372, "num_tokens": 389294778.0, "step": 37520 }, { "entropy": 0.7434608519077301, "epoch": 0.30024, "grad_norm": 1.58729088306427, "learning_rate": 3.500240096038416e-05, "loss": 0.756, "mean_token_accuracy": 0.7921805918216706, "num_tokens": 389388447.0, "step": 37530 }, { "entropy": 0.6863324344158173, "epoch": 0.30032, "grad_norm": 2.5527384281158447, "learning_rate": 3.49983993597439e-05, "loss": 0.6801, "mean_token_accuracy": 0.7898538708686829, "num_tokens": 389529128.0, "step": 37540 }, { "entropy": 0.7338887721300125, "epoch": 0.3004, "grad_norm": 4.331012725830078, "learning_rate": 3.499439775910364e-05, "loss": 0.733, "mean_token_accuracy": 0.8041574835777283, "num_tokens": 389570177.0, "step": 37550 }, { "entropy": 0.6594594717025757, "epoch": 0.30048, "grad_norm": 1.6605756282806396, "learning_rate": 3.499039615846338e-05, "loss": 0.6565, "mean_token_accuracy": 0.7921531498432159, "num_tokens": 389734017.0, "step": 37560 }, { "entropy": 0.8348277449607849, "epoch": 0.30056, "grad_norm": 4.528436183929443, "learning_rate": 3.4986394557823134e-05, "loss": 0.8382, "mean_token_accuracy": 0.7693300247192383, "num_tokens": 389827138.0, "step": 37570 }, { "entropy": 0.6898598909378052, "epoch": 0.30064, "grad_norm": 1.9555920362472534, "learning_rate": 3.498239295718288e-05, "loss": 0.6899, "mean_token_accuracy": 0.8054659485816955, "num_tokens": 389920172.0, "step": 37580 }, { "entropy": 0.6719742715358734, "epoch": 0.30072, "grad_norm": 1.9777650833129883, "learning_rate": 3.4978391356542614e-05, "loss": 0.6548, "mean_token_accuracy": 0.799226450920105, "num_tokens": 390051051.0, "step": 37590 }, { "entropy": 0.7227178335189819, "epoch": 0.3008, "grad_norm": 4.679501056671143, "learning_rate": 3.4974389755902365e-05, "loss": 0.7201, "mean_token_accuracy": 0.8019687235355377, "num_tokens": 390091387.0, "step": 37600 }, { "entropy": 0.6731508314609528, "epoch": 0.30088, "grad_norm": 1.7374671697616577, "learning_rate": 3.497038815526211e-05, "loss": 0.6715, "mean_token_accuracy": 0.7880790948867797, "num_tokens": 390255064.0, "step": 37610 }, { "entropy": 0.5997707307338714, "epoch": 0.30096, "grad_norm": 3.2475907802581787, "learning_rate": 3.496638655462185e-05, "loss": 0.6008, "mean_token_accuracy": 0.8242222428321838, "num_tokens": 390330204.0, "step": 37620 }, { "entropy": 0.6906122684478759, "epoch": 0.30104, "grad_norm": 1.897915005683899, "learning_rate": 3.496238495398159e-05, "loss": 0.6787, "mean_token_accuracy": 0.8024229288101197, "num_tokens": 390426417.0, "step": 37630 }, { "entropy": 0.7021963417530059, "epoch": 0.30112, "grad_norm": 3.6105310916900635, "learning_rate": 3.495838335334134e-05, "loss": 0.6981, "mean_token_accuracy": 0.7900031924247741, "num_tokens": 390570070.0, "step": 37640 }, { "entropy": 0.6336978286504745, "epoch": 0.3012, "grad_norm": 4.015756607055664, "learning_rate": 3.4954381752701083e-05, "loss": 0.6322, "mean_token_accuracy": 0.8274898171424866, "num_tokens": 390606153.0, "step": 37650 }, { "entropy": 0.6569880902767181, "epoch": 0.30128, "grad_norm": 1.8108443021774292, "learning_rate": 3.495038015206083e-05, "loss": 0.6556, "mean_token_accuracy": 0.7909746050834656, "num_tokens": 390769993.0, "step": 37660 }, { "entropy": 0.6688363373279571, "epoch": 0.30136, "grad_norm": 2.758587598800659, "learning_rate": 3.494637855142057e-05, "loss": 0.6678, "mean_token_accuracy": 0.8013476252555847, "num_tokens": 390864943.0, "step": 37670 }, { "entropy": 0.6985702574253082, "epoch": 0.30144, "grad_norm": 1.7202717065811157, "learning_rate": 3.4942376950780315e-05, "loss": 0.6917, "mean_token_accuracy": 0.7980988562107086, "num_tokens": 390960386.0, "step": 37680 }, { "entropy": 0.7179923534393311, "epoch": 0.30152, "grad_norm": 2.593977451324463, "learning_rate": 3.493837535014006e-05, "loss": 0.7081, "mean_token_accuracy": 0.789135730266571, "num_tokens": 391100206.0, "step": 37690 }, { "entropy": 0.6430986493825912, "epoch": 0.3016, "grad_norm": 5.909607887268066, "learning_rate": 3.49343737494998e-05, "loss": 0.6579, "mean_token_accuracy": 0.8237301588058472, "num_tokens": 391139042.0, "step": 37700 }, { "entropy": 0.7069853484630585, "epoch": 0.30168, "grad_norm": 2.1161749362945557, "learning_rate": 3.4930372148859546e-05, "loss": 0.7028, "mean_token_accuracy": 0.7842573463916779, "num_tokens": 391302827.0, "step": 37710 }, { "entropy": 0.6719306409358978, "epoch": 0.30176, "grad_norm": 3.8850884437561035, "learning_rate": 3.492637054821929e-05, "loss": 0.6677, "mean_token_accuracy": 0.807528692483902, "num_tokens": 391393423.0, "step": 37720 }, { "entropy": 0.6967575073242187, "epoch": 0.30184, "grad_norm": 1.4002500772476196, "learning_rate": 3.492236894757903e-05, "loss": 0.6975, "mean_token_accuracy": 0.7956880807876587, "num_tokens": 391489403.0, "step": 37730 }, { "entropy": 0.701448667049408, "epoch": 0.30192, "grad_norm": 3.1048710346221924, "learning_rate": 3.491836734693878e-05, "loss": 0.6999, "mean_token_accuracy": 0.7833438336849212, "num_tokens": 391636426.0, "step": 37740 }, { "entropy": 0.7699427783489228, "epoch": 0.302, "grad_norm": 4.688963413238525, "learning_rate": 3.491436574629852e-05, "loss": 0.7559, "mean_token_accuracy": 0.7991350531578064, "num_tokens": 391678356.0, "step": 37750 }, { "entropy": 0.6746651589870453, "epoch": 0.30208, "grad_norm": 1.7838640213012695, "learning_rate": 3.4910364145658264e-05, "loss": 0.6789, "mean_token_accuracy": 0.7908144414424896, "num_tokens": 391842131.0, "step": 37760 }, { "entropy": 0.6798747032880783, "epoch": 0.30216, "grad_norm": 2.939403533935547, "learning_rate": 3.490636254501801e-05, "loss": 0.6675, "mean_token_accuracy": 0.8067712843418121, "num_tokens": 391923197.0, "step": 37770 }, { "entropy": 0.6772729337215424, "epoch": 0.30224, "grad_norm": 1.6262551546096802, "learning_rate": 3.490236094437775e-05, "loss": 0.6641, "mean_token_accuracy": 0.8032174229621887, "num_tokens": 392016232.0, "step": 37780 }, { "entropy": 0.7385958671569824, "epoch": 0.30232, "grad_norm": 2.838059663772583, "learning_rate": 3.4898359343737495e-05, "loss": 0.7476, "mean_token_accuracy": 0.7756222605705261, "num_tokens": 392153234.0, "step": 37790 }, { "entropy": 0.6125688582658768, "epoch": 0.3024, "grad_norm": 4.678197860717773, "learning_rate": 3.489435774309724e-05, "loss": 0.5926, "mean_token_accuracy": 0.8331767201423645, "num_tokens": 392193720.0, "step": 37800 }, { "entropy": 0.6577578485012054, "epoch": 0.30248, "grad_norm": 2.0903491973876953, "learning_rate": 3.489035614245699e-05, "loss": 0.6541, "mean_token_accuracy": 0.7946323871612548, "num_tokens": 392357560.0, "step": 37810 }, { "entropy": 0.6183215975761414, "epoch": 0.30256, "grad_norm": 3.3816473484039307, "learning_rate": 3.4886354541816727e-05, "loss": 0.6126, "mean_token_accuracy": 0.8183824360370636, "num_tokens": 392445004.0, "step": 37820 }, { "entropy": 0.7687546253204346, "epoch": 0.30264, "grad_norm": 3.05178165435791, "learning_rate": 3.488235294117647e-05, "loss": 0.7653, "mean_token_accuracy": 0.7881555557250977, "num_tokens": 392538734.0, "step": 37830 }, { "entropy": 0.7373194634914398, "epoch": 0.30272, "grad_norm": 2.619614839553833, "learning_rate": 3.4878351340536214e-05, "loss": 0.7415, "mean_token_accuracy": 0.7773948848247528, "num_tokens": 392690623.0, "step": 37840 }, { "entropy": 0.6989430725574494, "epoch": 0.3028, "grad_norm": 5.857180118560791, "learning_rate": 3.4874349739895964e-05, "loss": 0.691, "mean_token_accuracy": 0.8157419741153717, "num_tokens": 392730893.0, "step": 37850 }, { "entropy": 0.6646151959896087, "epoch": 0.30288, "grad_norm": 1.6717976331710815, "learning_rate": 3.48703481392557e-05, "loss": 0.658, "mean_token_accuracy": 0.7950589895248413, "num_tokens": 392894077.0, "step": 37860 }, { "entropy": 0.6610234379768372, "epoch": 0.30296, "grad_norm": 3.163559675216675, "learning_rate": 3.4866346538615445e-05, "loss": 0.6583, "mean_token_accuracy": 0.814027738571167, "num_tokens": 392971491.0, "step": 37870 }, { "entropy": 0.716232281923294, "epoch": 0.30304, "grad_norm": 1.6196764707565308, "learning_rate": 3.4862344937975196e-05, "loss": 0.7125, "mean_token_accuracy": 0.7978079199790955, "num_tokens": 393064973.0, "step": 37880 }, { "entropy": 0.6986759185791016, "epoch": 0.30312, "grad_norm": 2.5151207447052, "learning_rate": 3.485834333733494e-05, "loss": 0.6921, "mean_token_accuracy": 0.7928611397743225, "num_tokens": 393196446.0, "step": 37890 }, { "entropy": 0.6800568729639054, "epoch": 0.3032, "grad_norm": 5.189250469207764, "learning_rate": 3.4854341736694676e-05, "loss": 0.6866, "mean_token_accuracy": 0.8121212303638459, "num_tokens": 393229692.0, "step": 37900 }, { "entropy": 0.6775520086288452, "epoch": 0.30328, "grad_norm": 2.1055407524108887, "learning_rate": 3.485034013605442e-05, "loss": 0.677, "mean_token_accuracy": 0.7893136322498322, "num_tokens": 393393532.0, "step": 37910 }, { "entropy": 0.6829856872558594, "epoch": 0.30336, "grad_norm": 2.887485980987549, "learning_rate": 3.484633853541417e-05, "loss": 0.667, "mean_token_accuracy": 0.8051388204097748, "num_tokens": 393488702.0, "step": 37920 }, { "entropy": 0.6715941369533539, "epoch": 0.30344, "grad_norm": 1.9377317428588867, "learning_rate": 3.4842336934773914e-05, "loss": 0.6785, "mean_token_accuracy": 0.8020134091377258, "num_tokens": 393582773.0, "step": 37930 }, { "entropy": 0.7638733565807343, "epoch": 0.30352, "grad_norm": 2.3761072158813477, "learning_rate": 3.483833533413365e-05, "loss": 0.7438, "mean_token_accuracy": 0.7761877655982972, "num_tokens": 393726664.0, "step": 37940 }, { "entropy": 0.6083025097846985, "epoch": 0.3036, "grad_norm": 5.368999004364014, "learning_rate": 3.48343337334934e-05, "loss": 0.609, "mean_token_accuracy": 0.8307586371898651, "num_tokens": 393771908.0, "step": 37950 }, { "entropy": 0.681055736541748, "epoch": 0.30368, "grad_norm": 1.732973337173462, "learning_rate": 3.4830332132853145e-05, "loss": 0.6849, "mean_token_accuracy": 0.787488853931427, "num_tokens": 393934100.0, "step": 37960 }, { "entropy": 0.6613762080669403, "epoch": 0.30376, "grad_norm": 3.4892899990081787, "learning_rate": 3.482633053221289e-05, "loss": 0.658, "mean_token_accuracy": 0.8104457974433898, "num_tokens": 394007804.0, "step": 37970 }, { "entropy": 0.7427656471729278, "epoch": 0.30384, "grad_norm": 1.929005742073059, "learning_rate": 3.4822328931572626e-05, "loss": 0.7413, "mean_token_accuracy": 0.791242641210556, "num_tokens": 394100989.0, "step": 37980 }, { "entropy": 0.7102053880691528, "epoch": 0.30392, "grad_norm": 2.574941873550415, "learning_rate": 3.4818327330932376e-05, "loss": 0.7071, "mean_token_accuracy": 0.7848516404628754, "num_tokens": 394255480.0, "step": 37990 }, { "entropy": 0.6580081850290298, "epoch": 0.304, "grad_norm": 3.9877378940582275, "learning_rate": 3.481432573029212e-05, "loss": 0.6528, "mean_token_accuracy": 0.8230946362018585, "num_tokens": 394299101.0, "step": 38000 }, { "entropy": 0.7128600597381591, "epoch": 0.30408, "grad_norm": 2.2250475883483887, "learning_rate": 3.4810324129651864e-05, "loss": 0.7142, "mean_token_accuracy": 0.7808378100395202, "num_tokens": 394462941.0, "step": 38010 }, { "entropy": 0.6853418469429016, "epoch": 0.30416, "grad_norm": 4.503905773162842, "learning_rate": 3.480632252901161e-05, "loss": 0.6738, "mean_token_accuracy": 0.8006560981273652, "num_tokens": 394553095.0, "step": 38020 }, { "entropy": 0.7234272837638855, "epoch": 0.30424, "grad_norm": 1.4669677019119263, "learning_rate": 3.480232092837135e-05, "loss": 0.7279, "mean_token_accuracy": 0.7945241749286651, "num_tokens": 394648219.0, "step": 38030 }, { "entropy": 0.7178610324859619, "epoch": 0.30432, "grad_norm": 3.307687759399414, "learning_rate": 3.4798319327731095e-05, "loss": 0.7195, "mean_token_accuracy": 0.7849813878536225, "num_tokens": 394772557.0, "step": 38040 }, { "entropy": 0.7269312620162964, "epoch": 0.3044, "grad_norm": 4.203392028808594, "learning_rate": 3.479431772709084e-05, "loss": 0.7147, "mean_token_accuracy": 0.8085006654262543, "num_tokens": 394806105.0, "step": 38050 }, { "entropy": 0.6876409649848938, "epoch": 0.30448, "grad_norm": 1.9295010566711426, "learning_rate": 3.479031612645058e-05, "loss": 0.6849, "mean_token_accuracy": 0.7860894083976746, "num_tokens": 394969945.0, "step": 38060 }, { "entropy": 0.7255348086357116, "epoch": 0.30456, "grad_norm": 3.998403787612915, "learning_rate": 3.4786314525810326e-05, "loss": 0.7219, "mean_token_accuracy": 0.7942761480808258, "num_tokens": 395055519.0, "step": 38070 }, { "entropy": 0.7212690263986588, "epoch": 0.30464, "grad_norm": 1.9253487586975098, "learning_rate": 3.478231292517007e-05, "loss": 0.7173, "mean_token_accuracy": 0.7941584646701813, "num_tokens": 395149743.0, "step": 38080 }, { "entropy": 0.6894005000591278, "epoch": 0.30472, "grad_norm": 2.7064006328582764, "learning_rate": 3.4778311324529814e-05, "loss": 0.6886, "mean_token_accuracy": 0.7887520015239715, "num_tokens": 395296746.0, "step": 38090 }, { "entropy": 0.6595361232757568, "epoch": 0.3048, "grad_norm": 4.878591537475586, "learning_rate": 3.477430972388956e-05, "loss": 0.659, "mean_token_accuracy": 0.8255167126655578, "num_tokens": 395339885.0, "step": 38100 }, { "entropy": 0.6519383192062378, "epoch": 0.30488, "grad_norm": 1.325437307357788, "learning_rate": 3.47703081232493e-05, "loss": 0.6538, "mean_token_accuracy": 0.7934903740882874, "num_tokens": 395503645.0, "step": 38110 }, { "entropy": 0.6604760318994523, "epoch": 0.30496, "grad_norm": 3.5090320110321045, "learning_rate": 3.4766306522609045e-05, "loss": 0.6516, "mean_token_accuracy": 0.8134288847446441, "num_tokens": 395580256.0, "step": 38120 }, { "entropy": 0.6724536538124084, "epoch": 0.30504, "grad_norm": 1.879382848739624, "learning_rate": 3.476230492196879e-05, "loss": 0.664, "mean_token_accuracy": 0.8125966012477874, "num_tokens": 395672400.0, "step": 38130 }, { "entropy": 0.6714505732059479, "epoch": 0.30512, "grad_norm": 2.557328462600708, "learning_rate": 3.475830332132853e-05, "loss": 0.6737, "mean_token_accuracy": 0.7943371534347534, "num_tokens": 395819328.0, "step": 38140 }, { "entropy": 0.7142693936824799, "epoch": 0.3052, "grad_norm": 5.15997314453125, "learning_rate": 3.4754301720688276e-05, "loss": 0.7187, "mean_token_accuracy": 0.8096964240074158, "num_tokens": 395862057.0, "step": 38150 }, { "entropy": 0.6168624222278595, "epoch": 0.30528, "grad_norm": 1.4707105159759521, "learning_rate": 3.475030012004802e-05, "loss": 0.6184, "mean_token_accuracy": 0.8017342448234558, "num_tokens": 396025897.0, "step": 38160 }, { "entropy": 0.6952254831790924, "epoch": 0.30536, "grad_norm": 3.688607692718506, "learning_rate": 3.474629851940776e-05, "loss": 0.6791, "mean_token_accuracy": 0.8011538505554199, "num_tokens": 396116773.0, "step": 38170 }, { "entropy": 0.6804721832275391, "epoch": 0.30544, "grad_norm": 1.838581919670105, "learning_rate": 3.474229691876751e-05, "loss": 0.6827, "mean_token_accuracy": 0.802372270822525, "num_tokens": 396210867.0, "step": 38180 }, { "entropy": 0.6626975536346436, "epoch": 0.30552, "grad_norm": 2.2460978031158447, "learning_rate": 3.473829531812725e-05, "loss": 0.6669, "mean_token_accuracy": 0.7962694883346557, "num_tokens": 396354496.0, "step": 38190 }, { "entropy": 0.6410064101219177, "epoch": 0.3056, "grad_norm": 4.417910099029541, "learning_rate": 3.4734293717487e-05, "loss": 0.6317, "mean_token_accuracy": 0.8258972525596618, "num_tokens": 396394614.0, "step": 38200 }, { "entropy": 0.6476796180009842, "epoch": 0.30568, "grad_norm": 2.554677963256836, "learning_rate": 3.473029211684674e-05, "loss": 0.6485, "mean_token_accuracy": 0.7968917906284332, "num_tokens": 396558454.0, "step": 38210 }, { "entropy": 0.6119692981243133, "epoch": 0.30576, "grad_norm": 3.113043785095215, "learning_rate": 3.472629051620648e-05, "loss": 0.6096, "mean_token_accuracy": 0.8188958704471588, "num_tokens": 396645140.0, "step": 38220 }, { "entropy": 0.7339744567871094, "epoch": 0.30584, "grad_norm": 1.9273070096969604, "learning_rate": 3.4722288915566226e-05, "loss": 0.7211, "mean_token_accuracy": 0.7930626988410949, "num_tokens": 396739787.0, "step": 38230 }, { "entropy": 0.6906464993953705, "epoch": 0.30592, "grad_norm": 2.361405372619629, "learning_rate": 3.4718287314925976e-05, "loss": 0.6892, "mean_token_accuracy": 0.7845626652240754, "num_tokens": 396885712.0, "step": 38240 }, { "entropy": 0.6502999544143677, "epoch": 0.306, "grad_norm": 4.305227279663086, "learning_rate": 3.471428571428571e-05, "loss": 0.6348, "mean_token_accuracy": 0.8229206800460815, "num_tokens": 396931536.0, "step": 38250 }, { "entropy": 0.6125786006450653, "epoch": 0.30608, "grad_norm": 2.669834613800049, "learning_rate": 3.471028411364546e-05, "loss": 0.6125, "mean_token_accuracy": 0.8026196956634521, "num_tokens": 397095376.0, "step": 38260 }, { "entropy": 0.6747897565364838, "epoch": 0.30616, "grad_norm": 2.98551869392395, "learning_rate": 3.470628251300521e-05, "loss": 0.6726, "mean_token_accuracy": 0.8074978649616241, "num_tokens": 397177779.0, "step": 38270 }, { "entropy": 0.7172129988670349, "epoch": 0.30624, "grad_norm": 2.0520551204681396, "learning_rate": 3.470228091236495e-05, "loss": 0.7163, "mean_token_accuracy": 0.79699986577034, "num_tokens": 397271664.0, "step": 38280 }, { "entropy": 0.7529942333698273, "epoch": 0.30632, "grad_norm": 2.3951456546783447, "learning_rate": 3.469827931172469e-05, "loss": 0.746, "mean_token_accuracy": 0.782472413778305, "num_tokens": 397399484.0, "step": 38290 }, { "entropy": 0.666222208738327, "epoch": 0.3064, "grad_norm": 4.773218631744385, "learning_rate": 3.469427771108443e-05, "loss": 0.6549, "mean_token_accuracy": 0.8241872608661651, "num_tokens": 397438229.0, "step": 38300 }, { "entropy": 0.6677520275115967, "epoch": 0.30648, "grad_norm": 2.1639180183410645, "learning_rate": 3.469027611044418e-05, "loss": 0.6673, "mean_token_accuracy": 0.7886803567409515, "num_tokens": 397601763.0, "step": 38310 }, { "entropy": 0.6643915235996246, "epoch": 0.30656, "grad_norm": 3.2462918758392334, "learning_rate": 3.4686274509803926e-05, "loss": 0.6598, "mean_token_accuracy": 0.8098173141479492, "num_tokens": 397685909.0, "step": 38320 }, { "entropy": 0.7504390954971314, "epoch": 0.30664, "grad_norm": 1.7505744695663452, "learning_rate": 3.468227290916366e-05, "loss": 0.7635, "mean_token_accuracy": 0.7872878968715668, "num_tokens": 397780176.0, "step": 38330 }, { "entropy": 0.7074510216712951, "epoch": 0.30672, "grad_norm": 2.510253667831421, "learning_rate": 3.467827130852341e-05, "loss": 0.7049, "mean_token_accuracy": 0.7859967648983002, "num_tokens": 397919852.0, "step": 38340 }, { "entropy": 0.6907450079917907, "epoch": 0.3068, "grad_norm": 4.301056385040283, "learning_rate": 3.467426970788316e-05, "loss": 0.6762, "mean_token_accuracy": 0.816523402929306, "num_tokens": 397960996.0, "step": 38350 }, { "entropy": 0.6556612074375152, "epoch": 0.30688, "grad_norm": 1.9437406063079834, "learning_rate": 3.46702681072429e-05, "loss": 0.6628, "mean_token_accuracy": 0.7963407754898071, "num_tokens": 398121953.0, "step": 38360 }, { "entropy": 0.6760665595531463, "epoch": 0.30696, "grad_norm": 2.7160754203796387, "learning_rate": 3.466626650660264e-05, "loss": 0.6599, "mean_token_accuracy": 0.8122835457324982, "num_tokens": 398196210.0, "step": 38370 }, { "entropy": 0.680410361289978, "epoch": 0.30704, "grad_norm": 1.7553951740264893, "learning_rate": 3.466226490596239e-05, "loss": 0.6879, "mean_token_accuracy": 0.8089561223983764, "num_tokens": 398290490.0, "step": 38380 }, { "entropy": 0.7064904689788818, "epoch": 0.30712, "grad_norm": 2.172271728515625, "learning_rate": 3.465826330532213e-05, "loss": 0.7031, "mean_token_accuracy": 0.7853672564029693, "num_tokens": 398438771.0, "step": 38390 }, { "entropy": 0.688374787569046, "epoch": 0.3072, "grad_norm": 4.551111698150635, "learning_rate": 3.4654261704681875e-05, "loss": 0.6801, "mean_token_accuracy": 0.8144954562187194, "num_tokens": 398481059.0, "step": 38400 }, { "entropy": 0.6468261778354645, "epoch": 0.30728, "grad_norm": 2.4598302841186523, "learning_rate": 3.465026010404162e-05, "loss": 0.6509, "mean_token_accuracy": 0.7952829122543335, "num_tokens": 398643605.0, "step": 38410 }, { "entropy": 0.7048883676528931, "epoch": 0.30736, "grad_norm": 3.3402843475341797, "learning_rate": 3.464625850340136e-05, "loss": 0.6916, "mean_token_accuracy": 0.8033179938793182, "num_tokens": 398718809.0, "step": 38420 }, { "entropy": 0.6786449730396271, "epoch": 0.30744, "grad_norm": 1.5020530223846436, "learning_rate": 3.4642256902761107e-05, "loss": 0.684, "mean_token_accuracy": 0.8072527527809144, "num_tokens": 398812426.0, "step": 38430 }, { "entropy": 0.665928202867508, "epoch": 0.30752, "grad_norm": 2.9341814517974854, "learning_rate": 3.463825530212085e-05, "loss": 0.6615, "mean_token_accuracy": 0.7964197933673859, "num_tokens": 398957192.0, "step": 38440 }, { "entropy": 0.6597848534584045, "epoch": 0.3076, "grad_norm": 4.423653602600098, "learning_rate": 3.4634253701480594e-05, "loss": 0.66, "mean_token_accuracy": 0.8179219841957093, "num_tokens": 398996915.0, "step": 38450 }, { "entropy": 0.7030025601387024, "epoch": 0.30768, "grad_norm": 1.625704050064087, "learning_rate": 3.463025210084034e-05, "loss": 0.6945, "mean_token_accuracy": 0.7823608219623566, "num_tokens": 399160501.0, "step": 38460 }, { "entropy": 0.7563637614250183, "epoch": 0.30776, "grad_norm": 3.7412126064300537, "learning_rate": 3.462625050020008e-05, "loss": 0.7625, "mean_token_accuracy": 0.7863653004169464, "num_tokens": 399233495.0, "step": 38470 }, { "entropy": 0.6981803774833679, "epoch": 0.30784, "grad_norm": 2.146008253097534, "learning_rate": 3.4622248899559825e-05, "loss": 0.6951, "mean_token_accuracy": 0.8007683515548706, "num_tokens": 399325918.0, "step": 38480 }, { "entropy": 0.7007264614105224, "epoch": 0.30792, "grad_norm": 2.016644239425659, "learning_rate": 3.461824729891957e-05, "loss": 0.7063, "mean_token_accuracy": 0.7824141502380371, "num_tokens": 399471813.0, "step": 38490 }, { "entropy": 0.709452110528946, "epoch": 0.308, "grad_norm": 4.478856086730957, "learning_rate": 3.461424569827931e-05, "loss": 0.7035, "mean_token_accuracy": 0.8113521456718444, "num_tokens": 399514753.0, "step": 38500 }, { "entropy": 0.6618081331253052, "epoch": 0.30808, "grad_norm": 1.9494622945785522, "learning_rate": 3.4610244097639056e-05, "loss": 0.6597, "mean_token_accuracy": 0.7918266713619232, "num_tokens": 399678261.0, "step": 38510 }, { "entropy": 0.6199714034795761, "epoch": 0.30816, "grad_norm": 3.749314308166504, "learning_rate": 3.46062424969988e-05, "loss": 0.6049, "mean_token_accuracy": 0.8262060821056366, "num_tokens": 399753776.0, "step": 38520 }, { "entropy": 0.7117061674594879, "epoch": 0.30824, "grad_norm": 1.2983605861663818, "learning_rate": 3.4602240896358544e-05, "loss": 0.7104, "mean_token_accuracy": 0.8024856567382812, "num_tokens": 399845405.0, "step": 38530 }, { "entropy": 0.6522516310214996, "epoch": 0.30832, "grad_norm": 2.9289262294769287, "learning_rate": 3.459823929571829e-05, "loss": 0.6567, "mean_token_accuracy": 0.7979038417339325, "num_tokens": 399981778.0, "step": 38540 }, { "entropy": 0.695692366361618, "epoch": 0.3084, "grad_norm": 3.8745527267456055, "learning_rate": 3.459423769507804e-05, "loss": 0.6883, "mean_token_accuracy": 0.8121979892253876, "num_tokens": 400019613.0, "step": 38550 }, { "entropy": 0.6909517168998718, "epoch": 0.30848, "grad_norm": 1.8751921653747559, "learning_rate": 3.4590236094437775e-05, "loss": 0.6876, "mean_token_accuracy": 0.7865351796150207, "num_tokens": 400183453.0, "step": 38560 }, { "entropy": 0.5925982058048248, "epoch": 0.30856, "grad_norm": 3.1780338287353516, "learning_rate": 3.458623449379752e-05, "loss": 0.5887, "mean_token_accuracy": 0.8239918291568756, "num_tokens": 400284192.0, "step": 38570 }, { "entropy": 0.7205361187458038, "epoch": 0.30864, "grad_norm": 1.3956056833267212, "learning_rate": 3.458223289315726e-05, "loss": 0.7299, "mean_token_accuracy": 0.7917815804481506, "num_tokens": 400379724.0, "step": 38580 }, { "entropy": 0.6802339375019073, "epoch": 0.30872, "grad_norm": 3.258643627166748, "learning_rate": 3.457823129251701e-05, "loss": 0.6696, "mean_token_accuracy": 0.7938243806362152, "num_tokens": 400514409.0, "step": 38590 }, { "entropy": 0.7261753469705582, "epoch": 0.3088, "grad_norm": 5.167269229888916, "learning_rate": 3.457422969187675e-05, "loss": 0.7194, "mean_token_accuracy": 0.8137327075004578, "num_tokens": 400550629.0, "step": 38600 }, { "entropy": 0.6653572261333466, "epoch": 0.30888, "grad_norm": 1.8301273584365845, "learning_rate": 3.4570228091236493e-05, "loss": 0.6687, "mean_token_accuracy": 0.7894968271255494, "num_tokens": 400714469.0, "step": 38610 }, { "entropy": 0.684073007106781, "epoch": 0.30896, "grad_norm": 3.1600794792175293, "learning_rate": 3.456622649059624e-05, "loss": 0.6703, "mean_token_accuracy": 0.8069852948188782, "num_tokens": 400808370.0, "step": 38620 }, { "entropy": 0.7191279470920563, "epoch": 0.30904, "grad_norm": 1.782247543334961, "learning_rate": 3.456222488995599e-05, "loss": 0.7242, "mean_token_accuracy": 0.7941720306873321, "num_tokens": 400904328.0, "step": 38630 }, { "entropy": 0.7215833485126495, "epoch": 0.30912, "grad_norm": 2.1650798320770264, "learning_rate": 3.4558223289315725e-05, "loss": 0.7119, "mean_token_accuracy": 0.7839128375053406, "num_tokens": 401050075.0, "step": 38640 }, { "entropy": 0.7152044534683227, "epoch": 0.3092, "grad_norm": 4.920242786407471, "learning_rate": 3.455422168867547e-05, "loss": 0.7004, "mean_token_accuracy": 0.8107025861740113, "num_tokens": 401094424.0, "step": 38650 }, { "entropy": 0.664632648229599, "epoch": 0.30928, "grad_norm": 2.6268272399902344, "learning_rate": 3.455022008803522e-05, "loss": 0.6654, "mean_token_accuracy": 0.7938812971115112, "num_tokens": 401258264.0, "step": 38660 }, { "entropy": 0.6923922240734101, "epoch": 0.30936, "grad_norm": 3.2010157108306885, "learning_rate": 3.454621848739496e-05, "loss": 0.6868, "mean_token_accuracy": 0.8066497147083282, "num_tokens": 401346997.0, "step": 38670 }, { "entropy": 0.6896301746368408, "epoch": 0.30944, "grad_norm": 1.8251736164093018, "learning_rate": 3.45422168867547e-05, "loss": 0.6851, "mean_token_accuracy": 0.8032917022705078, "num_tokens": 401439439.0, "step": 38680 }, { "entropy": 0.699242377281189, "epoch": 0.30952, "grad_norm": 2.956483840942383, "learning_rate": 3.453821528611444e-05, "loss": 0.6915, "mean_token_accuracy": 0.7895913362503052, "num_tokens": 401574253.0, "step": 38690 }, { "entropy": 0.6644916713237763, "epoch": 0.3096, "grad_norm": 5.244635581970215, "learning_rate": 3.4534213685474194e-05, "loss": 0.667, "mean_token_accuracy": 0.8155540764331818, "num_tokens": 401611874.0, "step": 38700 }, { "entropy": 0.6901292383670807, "epoch": 0.30968, "grad_norm": 2.2492623329162598, "learning_rate": 3.453021208483394e-05, "loss": 0.6894, "mean_token_accuracy": 0.787849235534668, "num_tokens": 401773866.0, "step": 38710 }, { "entropy": 0.658293092250824, "epoch": 0.30976, "grad_norm": 3.7662715911865234, "learning_rate": 3.4526210484193674e-05, "loss": 0.6618, "mean_token_accuracy": 0.8145524203777313, "num_tokens": 401844199.0, "step": 38720 }, { "entropy": 0.6572319239377975, "epoch": 0.30984, "grad_norm": 1.7092790603637695, "learning_rate": 3.4522208883553425e-05, "loss": 0.6599, "mean_token_accuracy": 0.8063091933727264, "num_tokens": 401937907.0, "step": 38730 }, { "entropy": 0.7423109710216522, "epoch": 0.30992, "grad_norm": 2.299698829650879, "learning_rate": 3.451820728291317e-05, "loss": 0.7277, "mean_token_accuracy": 0.7859553039073944, "num_tokens": 402073439.0, "step": 38740 }, { "entropy": 0.6904032677412033, "epoch": 0.31, "grad_norm": 4.422895908355713, "learning_rate": 3.451420568227291e-05, "loss": 0.685, "mean_token_accuracy": 0.8139399528503418, "num_tokens": 402113054.0, "step": 38750 }, { "entropy": 0.6787952303886413, "epoch": 0.31008, "grad_norm": 1.5467339754104614, "learning_rate": 3.451020408163265e-05, "loss": 0.6785, "mean_token_accuracy": 0.7876038193702698, "num_tokens": 402276894.0, "step": 38760 }, { "entropy": 0.693244582414627, "epoch": 0.31016, "grad_norm": 3.650217294692993, "learning_rate": 3.45062024809924e-05, "loss": 0.6844, "mean_token_accuracy": 0.8013260900974274, "num_tokens": 402355625.0, "step": 38770 }, { "entropy": 0.6909214496612549, "epoch": 0.31024, "grad_norm": 1.5840431451797485, "learning_rate": 3.450220088035214e-05, "loss": 0.7055, "mean_token_accuracy": 0.8012032330036163, "num_tokens": 402447915.0, "step": 38780 }, { "entropy": 0.720099812746048, "epoch": 0.31032, "grad_norm": 2.0401337146759033, "learning_rate": 3.449819927971189e-05, "loss": 0.7112, "mean_token_accuracy": 0.7869202315807342, "num_tokens": 402595620.0, "step": 38790 }, { "entropy": 0.645037043094635, "epoch": 0.3104, "grad_norm": 4.362630844116211, "learning_rate": 3.449419767907163e-05, "loss": 0.6389, "mean_token_accuracy": 0.8275199174880982, "num_tokens": 402633971.0, "step": 38800 }, { "entropy": 0.6423164308071136, "epoch": 0.31048, "grad_norm": 1.504745602607727, "learning_rate": 3.4490196078431374e-05, "loss": 0.6391, "mean_token_accuracy": 0.7958060145378113, "num_tokens": 402795904.0, "step": 38810 }, { "entropy": 0.7459212183952332, "epoch": 0.31056, "grad_norm": 2.9409127235412598, "learning_rate": 3.448619447779112e-05, "loss": 0.7472, "mean_token_accuracy": 0.796296089887619, "num_tokens": 402860902.0, "step": 38820 }, { "entropy": 0.7501584768295289, "epoch": 0.31064, "grad_norm": 1.9700559377670288, "learning_rate": 3.448219287715086e-05, "loss": 0.7401, "mean_token_accuracy": 0.787831449508667, "num_tokens": 402953998.0, "step": 38830 }, { "entropy": 0.6903954684734345, "epoch": 0.31072, "grad_norm": 3.4681646823883057, "learning_rate": 3.4478191276510606e-05, "loss": 0.6885, "mean_token_accuracy": 0.7901893675327301, "num_tokens": 403098127.0, "step": 38840 }, { "entropy": 0.6558971017599106, "epoch": 0.3108, "grad_norm": 4.973380088806152, "learning_rate": 3.447418967587035e-05, "loss": 0.6525, "mean_token_accuracy": 0.8266383886337281, "num_tokens": 403138891.0, "step": 38850 }, { "entropy": 0.6845178067684173, "epoch": 0.31088, "grad_norm": 2.4442272186279297, "learning_rate": 3.447018807523009e-05, "loss": 0.6765, "mean_token_accuracy": 0.7887335121631622, "num_tokens": 403302731.0, "step": 38860 }, { "entropy": 0.653925547003746, "epoch": 0.31096, "grad_norm": 3.8596436977386475, "learning_rate": 3.446618647458984e-05, "loss": 0.6539, "mean_token_accuracy": 0.8122621774673462, "num_tokens": 403392620.0, "step": 38870 }, { "entropy": 0.6598176956176758, "epoch": 0.31104, "grad_norm": 1.4218308925628662, "learning_rate": 3.446218487394958e-05, "loss": 0.6761, "mean_token_accuracy": 0.8016247451305389, "num_tokens": 403485511.0, "step": 38880 }, { "entropy": 0.8194695115089417, "epoch": 0.31112, "grad_norm": 3.063185214996338, "learning_rate": 3.4458183273309324e-05, "loss": 0.7988, "mean_token_accuracy": 0.7691743493080139, "num_tokens": 403624166.0, "step": 38890 }, { "entropy": 0.7215332925319672, "epoch": 0.3112, "grad_norm": 4.5951008796691895, "learning_rate": 3.445418167266907e-05, "loss": 0.7059, "mean_token_accuracy": 0.8078867733478546, "num_tokens": 403668944.0, "step": 38900 }, { "entropy": 0.6457254111766815, "epoch": 0.31128, "grad_norm": 1.488465428352356, "learning_rate": 3.445018007202881e-05, "loss": 0.6525, "mean_token_accuracy": 0.7945774376392365, "num_tokens": 403832784.0, "step": 38910 }, { "entropy": 0.7035377204418183, "epoch": 0.31136, "grad_norm": 3.4211411476135254, "learning_rate": 3.4446178471388555e-05, "loss": 0.7042, "mean_token_accuracy": 0.7958660125732422, "num_tokens": 403922750.0, "step": 38920 }, { "entropy": 0.7221158862113952, "epoch": 0.31144, "grad_norm": 1.6621942520141602, "learning_rate": 3.44421768707483e-05, "loss": 0.7214, "mean_token_accuracy": 0.7961075246334076, "num_tokens": 404017465.0, "step": 38930 }, { "entropy": 0.7169799923896789, "epoch": 0.31152, "grad_norm": 2.240079402923584, "learning_rate": 3.443817527010805e-05, "loss": 0.7095, "mean_token_accuracy": 0.7795134902000427, "num_tokens": 404156873.0, "step": 38940 }, { "entropy": 0.7322612315416336, "epoch": 0.3116, "grad_norm": 5.0546393394470215, "learning_rate": 3.4434173669467786e-05, "loss": 0.7173, "mean_token_accuracy": 0.8073983490467072, "num_tokens": 404194535.0, "step": 38950 }, { "entropy": 0.6514396250247956, "epoch": 0.31168, "grad_norm": 2.044748067855835, "learning_rate": 3.443017206882753e-05, "loss": 0.655, "mean_token_accuracy": 0.7943453788757324, "num_tokens": 404358375.0, "step": 38960 }, { "entropy": 0.6757202744483948, "epoch": 0.31176, "grad_norm": 2.9021999835968018, "learning_rate": 3.4426170468187274e-05, "loss": 0.6708, "mean_token_accuracy": 0.80681112408638, "num_tokens": 404450514.0, "step": 38970 }, { "entropy": 0.692850524187088, "epoch": 0.31184, "grad_norm": 2.0747287273406982, "learning_rate": 3.4422168867547024e-05, "loss": 0.6939, "mean_token_accuracy": 0.7995562553405762, "num_tokens": 404544110.0, "step": 38980 }, { "entropy": 0.6844036817550659, "epoch": 0.31192, "grad_norm": 2.8139939308166504, "learning_rate": 3.441816726690676e-05, "loss": 0.683, "mean_token_accuracy": 0.7941939175128937, "num_tokens": 404677586.0, "step": 38990 }, { "entropy": 0.6912837147712707, "epoch": 0.312, "grad_norm": 5.42689323425293, "learning_rate": 3.4414165666266505e-05, "loss": 0.6874, "mean_token_accuracy": 0.8181859254837036, "num_tokens": 404712052.0, "step": 39000 }, { "entropy": 0.6395231485366821, "epoch": 0.31208, "grad_norm": 1.4192770719528198, "learning_rate": 3.4410164065626256e-05, "loss": 0.6369, "mean_token_accuracy": 0.7944125652313232, "num_tokens": 404875892.0, "step": 39010 }, { "entropy": 0.7334727644920349, "epoch": 0.31216, "grad_norm": 3.617680311203003, "learning_rate": 3.4406162464986e-05, "loss": 0.726, "mean_token_accuracy": 0.7936845898628235, "num_tokens": 404961186.0, "step": 39020 }, { "entropy": 0.6352023988962173, "epoch": 0.31224, "grad_norm": 1.6549803018569946, "learning_rate": 3.4402160864345736e-05, "loss": 0.6289, "mean_token_accuracy": 0.813284718990326, "num_tokens": 405055214.0, "step": 39030 }, { "entropy": 0.622663152217865, "epoch": 0.31232, "grad_norm": 1.7933917045593262, "learning_rate": 3.439815926370548e-05, "loss": 0.6299, "mean_token_accuracy": 0.8025583386421203, "num_tokens": 405209767.0, "step": 39040 }, { "entropy": 0.6711731582880021, "epoch": 0.3124, "grad_norm": 3.1898157596588135, "learning_rate": 3.439415766306523e-05, "loss": 0.6424, "mean_token_accuracy": 0.8219023227691651, "num_tokens": 405261064.0, "step": 39050 }, { "entropy": 0.6705413401126862, "epoch": 0.31248, "grad_norm": 1.5899553298950195, "learning_rate": 3.4390156062424974e-05, "loss": 0.6713, "mean_token_accuracy": 0.7912616074085236, "num_tokens": 405424904.0, "step": 39060 }, { "entropy": 0.6051038771867752, "epoch": 0.31256, "grad_norm": 2.8236172199249268, "learning_rate": 3.438615446178471e-05, "loss": 0.6102, "mean_token_accuracy": 0.8196459472179413, "num_tokens": 405512462.0, "step": 39070 }, { "entropy": 0.7165719747543335, "epoch": 0.31264, "grad_norm": 1.4416108131408691, "learning_rate": 3.438215286114446e-05, "loss": 0.7123, "mean_token_accuracy": 0.7953144490718842, "num_tokens": 405606509.0, "step": 39080 }, { "entropy": 0.7501897513866425, "epoch": 0.31272, "grad_norm": 2.64241886138916, "learning_rate": 3.4378151260504205e-05, "loss": 0.7541, "mean_token_accuracy": 0.7744211077690124, "num_tokens": 405759935.0, "step": 39090 }, { "entropy": 0.6681806832551956, "epoch": 0.3128, "grad_norm": 4.357291221618652, "learning_rate": 3.437414965986395e-05, "loss": 0.6572, "mean_token_accuracy": 0.8195933282375336, "num_tokens": 405806848.0, "step": 39100 }, { "entropy": 0.6896993160247803, "epoch": 0.31288, "grad_norm": 1.7919225692749023, "learning_rate": 3.4370148059223686e-05, "loss": 0.6874, "mean_token_accuracy": 0.7853138744831085, "num_tokens": 405970688.0, "step": 39110 }, { "entropy": 0.672356241941452, "epoch": 0.31296, "grad_norm": 3.038097620010376, "learning_rate": 3.4366146458583436e-05, "loss": 0.6661, "mean_token_accuracy": 0.803856509923935, "num_tokens": 406073856.0, "step": 39120 }, { "entropy": 0.7209257543087005, "epoch": 0.31304, "grad_norm": 2.47175931930542, "learning_rate": 3.436214485794318e-05, "loss": 0.7373, "mean_token_accuracy": 0.7906595528125763, "num_tokens": 406169207.0, "step": 39130 }, { "entropy": 0.7115608990192414, "epoch": 0.31312, "grad_norm": 2.723255157470703, "learning_rate": 3.4358143257302924e-05, "loss": 0.6978, "mean_token_accuracy": 0.7935382962226868, "num_tokens": 406303745.0, "step": 39140 }, { "entropy": 0.6927806735038757, "epoch": 0.3132, "grad_norm": 4.2471723556518555, "learning_rate": 3.435414165666267e-05, "loss": 0.7054, "mean_token_accuracy": 0.8122939050197602, "num_tokens": 406343952.0, "step": 39150 }, { "entropy": 0.6387099385261535, "epoch": 0.31328, "grad_norm": 2.177528142929077, "learning_rate": 3.435014005602241e-05, "loss": 0.6351, "mean_token_accuracy": 0.7973986327648163, "num_tokens": 406507792.0, "step": 39160 }, { "entropy": 0.6827019989490509, "epoch": 0.31336, "grad_norm": 3.841623306274414, "learning_rate": 3.4346138455382155e-05, "loss": 0.6649, "mean_token_accuracy": 0.8091366648674011, "num_tokens": 406593926.0, "step": 39170 }, { "entropy": 0.6583321213722229, "epoch": 0.31344, "grad_norm": 2.82134747505188, "learning_rate": 3.43421368547419e-05, "loss": 0.6795, "mean_token_accuracy": 0.803148889541626, "num_tokens": 406686190.0, "step": 39180 }, { "entropy": 0.7493088603019714, "epoch": 0.31352, "grad_norm": 2.2440037727355957, "learning_rate": 3.433813525410164e-05, "loss": 0.7414, "mean_token_accuracy": 0.7821221470832824, "num_tokens": 406820067.0, "step": 39190 }, { "entropy": 0.7188342303037644, "epoch": 0.3136, "grad_norm": 5.191333770751953, "learning_rate": 3.4334133653461386e-05, "loss": 0.7176, "mean_token_accuracy": 0.8073611676692962, "num_tokens": 406859926.0, "step": 39200 }, { "entropy": 0.6387879073619842, "epoch": 0.31368, "grad_norm": 2.2756664752960205, "learning_rate": 3.433013205282113e-05, "loss": 0.6339, "mean_token_accuracy": 0.8002670705318451, "num_tokens": 407022967.0, "step": 39210 }, { "entropy": 0.7108719706535339, "epoch": 0.31376, "grad_norm": 3.660305976867676, "learning_rate": 3.4326130452180874e-05, "loss": 0.6989, "mean_token_accuracy": 0.7992885291576386, "num_tokens": 407104411.0, "step": 39220 }, { "entropy": 0.6959119141101837, "epoch": 0.31384, "grad_norm": 2.7294816970825195, "learning_rate": 3.432212885154062e-05, "loss": 0.7038, "mean_token_accuracy": 0.7978393614292145, "num_tokens": 407198677.0, "step": 39230 }, { "entropy": 0.6790766060352326, "epoch": 0.31392, "grad_norm": 2.412656307220459, "learning_rate": 3.431812725090036e-05, "loss": 0.6748, "mean_token_accuracy": 0.7957970917224884, "num_tokens": 407330169.0, "step": 39240 }, { "entropy": 0.6903815388679504, "epoch": 0.314, "grad_norm": 3.6998932361602783, "learning_rate": 3.4314125650260105e-05, "loss": 0.6961, "mean_token_accuracy": 0.8147828638553619, "num_tokens": 407370044.0, "step": 39250 }, { "entropy": 0.7210972309112549, "epoch": 0.31408, "grad_norm": 1.4178787469863892, "learning_rate": 3.431012404961985e-05, "loss": 0.7161, "mean_token_accuracy": 0.7805324852466583, "num_tokens": 407533884.0, "step": 39260 }, { "entropy": 0.7072764992713928, "epoch": 0.31416, "grad_norm": 3.3305985927581787, "learning_rate": 3.430612244897959e-05, "loss": 0.7035, "mean_token_accuracy": 0.7994473516941071, "num_tokens": 407623829.0, "step": 39270 }, { "entropy": 0.7153563737869263, "epoch": 0.31424, "grad_norm": 2.0398659706115723, "learning_rate": 3.4302120848339336e-05, "loss": 0.722, "mean_token_accuracy": 0.7958117187023163, "num_tokens": 407718387.0, "step": 39280 }, { "entropy": 0.7097026646137238, "epoch": 0.31432, "grad_norm": 2.172067880630493, "learning_rate": 3.429811924769908e-05, "loss": 0.7027, "mean_token_accuracy": 0.7837118327617645, "num_tokens": 407872008.0, "step": 39290 }, { "entropy": 0.7639414548873902, "epoch": 0.3144, "grad_norm": 6.175773620605469, "learning_rate": 3.429411764705882e-05, "loss": 0.7583, "mean_token_accuracy": 0.8015104949474334, "num_tokens": 407919253.0, "step": 39300 }, { "entropy": 0.6602601170539856, "epoch": 0.31448, "grad_norm": 2.1905171871185303, "learning_rate": 3.429011604641857e-05, "loss": 0.6589, "mean_token_accuracy": 0.79053493142128, "num_tokens": 408083093.0, "step": 39310 }, { "entropy": 0.7276988714933396, "epoch": 0.31456, "grad_norm": 3.380329132080078, "learning_rate": 3.428611444577831e-05, "loss": 0.7126, "mean_token_accuracy": 0.7934292495250702, "num_tokens": 408188909.0, "step": 39320 }, { "entropy": 0.7034312665462494, "epoch": 0.31464, "grad_norm": 2.090177059173584, "learning_rate": 3.428211284513806e-05, "loss": 0.7256, "mean_token_accuracy": 0.7911137700080871, "num_tokens": 408283690.0, "step": 39330 }, { "entropy": 0.7805095434188842, "epoch": 0.31472, "grad_norm": 3.0150253772735596, "learning_rate": 3.42781112444978e-05, "loss": 0.7672, "mean_token_accuracy": 0.7733033418655395, "num_tokens": 408420549.0, "step": 39340 }, { "entropy": 0.648443055152893, "epoch": 0.3148, "grad_norm": 4.535465240478516, "learning_rate": 3.427410964385754e-05, "loss": 0.6275, "mean_token_accuracy": 0.8264647722244263, "num_tokens": 408459446.0, "step": 39350 }, { "entropy": 0.6463352978229523, "epoch": 0.31488, "grad_norm": 1.4557756185531616, "learning_rate": 3.4270108043217286e-05, "loss": 0.6482, "mean_token_accuracy": 0.7953529596328736, "num_tokens": 408623286.0, "step": 39360 }, { "entropy": 0.6892585098743439, "epoch": 0.31496, "grad_norm": 3.1591272354125977, "learning_rate": 3.4266106442577036e-05, "loss": 0.6781, "mean_token_accuracy": 0.8051464676856994, "num_tokens": 408707047.0, "step": 39370 }, { "entropy": 0.6808063328266144, "epoch": 0.31504, "grad_norm": 1.829331874847412, "learning_rate": 3.426210484193677e-05, "loss": 0.6761, "mean_token_accuracy": 0.8083103775978089, "num_tokens": 408799163.0, "step": 39380 }, { "entropy": 0.6767850637435913, "epoch": 0.31512, "grad_norm": 2.5666303634643555, "learning_rate": 3.425810324129652e-05, "loss": 0.672, "mean_token_accuracy": 0.7938725352287292, "num_tokens": 408938182.0, "step": 39390 }, { "entropy": 0.6851371824741364, "epoch": 0.3152, "grad_norm": 4.300533771514893, "learning_rate": 3.425410164065627e-05, "loss": 0.6764, "mean_token_accuracy": 0.8116811633110046, "num_tokens": 408982208.0, "step": 39400 }, { "entropy": 0.6410324692726135, "epoch": 0.31528, "grad_norm": 1.9671976566314697, "learning_rate": 3.425010004001601e-05, "loss": 0.6419, "mean_token_accuracy": 0.7981442809104919, "num_tokens": 409146022.0, "step": 39410 }, { "entropy": 0.6739932298660278, "epoch": 0.31536, "grad_norm": 3.211951732635498, "learning_rate": 3.424609843937575e-05, "loss": 0.6598, "mean_token_accuracy": 0.8104695022106171, "num_tokens": 409229998.0, "step": 39420 }, { "entropy": 0.6875308811664581, "epoch": 0.31544, "grad_norm": 1.8594231605529785, "learning_rate": 3.424209683873549e-05, "loss": 0.7021, "mean_token_accuracy": 0.7961499929428101, "num_tokens": 409324263.0, "step": 39430 }, { "entropy": 0.658226364850998, "epoch": 0.31552, "grad_norm": 3.6229734420776367, "learning_rate": 3.423809523809524e-05, "loss": 0.6533, "mean_token_accuracy": 0.7971680045127869, "num_tokens": 409468446.0, "step": 39440 }, { "entropy": 0.6308485448360444, "epoch": 0.3156, "grad_norm": 5.066274166107178, "learning_rate": 3.4234093637454986e-05, "loss": 0.6217, "mean_token_accuracy": 0.8294447600841522, "num_tokens": 409507186.0, "step": 39450 }, { "entropy": 0.6455936014652253, "epoch": 0.31568, "grad_norm": 1.9298734664916992, "learning_rate": 3.423009203681472e-05, "loss": 0.6427, "mean_token_accuracy": 0.7955239415168762, "num_tokens": 409671026.0, "step": 39460 }, { "entropy": 0.6461831003427505, "epoch": 0.31576, "grad_norm": 2.654059886932373, "learning_rate": 3.422609043617447e-05, "loss": 0.6407, "mean_token_accuracy": 0.8119791507720947, "num_tokens": 409759585.0, "step": 39470 }, { "entropy": 0.7172153711318969, "epoch": 0.31584, "grad_norm": 1.6545922756195068, "learning_rate": 3.422208883553422e-05, "loss": 0.7291, "mean_token_accuracy": 0.7910875976085663, "num_tokens": 409856404.0, "step": 39480 }, { "entropy": 0.682837188243866, "epoch": 0.31592, "grad_norm": 3.3446717262268066, "learning_rate": 3.421808723489396e-05, "loss": 0.6768, "mean_token_accuracy": 0.7950474321842194, "num_tokens": 409985378.0, "step": 39490 }, { "entropy": 0.6675603151321411, "epoch": 0.316, "grad_norm": 4.737468242645264, "learning_rate": 3.42140856342537e-05, "loss": 0.6564, "mean_token_accuracy": 0.8216564774513244, "num_tokens": 410025556.0, "step": 39500 }, { "entropy": 0.6413525462150573, "epoch": 0.31608, "grad_norm": 2.7004189491271973, "learning_rate": 3.421008403361345e-05, "loss": 0.6355, "mean_token_accuracy": 0.7967574596405029, "num_tokens": 410189396.0, "step": 39510 }, { "entropy": 0.7367150962352753, "epoch": 0.31616, "grad_norm": 2.912075996398926, "learning_rate": 3.420608243297319e-05, "loss": 0.7213, "mean_token_accuracy": 0.7939463198184967, "num_tokens": 410279000.0, "step": 39520 }, { "entropy": 0.6670918792486191, "epoch": 0.31624, "grad_norm": 1.500401496887207, "learning_rate": 3.4202080832332935e-05, "loss": 0.6717, "mean_token_accuracy": 0.8076661229133606, "num_tokens": 410371885.0, "step": 39530 }, { "entropy": 0.7065722346305847, "epoch": 0.31632, "grad_norm": 3.635420560836792, "learning_rate": 3.419807923169268e-05, "loss": 0.6984, "mean_token_accuracy": 0.7889763414859772, "num_tokens": 410506864.0, "step": 39540 }, { "entropy": 0.6738668471574784, "epoch": 0.3164, "grad_norm": 5.14093017578125, "learning_rate": 3.419407763105242e-05, "loss": 0.6661, "mean_token_accuracy": 0.8219839811325074, "num_tokens": 410545070.0, "step": 39550 }, { "entropy": 0.7087543427944183, "epoch": 0.31648, "grad_norm": 2.2292752265930176, "learning_rate": 3.4190076030412167e-05, "loss": 0.7077, "mean_token_accuracy": 0.7878724277019501, "num_tokens": 410706020.0, "step": 39560 }, { "entropy": 0.6586189657449723, "epoch": 0.31656, "grad_norm": 3.1041901111602783, "learning_rate": 3.418607442977191e-05, "loss": 0.6393, "mean_token_accuracy": 0.8156343877315522, "num_tokens": 410774173.0, "step": 39570 }, { "entropy": 0.7353340208530426, "epoch": 0.31664, "grad_norm": 3.4737720489501953, "learning_rate": 3.4182072829131654e-05, "loss": 0.7468, "mean_token_accuracy": 0.7897300243377685, "num_tokens": 410867914.0, "step": 39580 }, { "entropy": 0.7498455286026001, "epoch": 0.31672, "grad_norm": 2.993285894393921, "learning_rate": 3.41780712284914e-05, "loss": 0.7441, "mean_token_accuracy": 0.7815604448318482, "num_tokens": 411005645.0, "step": 39590 }, { "entropy": 0.6916905701160431, "epoch": 0.3168, "grad_norm": 4.9019999504089355, "learning_rate": 3.417406962785114e-05, "loss": 0.691, "mean_token_accuracy": 0.8154324173927308, "num_tokens": 411039722.0, "step": 39600 }, { "entropy": 0.6792974054813385, "epoch": 0.31688, "grad_norm": 1.598671555519104, "learning_rate": 3.417006802721089e-05, "loss": 0.6779, "mean_token_accuracy": 0.7879579901695252, "num_tokens": 411203562.0, "step": 39610 }, { "entropy": 0.7149510473012924, "epoch": 0.31696, "grad_norm": 2.8967840671539307, "learning_rate": 3.416606642657063e-05, "loss": 0.7061, "mean_token_accuracy": 0.7980751991271973, "num_tokens": 411295394.0, "step": 39620 }, { "entropy": 0.7024721443653107, "epoch": 0.31704, "grad_norm": 1.94096040725708, "learning_rate": 3.416206482593037e-05, "loss": 0.706, "mean_token_accuracy": 0.7958853244781494, "num_tokens": 411390033.0, "step": 39630 }, { "entropy": 0.6763396263122559, "epoch": 0.31712, "grad_norm": 3.1377172470092773, "learning_rate": 3.4158063225290116e-05, "loss": 0.6681, "mean_token_accuracy": 0.7932274758815765, "num_tokens": 411535549.0, "step": 39640 }, { "entropy": 0.673091596364975, "epoch": 0.3172, "grad_norm": 6.66652250289917, "learning_rate": 3.415406162464987e-05, "loss": 0.6644, "mean_token_accuracy": 0.8173721432685852, "num_tokens": 411575380.0, "step": 39650 }, { "entropy": 0.6719540774822235, "epoch": 0.31728, "grad_norm": 2.1052913665771484, "learning_rate": 3.4150060024009604e-05, "loss": 0.6754, "mean_token_accuracy": 0.7876465559005738, "num_tokens": 411739220.0, "step": 39660 }, { "entropy": 0.7377524167299271, "epoch": 0.31736, "grad_norm": 3.2298974990844727, "learning_rate": 3.414605842336935e-05, "loss": 0.7328, "mean_token_accuracy": 0.7971558451652527, "num_tokens": 411826081.0, "step": 39670 }, { "entropy": 0.7069837868213653, "epoch": 0.31744, "grad_norm": 1.8505953550338745, "learning_rate": 3.41420568227291e-05, "loss": 0.7133, "mean_token_accuracy": 0.7967978835105896, "num_tokens": 411920054.0, "step": 39680 }, { "entropy": 0.7201024115085601, "epoch": 0.31752, "grad_norm": 2.793774366378784, "learning_rate": 3.413805522208884e-05, "loss": 0.7079, "mean_token_accuracy": 0.7869926512241363, "num_tokens": 412045291.0, "step": 39690 }, { "entropy": 0.7019266664981842, "epoch": 0.3176, "grad_norm": 3.9021687507629395, "learning_rate": 3.413405362144858e-05, "loss": 0.6833, "mean_token_accuracy": 0.8191761136054992, "num_tokens": 412082089.0, "step": 39700 }, { "entropy": 0.6187271475791931, "epoch": 0.31768, "grad_norm": 2.369065046310425, "learning_rate": 3.413005202080832e-05, "loss": 0.6313, "mean_token_accuracy": 0.8018991351127625, "num_tokens": 412244670.0, "step": 39710 }, { "entropy": 0.6136825323104859, "epoch": 0.31776, "grad_norm": 3.0034921169281006, "learning_rate": 3.412605042016807e-05, "loss": 0.6, "mean_token_accuracy": 0.8298138737678528, "num_tokens": 412313169.0, "step": 39720 }, { "entropy": 0.6400518953800202, "epoch": 0.31784, "grad_norm": 1.4742883443832397, "learning_rate": 3.4122048819527816e-05, "loss": 0.6387, "mean_token_accuracy": 0.8139857470989227, "num_tokens": 412407429.0, "step": 39730 }, { "entropy": 0.6347264170646667, "epoch": 0.31792, "grad_norm": 2.5462045669555664, "learning_rate": 3.4118047218887553e-05, "loss": 0.6372, "mean_token_accuracy": 0.8044503569602967, "num_tokens": 412535721.0, "step": 39740 }, { "entropy": 0.6562339693307877, "epoch": 0.318, "grad_norm": 6.094049453735352, "learning_rate": 3.41140456182473e-05, "loss": 0.646, "mean_token_accuracy": 0.8243451297283173, "num_tokens": 412571245.0, "step": 39750 }, { "entropy": 0.6289871513843537, "epoch": 0.31808, "grad_norm": 1.7395672798156738, "learning_rate": 3.411004401760705e-05, "loss": 0.615, "mean_token_accuracy": 0.80409135222435, "num_tokens": 412735085.0, "step": 39760 }, { "entropy": 0.6858004868030548, "epoch": 0.31816, "grad_norm": 4.059471130371094, "learning_rate": 3.410604241696679e-05, "loss": 0.6861, "mean_token_accuracy": 0.801856279373169, "num_tokens": 412831570.0, "step": 39770 }, { "entropy": 0.6457832515239715, "epoch": 0.31824, "grad_norm": 1.6032661199569702, "learning_rate": 3.410204081632653e-05, "loss": 0.6448, "mean_token_accuracy": 0.8087230861186981, "num_tokens": 412928111.0, "step": 39780 }, { "entropy": 0.7161656320095062, "epoch": 0.31832, "grad_norm": 2.356139898300171, "learning_rate": 3.409803921568628e-05, "loss": 0.7192, "mean_token_accuracy": 0.7842134296894073, "num_tokens": 413067976.0, "step": 39790 }, { "entropy": 0.6775455176830292, "epoch": 0.3184, "grad_norm": 5.159031391143799, "learning_rate": 3.409403761504602e-05, "loss": 0.6679, "mean_token_accuracy": 0.8206200480461121, "num_tokens": 413111585.0, "step": 39800 }, { "entropy": 0.669771033525467, "epoch": 0.31848, "grad_norm": 2.348501682281494, "learning_rate": 3.4090036014405766e-05, "loss": 0.6613, "mean_token_accuracy": 0.78995481133461, "num_tokens": 413275425.0, "step": 39810 }, { "entropy": 0.6248909115791321, "epoch": 0.31856, "grad_norm": 2.6694271564483643, "learning_rate": 3.40860344137655e-05, "loss": 0.6165, "mean_token_accuracy": 0.8220553517341613, "num_tokens": 413362217.0, "step": 39820 }, { "entropy": 0.680339378118515, "epoch": 0.31864, "grad_norm": 1.8783611059188843, "learning_rate": 3.4082032813125254e-05, "loss": 0.6958, "mean_token_accuracy": 0.7970945417881012, "num_tokens": 413457597.0, "step": 39830 }, { "entropy": 0.7622713029384613, "epoch": 0.31872, "grad_norm": 2.412868022918701, "learning_rate": 3.4078031212485e-05, "loss": 0.7624, "mean_token_accuracy": 0.7762583076953888, "num_tokens": 413596867.0, "step": 39840 }, { "entropy": 0.6412410080432892, "epoch": 0.3188, "grad_norm": 4.14167594909668, "learning_rate": 3.407402961184474e-05, "loss": 0.6315, "mean_token_accuracy": 0.8274441301822663, "num_tokens": 413640181.0, "step": 39850 }, { "entropy": 0.668306690454483, "epoch": 0.31888, "grad_norm": 2.1633009910583496, "learning_rate": 3.4070028011204485e-05, "loss": 0.6694, "mean_token_accuracy": 0.7873473465442657, "num_tokens": 413804021.0, "step": 39860 }, { "entropy": 0.6564489424228668, "epoch": 0.31896, "grad_norm": 3.078000545501709, "learning_rate": 3.406602641056423e-05, "loss": 0.6472, "mean_token_accuracy": 0.8098675966262817, "num_tokens": 413892067.0, "step": 39870 }, { "entropy": 0.7428603410720825, "epoch": 0.31904, "grad_norm": 1.4556081295013428, "learning_rate": 3.406202480992397e-05, "loss": 0.7401, "mean_token_accuracy": 0.7928360819816589, "num_tokens": 413986623.0, "step": 39880 }, { "entropy": 0.6803560197353363, "epoch": 0.31912, "grad_norm": 4.16707181930542, "learning_rate": 3.4058023209283716e-05, "loss": 0.6767, "mean_token_accuracy": 0.7977477252483368, "num_tokens": 414120985.0, "step": 39890 }, { "entropy": 0.7256173610687255, "epoch": 0.3192, "grad_norm": 3.7666220664978027, "learning_rate": 3.405402160864346e-05, "loss": 0.7089, "mean_token_accuracy": 0.8090141892433167, "num_tokens": 414156590.0, "step": 39900 }, { "entropy": 0.6636938929557801, "epoch": 0.31928, "grad_norm": 1.4340592622756958, "learning_rate": 3.40500200080032e-05, "loss": 0.667, "mean_token_accuracy": 0.7894540846347808, "num_tokens": 414320430.0, "step": 39910 }, { "entropy": 0.6166272014379501, "epoch": 0.31936, "grad_norm": 3.6137239933013916, "learning_rate": 3.404601840736295e-05, "loss": 0.6152, "mean_token_accuracy": 0.8163971304893494, "num_tokens": 414410138.0, "step": 39920 }, { "entropy": 0.7052731454372406, "epoch": 0.31944, "grad_norm": 1.8505604267120361, "learning_rate": 3.404201680672269e-05, "loss": 0.6973, "mean_token_accuracy": 0.7992993652820587, "num_tokens": 414504307.0, "step": 39930 }, { "entropy": 0.656644469499588, "epoch": 0.31952, "grad_norm": 1.9643676280975342, "learning_rate": 3.4038015206082434e-05, "loss": 0.6634, "mean_token_accuracy": 0.7932905495166779, "num_tokens": 414650673.0, "step": 39940 }, { "entropy": 0.7036508440971374, "epoch": 0.3196, "grad_norm": 5.238829135894775, "learning_rate": 3.403401360544218e-05, "loss": 0.6933, "mean_token_accuracy": 0.8146582007408142, "num_tokens": 414689234.0, "step": 39950 }, { "entropy": 0.7361275315284729, "epoch": 0.31968, "grad_norm": 1.8802419900894165, "learning_rate": 3.403001200480192e-05, "loss": 0.7362, "mean_token_accuracy": 0.7759281933307648, "num_tokens": 414853074.0, "step": 39960 }, { "entropy": 0.7147104263305664, "epoch": 0.31976, "grad_norm": 3.4240291118621826, "learning_rate": 3.4026010404161666e-05, "loss": 0.7189, "mean_token_accuracy": 0.7975272595882416, "num_tokens": 414937995.0, "step": 39970 }, { "entropy": 0.7422609090805053, "epoch": 0.31984, "grad_norm": 1.6989949941635132, "learning_rate": 3.402200880352141e-05, "loss": 0.7439, "mean_token_accuracy": 0.7858126342296601, "num_tokens": 415030993.0, "step": 39980 }, { "entropy": 0.6784637153148652, "epoch": 0.31992, "grad_norm": 2.2769694328308105, "learning_rate": 3.401800720288115e-05, "loss": 0.6702, "mean_token_accuracy": 0.7902435302734375, "num_tokens": 415179906.0, "step": 39990 }, { "entropy": 0.7620101809501648, "epoch": 0.32, "grad_norm": 4.673679351806641, "learning_rate": 3.4014005602240904e-05, "loss": 0.742, "mean_token_accuracy": 0.8019669592380524, "num_tokens": 415222347.0, "step": 40000 }, { "entropy": 0.6456586360931397, "epoch": 0.32008, "grad_norm": 2.320478677749634, "learning_rate": 3.401000400160064e-05, "loss": 0.6519, "mean_token_accuracy": 0.7955727934837341, "num_tokens": 415386187.0, "step": 40010 }, { "entropy": 0.6637061774730683, "epoch": 0.32016, "grad_norm": 2.3741507530212402, "learning_rate": 3.4006002400960384e-05, "loss": 0.6533, "mean_token_accuracy": 0.8057034313678741, "num_tokens": 415488326.0, "step": 40020 }, { "entropy": 0.7313696384429932, "epoch": 0.32024, "grad_norm": 1.4309407472610474, "learning_rate": 3.400200080032013e-05, "loss": 0.7242, "mean_token_accuracy": 0.7954869627952575, "num_tokens": 415585691.0, "step": 40030 }, { "entropy": 0.6795583605766297, "epoch": 0.32032, "grad_norm": 3.6423134803771973, "learning_rate": 3.399799919967988e-05, "loss": 0.6765, "mean_token_accuracy": 0.8010259747505188, "num_tokens": 415713463.0, "step": 40040 }, { "entropy": 0.6371319532394409, "epoch": 0.3204, "grad_norm": 5.737539291381836, "learning_rate": 3.3993997599039615e-05, "loss": 0.6368, "mean_token_accuracy": 0.8291005432605744, "num_tokens": 415751691.0, "step": 40050 }, { "entropy": 0.6110260188579559, "epoch": 0.32048, "grad_norm": 1.751551866531372, "learning_rate": 3.398999599839936e-05, "loss": 0.6091, "mean_token_accuracy": 0.8031570553779602, "num_tokens": 415915531.0, "step": 40060 }, { "entropy": 0.6948411166667938, "epoch": 0.32056, "grad_norm": 3.1092429161071777, "learning_rate": 3.398599439775911e-05, "loss": 0.6929, "mean_token_accuracy": 0.8022210419178009, "num_tokens": 416006854.0, "step": 40070 }, { "entropy": 0.690678596496582, "epoch": 0.32064, "grad_norm": 2.6734650135040283, "learning_rate": 3.398199279711885e-05, "loss": 0.6768, "mean_token_accuracy": 0.8058981955051422, "num_tokens": 416100086.0, "step": 40080 }, { "entropy": 0.7099099218845367, "epoch": 0.32072, "grad_norm": 2.481213092803955, "learning_rate": 3.397799119647859e-05, "loss": 0.7051, "mean_token_accuracy": 0.7847382724285126, "num_tokens": 416244746.0, "step": 40090 }, { "entropy": 0.641865810751915, "epoch": 0.3208, "grad_norm": 5.254179000854492, "learning_rate": 3.3973989595838334e-05, "loss": 0.6445, "mean_token_accuracy": 0.8231307685375213, "num_tokens": 416287037.0, "step": 40100 }, { "entropy": 0.6426762342453003, "epoch": 0.32088, "grad_norm": 1.959864616394043, "learning_rate": 3.3969987995198084e-05, "loss": 0.6386, "mean_token_accuracy": 0.7980153977870941, "num_tokens": 416450877.0, "step": 40110 }, { "entropy": 0.6851043492555619, "epoch": 0.32096, "grad_norm": 3.0970349311828613, "learning_rate": 3.396598639455783e-05, "loss": 0.6796, "mean_token_accuracy": 0.8075416386127472, "num_tokens": 416535221.0, "step": 40120 }, { "entropy": 0.6462158560752869, "epoch": 0.32104, "grad_norm": 2.059194564819336, "learning_rate": 3.3961984793917565e-05, "loss": 0.6639, "mean_token_accuracy": 0.8088313341140747, "num_tokens": 416629474.0, "step": 40130 }, { "entropy": 0.6950319886207581, "epoch": 0.32112, "grad_norm": 2.999519109725952, "learning_rate": 3.3957983193277316e-05, "loss": 0.6918, "mean_token_accuracy": 0.7893794655799866, "num_tokens": 416768810.0, "step": 40140 }, { "entropy": 0.7184281080961228, "epoch": 0.3212, "grad_norm": 4.862732410430908, "learning_rate": 3.395398159263706e-05, "loss": 0.7106, "mean_token_accuracy": 0.8087545514106751, "num_tokens": 416804497.0, "step": 40150 }, { "entropy": 0.6696970999240875, "epoch": 0.32128, "grad_norm": 2.1408252716064453, "learning_rate": 3.39499799919968e-05, "loss": 0.6709, "mean_token_accuracy": 0.787424874305725, "num_tokens": 416967773.0, "step": 40160 }, { "entropy": 0.669449970126152, "epoch": 0.32136, "grad_norm": 3.7743475437164307, "learning_rate": 3.394597839135654e-05, "loss": 0.6674, "mean_token_accuracy": 0.8062534391880035, "num_tokens": 417043330.0, "step": 40170 }, { "entropy": 0.7829221785068512, "epoch": 0.32144, "grad_norm": 1.8719289302825928, "learning_rate": 3.394197679071629e-05, "loss": 0.7829, "mean_token_accuracy": 0.7833808600902558, "num_tokens": 417135835.0, "step": 40180 }, { "entropy": 0.6427278041839599, "epoch": 0.32152, "grad_norm": 3.2317254543304443, "learning_rate": 3.3937975190076034e-05, "loss": 0.6409, "mean_token_accuracy": 0.8014158487319947, "num_tokens": 417275292.0, "step": 40190 }, { "entropy": 0.6622294127941132, "epoch": 0.3216, "grad_norm": 4.39084005355835, "learning_rate": 3.393397358943578e-05, "loss": 0.6467, "mean_token_accuracy": 0.8250134468078614, "num_tokens": 417314483.0, "step": 40200 }, { "entropy": 0.6016745746135712, "epoch": 0.32168, "grad_norm": 1.4034746885299683, "learning_rate": 3.392997198879552e-05, "loss": 0.598, "mean_token_accuracy": 0.8052576959133149, "num_tokens": 417478323.0, "step": 40210 }, { "entropy": 0.7359448134899139, "epoch": 0.32176, "grad_norm": 3.3664498329162598, "learning_rate": 3.3925970388155265e-05, "loss": 0.7396, "mean_token_accuracy": 0.7908240616321563, "num_tokens": 417564806.0, "step": 40220 }, { "entropy": 0.707502955198288, "epoch": 0.32184, "grad_norm": 2.0524165630340576, "learning_rate": 3.392196878751501e-05, "loss": 0.6898, "mean_token_accuracy": 0.8039863586425782, "num_tokens": 417658603.0, "step": 40230 }, { "entropy": 0.7068132102489472, "epoch": 0.32192, "grad_norm": 2.48663592338562, "learning_rate": 3.391796718687475e-05, "loss": 0.6947, "mean_token_accuracy": 0.7856737792491912, "num_tokens": 417803856.0, "step": 40240 }, { "entropy": 0.6028948307037354, "epoch": 0.322, "grad_norm": 5.727987766265869, "learning_rate": 3.3913965586234496e-05, "loss": 0.5991, "mean_token_accuracy": 0.8358020544052124, "num_tokens": 417845899.0, "step": 40250 }, { "entropy": 0.6981201887130737, "epoch": 0.32208, "grad_norm": 1.704827904701233, "learning_rate": 3.390996398559424e-05, "loss": 0.7032, "mean_token_accuracy": 0.7867321372032166, "num_tokens": 418009156.0, "step": 40260 }, { "entropy": 0.6410729914903641, "epoch": 0.32216, "grad_norm": 3.1239006519317627, "learning_rate": 3.3905962384953984e-05, "loss": 0.6304, "mean_token_accuracy": 0.8183893203735352, "num_tokens": 418090109.0, "step": 40270 }, { "entropy": 0.7537693202495575, "epoch": 0.32224, "grad_norm": 2.289443254470825, "learning_rate": 3.390196078431373e-05, "loss": 0.7721, "mean_token_accuracy": 0.7900238513946534, "num_tokens": 418184379.0, "step": 40280 }, { "entropy": 0.7042227655649185, "epoch": 0.32232, "grad_norm": 2.774383068084717, "learning_rate": 3.389795918367347e-05, "loss": 0.6885, "mean_token_accuracy": 0.7913788020610809, "num_tokens": 418307979.0, "step": 40290 }, { "entropy": 0.7676950633525849, "epoch": 0.3224, "grad_norm": 4.909606456756592, "learning_rate": 3.3893957583033215e-05, "loss": 0.7652, "mean_token_accuracy": 0.8003073930740356, "num_tokens": 418341491.0, "step": 40300 }, { "entropy": 0.6735545635223389, "epoch": 0.32248, "grad_norm": 1.88327157497406, "learning_rate": 3.388995598239296e-05, "loss": 0.6731, "mean_token_accuracy": 0.7867017328739166, "num_tokens": 418504844.0, "step": 40310 }, { "entropy": 0.6358257055282592, "epoch": 0.32256, "grad_norm": 2.8200113773345947, "learning_rate": 3.38859543817527e-05, "loss": 0.6174, "mean_token_accuracy": 0.8221112787723541, "num_tokens": 418582068.0, "step": 40320 }, { "entropy": 0.6772503197193146, "epoch": 0.32264, "grad_norm": 2.9704809188842773, "learning_rate": 3.3881952781112446e-05, "loss": 0.6846, "mean_token_accuracy": 0.806055623292923, "num_tokens": 418677081.0, "step": 40330 }, { "entropy": 0.6942532539367676, "epoch": 0.32272, "grad_norm": 1.8937300443649292, "learning_rate": 3.387795118047219e-05, "loss": 0.6896, "mean_token_accuracy": 0.7893504440784455, "num_tokens": 418819566.0, "step": 40340 }, { "entropy": 0.6484692871570588, "epoch": 0.3228, "grad_norm": 4.964309215545654, "learning_rate": 3.3873949579831933e-05, "loss": 0.6342, "mean_token_accuracy": 0.8210095465183258, "num_tokens": 418864864.0, "step": 40350 }, { "entropy": 0.63529132604599, "epoch": 0.32288, "grad_norm": 2.2236058712005615, "learning_rate": 3.386994797919168e-05, "loss": 0.6348, "mean_token_accuracy": 0.7966719686985015, "num_tokens": 419028704.0, "step": 40360 }, { "entropy": 0.6230601966381073, "epoch": 0.32296, "grad_norm": 4.291067123413086, "learning_rate": 3.386594637855142e-05, "loss": 0.6153, "mean_token_accuracy": 0.8240258693695068, "num_tokens": 419110767.0, "step": 40370 }, { "entropy": 0.6253087043762207, "epoch": 0.32304, "grad_norm": 1.2928718328475952, "learning_rate": 3.3861944777911165e-05, "loss": 0.6303, "mean_token_accuracy": 0.8132668137550354, "num_tokens": 419205724.0, "step": 40380 }, { "entropy": 0.7542495787143707, "epoch": 0.32312, "grad_norm": 2.4906699657440186, "learning_rate": 3.3857943177270915e-05, "loss": 0.7539, "mean_token_accuracy": 0.7754350900650024, "num_tokens": 419342619.0, "step": 40390 }, { "entropy": 0.6986195623874665, "epoch": 0.3232, "grad_norm": 4.737195014953613, "learning_rate": 3.385394157663065e-05, "loss": 0.6928, "mean_token_accuracy": 0.8125240087509156, "num_tokens": 419379894.0, "step": 40400 }, { "entropy": 0.6514365911483765, "epoch": 0.32328, "grad_norm": 1.8703820705413818, "learning_rate": 3.3849939975990396e-05, "loss": 0.651, "mean_token_accuracy": 0.7918539404869079, "num_tokens": 419543734.0, "step": 40410 }, { "entropy": 0.6406893223524094, "epoch": 0.32336, "grad_norm": 3.3927862644195557, "learning_rate": 3.384593837535014e-05, "loss": 0.6315, "mean_token_accuracy": 0.8123408317565918, "num_tokens": 419630621.0, "step": 40420 }, { "entropy": 0.6819296240806579, "epoch": 0.32344, "grad_norm": 1.6614633798599243, "learning_rate": 3.384193677470989e-05, "loss": 0.6768, "mean_token_accuracy": 0.8042268335819245, "num_tokens": 419724580.0, "step": 40430 }, { "entropy": 0.6847860872745514, "epoch": 0.32352, "grad_norm": 2.9844343662261963, "learning_rate": 3.383793517406963e-05, "loss": 0.6851, "mean_token_accuracy": 0.7907501220703125, "num_tokens": 419863713.0, "step": 40440 }, { "entropy": 0.7695650577545166, "epoch": 0.3236, "grad_norm": 5.450088977813721, "learning_rate": 3.383393357342937e-05, "loss": 0.7481, "mean_token_accuracy": 0.8010634005069732, "num_tokens": 419904534.0, "step": 40450 }, { "entropy": 0.611090362071991, "epoch": 0.32368, "grad_norm": 1.6194449663162231, "learning_rate": 3.382993197278912e-05, "loss": 0.6134, "mean_token_accuracy": 0.8057646155357361, "num_tokens": 420068369.0, "step": 40460 }, { "entropy": 0.773590949177742, "epoch": 0.32376, "grad_norm": 3.5249671936035156, "learning_rate": 3.3825930372148865e-05, "loss": 0.7634, "mean_token_accuracy": 0.7864337384700775, "num_tokens": 420153807.0, "step": 40470 }, { "entropy": 0.6996571719646454, "epoch": 0.32384, "grad_norm": 1.3032774925231934, "learning_rate": 3.38219287715086e-05, "loss": 0.6942, "mean_token_accuracy": 0.8005182385444641, "num_tokens": 420247799.0, "step": 40480 }, { "entropy": 0.7003549098968506, "epoch": 0.32392, "grad_norm": 2.9112653732299805, "learning_rate": 3.3817927170868345e-05, "loss": 0.6942, "mean_token_accuracy": 0.7892727553844452, "num_tokens": 420384657.0, "step": 40490 }, { "entropy": 0.7454461991786957, "epoch": 0.324, "grad_norm": 4.4030070304870605, "learning_rate": 3.3813925570228096e-05, "loss": 0.7404, "mean_token_accuracy": 0.802684611082077, "num_tokens": 420423075.0, "step": 40500 }, { "entropy": 0.7020365953445434, "epoch": 0.32408, "grad_norm": 1.9816515445709229, "learning_rate": 3.380992396958784e-05, "loss": 0.6942, "mean_token_accuracy": 0.7852344989776612, "num_tokens": 420586915.0, "step": 40510 }, { "entropy": 0.6935891598463059, "epoch": 0.32416, "grad_norm": 3.3934879302978516, "learning_rate": 3.380592236894758e-05, "loss": 0.6868, "mean_token_accuracy": 0.8021689593791962, "num_tokens": 420667864.0, "step": 40520 }, { "entropy": 0.7348287761211395, "epoch": 0.32424, "grad_norm": 2.085947036743164, "learning_rate": 3.380192076830733e-05, "loss": 0.7247, "mean_token_accuracy": 0.7946167826652527, "num_tokens": 420760944.0, "step": 40530 }, { "entropy": 0.6528979659080505, "epoch": 0.32432, "grad_norm": 2.2125165462493896, "learning_rate": 3.379791916766707e-05, "loss": 0.6576, "mean_token_accuracy": 0.7944053947925568, "num_tokens": 420908755.0, "step": 40540 }, { "entropy": 0.6843193382024765, "epoch": 0.3244, "grad_norm": 4.821384429931641, "learning_rate": 3.3793917567026815e-05, "loss": 0.6714, "mean_token_accuracy": 0.821463805437088, "num_tokens": 420947937.0, "step": 40550 }, { "entropy": 0.7087492227554322, "epoch": 0.32448, "grad_norm": 1.6467925310134888, "learning_rate": 3.378991596638655e-05, "loss": 0.7081, "mean_token_accuracy": 0.7832071363925934, "num_tokens": 421111777.0, "step": 40560 }, { "entropy": 0.6991162121295929, "epoch": 0.32456, "grad_norm": 4.652135372161865, "learning_rate": 3.37859143657463e-05, "loss": 0.6988, "mean_token_accuracy": 0.7969504415988922, "num_tokens": 421209786.0, "step": 40570 }, { "entropy": 0.7082007706165314, "epoch": 0.32464, "grad_norm": 1.7403833866119385, "learning_rate": 3.3781912765106046e-05, "loss": 0.6998, "mean_token_accuracy": 0.7972527623176575, "num_tokens": 421305448.0, "step": 40580 }, { "entropy": 0.6669289410114289, "epoch": 0.32472, "grad_norm": 2.353830337524414, "learning_rate": 3.377791116446579e-05, "loss": 0.6613, "mean_token_accuracy": 0.7999399244785309, "num_tokens": 421435984.0, "step": 40590 }, { "entropy": 0.7796076208353042, "epoch": 0.3248, "grad_norm": 4.827164173126221, "learning_rate": 3.377390956382553e-05, "loss": 0.7806, "mean_token_accuracy": 0.7963618755340576, "num_tokens": 421469026.0, "step": 40600 }, { "entropy": 0.6712067365646363, "epoch": 0.32488, "grad_norm": 1.962319254875183, "learning_rate": 3.376990796318528e-05, "loss": 0.6637, "mean_token_accuracy": 0.7905369758605957, "num_tokens": 421632725.0, "step": 40610 }, { "entropy": 0.6685383677482605, "epoch": 0.32496, "grad_norm": 3.8894386291503906, "learning_rate": 3.376590636254502e-05, "loss": 0.6712, "mean_token_accuracy": 0.8108485460281372, "num_tokens": 421706137.0, "step": 40620 }, { "entropy": 0.7034985423088074, "epoch": 0.32504, "grad_norm": 1.7338179349899292, "learning_rate": 3.3761904761904764e-05, "loss": 0.7045, "mean_token_accuracy": 0.8022399008274078, "num_tokens": 421797098.0, "step": 40630 }, { "entropy": 0.6776346087455749, "epoch": 0.32512, "grad_norm": 2.7057273387908936, "learning_rate": 3.375790316126451e-05, "loss": 0.6791, "mean_token_accuracy": 0.7914558112621307, "num_tokens": 421933019.0, "step": 40640 }, { "entropy": 0.7147197157144547, "epoch": 0.3252, "grad_norm": 5.2614946365356445, "learning_rate": 3.375390156062425e-05, "loss": 0.6995, "mean_token_accuracy": 0.8121759176254273, "num_tokens": 421971423.0, "step": 40650 }, { "entropy": 0.6871761798858642, "epoch": 0.32528, "grad_norm": 1.5511592626571655, "learning_rate": 3.3749899959983995e-05, "loss": 0.6911, "mean_token_accuracy": 0.7879672706127167, "num_tokens": 422134209.0, "step": 40660 }, { "entropy": 0.690034544467926, "epoch": 0.32536, "grad_norm": 3.3430309295654297, "learning_rate": 3.374589835934374e-05, "loss": 0.6822, "mean_token_accuracy": 0.8026576936244965, "num_tokens": 422221441.0, "step": 40670 }, { "entropy": 0.6820744156837464, "epoch": 0.32544, "grad_norm": 2.6552786827087402, "learning_rate": 3.374189675870348e-05, "loss": 0.6813, "mean_token_accuracy": 0.8051266491413116, "num_tokens": 422314176.0, "step": 40680 }, { "entropy": 0.668974968791008, "epoch": 0.32552, "grad_norm": 2.6067841053009033, "learning_rate": 3.3737895158063227e-05, "loss": 0.6712, "mean_token_accuracy": 0.7941785037517548, "num_tokens": 422448604.0, "step": 40690 }, { "entropy": 0.6714840710163117, "epoch": 0.3256, "grad_norm": 4.716619491577148, "learning_rate": 3.373389355742297e-05, "loss": 0.6742, "mean_token_accuracy": 0.8214784383773803, "num_tokens": 422489400.0, "step": 40700 }, { "entropy": 0.7132561862468719, "epoch": 0.32568, "grad_norm": 2.2506535053253174, "learning_rate": 3.3729891956782714e-05, "loss": 0.7167, "mean_token_accuracy": 0.7765022039413452, "num_tokens": 422653240.0, "step": 40710 }, { "entropy": 0.712096244096756, "epoch": 0.32576, "grad_norm": 2.8729405403137207, "learning_rate": 3.372589035614246e-05, "loss": 0.6938, "mean_token_accuracy": 0.7949089467525482, "num_tokens": 422739870.0, "step": 40720 }, { "entropy": 0.7404525876045227, "epoch": 0.32584, "grad_norm": 1.8511018753051758, "learning_rate": 3.37218887555022e-05, "loss": 0.7364, "mean_token_accuracy": 0.7932212710380554, "num_tokens": 422833550.0, "step": 40730 }, { "entropy": 0.7080926895141602, "epoch": 0.32592, "grad_norm": 2.123779058456421, "learning_rate": 3.371788715486195e-05, "loss": 0.7042, "mean_token_accuracy": 0.7852648913860321, "num_tokens": 422979162.0, "step": 40740 }, { "entropy": 0.6738889694213868, "epoch": 0.326, "grad_norm": 4.757043361663818, "learning_rate": 3.371388555422169e-05, "loss": 0.6718, "mean_token_accuracy": 0.8167991578578949, "num_tokens": 423021899.0, "step": 40750 }, { "entropy": 0.6761614620685578, "epoch": 0.32608, "grad_norm": 1.53964364528656, "learning_rate": 3.370988395358143e-05, "loss": 0.6789, "mean_token_accuracy": 0.7864619016647338, "num_tokens": 423185739.0, "step": 40760 }, { "entropy": 0.6767103493213653, "epoch": 0.32616, "grad_norm": 2.86747407913208, "learning_rate": 3.3705882352941176e-05, "loss": 0.6655, "mean_token_accuracy": 0.8116270542144776, "num_tokens": 423268927.0, "step": 40770 }, { "entropy": 0.6688479721546173, "epoch": 0.32624, "grad_norm": 1.4105932712554932, "learning_rate": 3.370188075230093e-05, "loss": 0.673, "mean_token_accuracy": 0.8062446355819702, "num_tokens": 423362558.0, "step": 40780 }, { "entropy": 0.7716330230236054, "epoch": 0.32632, "grad_norm": 2.6151123046875, "learning_rate": 3.3697879151660664e-05, "loss": 0.7726, "mean_token_accuracy": 0.7709412813186646, "num_tokens": 423500467.0, "step": 40790 }, { "entropy": 0.6933540314435959, "epoch": 0.3264, "grad_norm": 4.59841251373291, "learning_rate": 3.369387755102041e-05, "loss": 0.6938, "mean_token_accuracy": 0.812992262840271, "num_tokens": 423538913.0, "step": 40800 }, { "entropy": 0.631111866235733, "epoch": 0.32648, "grad_norm": 2.3451735973358154, "learning_rate": 3.368987595038016e-05, "loss": 0.6262, "mean_token_accuracy": 0.8007816314697266, "num_tokens": 423702753.0, "step": 40810 }, { "entropy": 0.6618846356868744, "epoch": 0.32656, "grad_norm": 3.2915096282958984, "learning_rate": 3.36858743497399e-05, "loss": 0.6476, "mean_token_accuracy": 0.8091462135314942, "num_tokens": 423794434.0, "step": 40820 }, { "entropy": 0.6433731317520142, "epoch": 0.32664, "grad_norm": 2.31607985496521, "learning_rate": 3.368187274909964e-05, "loss": 0.6359, "mean_token_accuracy": 0.8112004578113556, "num_tokens": 423889602.0, "step": 40830 }, { "entropy": 0.6892464458942413, "epoch": 0.32672, "grad_norm": 3.2837066650390625, "learning_rate": 3.367787114845938e-05, "loss": 0.6869, "mean_token_accuracy": 0.7874049961566925, "num_tokens": 424033885.0, "step": 40840 }, { "entropy": 0.6144947856664658, "epoch": 0.3268, "grad_norm": 5.038246154785156, "learning_rate": 3.367386954781913e-05, "loss": 0.6177, "mean_token_accuracy": 0.8318952143192291, "num_tokens": 424073019.0, "step": 40850 }, { "entropy": 0.7239174544811249, "epoch": 0.32688, "grad_norm": 2.1773436069488525, "learning_rate": 3.3669867947178876e-05, "loss": 0.7282, "mean_token_accuracy": 0.7763620018959045, "num_tokens": 424235831.0, "step": 40860 }, { "entropy": 0.6851530581712723, "epoch": 0.32696, "grad_norm": 3.714906930923462, "learning_rate": 3.366586634653861e-05, "loss": 0.6751, "mean_token_accuracy": 0.8112300157546997, "num_tokens": 424315379.0, "step": 40870 }, { "entropy": 0.7048521757125854, "epoch": 0.32704, "grad_norm": 1.451641321182251, "learning_rate": 3.366186474589836e-05, "loss": 0.7146, "mean_token_accuracy": 0.7957947492599488, "num_tokens": 424408064.0, "step": 40880 }, { "entropy": 0.6925738394260407, "epoch": 0.32712, "grad_norm": 1.8954533338546753, "learning_rate": 3.365786314525811e-05, "loss": 0.6911, "mean_token_accuracy": 0.7879141688346862, "num_tokens": 424562503.0, "step": 40890 }, { "entropy": 0.7112644910812378, "epoch": 0.3272, "grad_norm": 5.030673503875732, "learning_rate": 3.365386154461785e-05, "loss": 0.7118, "mean_token_accuracy": 0.8080490112304688, "num_tokens": 424606064.0, "step": 40900 }, { "entropy": 0.6525940895080566, "epoch": 0.32728, "grad_norm": 2.293128490447998, "learning_rate": 3.364985994397759e-05, "loss": 0.6507, "mean_token_accuracy": 0.7893808066844941, "num_tokens": 424769904.0, "step": 40910 }, { "entropy": 0.6789002299308777, "epoch": 0.32736, "grad_norm": 3.6951675415039062, "learning_rate": 3.364585834333734e-05, "loss": 0.6616, "mean_token_accuracy": 0.8142081797122955, "num_tokens": 424856468.0, "step": 40920 }, { "entropy": 0.7297932088375092, "epoch": 0.32744, "grad_norm": 1.5713814496994019, "learning_rate": 3.364185674269708e-05, "loss": 0.7481, "mean_token_accuracy": 0.7911202132701873, "num_tokens": 424949268.0, "step": 40930 }, { "entropy": 0.6352295756340027, "epoch": 0.32752, "grad_norm": 3.4402365684509277, "learning_rate": 3.3637855142056826e-05, "loss": 0.6253, "mean_token_accuracy": 0.8067133903503418, "num_tokens": 425082950.0, "step": 40940 }, { "entropy": 0.6721360146999359, "epoch": 0.3276, "grad_norm": 4.592159271240234, "learning_rate": 3.363385354141656e-05, "loss": 0.6653, "mean_token_accuracy": 0.825300681591034, "num_tokens": 425116954.0, "step": 40950 }, { "entropy": 0.6518850564956665, "epoch": 0.32768, "grad_norm": 3.2025630474090576, "learning_rate": 3.3629851940776314e-05, "loss": 0.6559, "mean_token_accuracy": 0.7930496275424957, "num_tokens": 425280789.0, "step": 40960 }, { "entropy": 0.6399858862161636, "epoch": 0.32776, "grad_norm": 2.7445316314697266, "learning_rate": 3.362585034013606e-05, "loss": 0.6376, "mean_token_accuracy": 0.8181244552135467, "num_tokens": 425359473.0, "step": 40970 }, { "entropy": 0.671688050031662, "epoch": 0.32784, "grad_norm": 1.3574110269546509, "learning_rate": 3.36218487394958e-05, "loss": 0.6584, "mean_token_accuracy": 0.8097981154918671, "num_tokens": 425453299.0, "step": 40980 }, { "entropy": 0.7715485394001007, "epoch": 0.32792, "grad_norm": 2.764526605606079, "learning_rate": 3.3617847138855545e-05, "loss": 0.7751, "mean_token_accuracy": 0.7729495525360107, "num_tokens": 425590350.0, "step": 40990 }, { "entropy": 0.5893778681755066, "epoch": 0.328, "grad_norm": 5.782519340515137, "learning_rate": 3.361384553821529e-05, "loss": 0.577, "mean_token_accuracy": 0.839870136976242, "num_tokens": 425630268.0, "step": 41000 }, { "entropy": 0.6256438255310058, "epoch": 0.32808, "grad_norm": 1.7779654264450073, "learning_rate": 3.360984393757503e-05, "loss": 0.6276, "mean_token_accuracy": 0.7992977499961853, "num_tokens": 425794108.0, "step": 41010 }, { "entropy": 0.6940391361713409, "epoch": 0.32816, "grad_norm": 3.3709909915924072, "learning_rate": 3.3605842336934776e-05, "loss": 0.6966, "mean_token_accuracy": 0.7996284484863281, "num_tokens": 425896828.0, "step": 41020 }, { "entropy": 0.722768884897232, "epoch": 0.32824, "grad_norm": 1.5728600025177002, "learning_rate": 3.360184073629452e-05, "loss": 0.7003, "mean_token_accuracy": 0.798375952243805, "num_tokens": 425992082.0, "step": 41030 }, { "entropy": 0.6647034287452698, "epoch": 0.32832, "grad_norm": 3.5554261207580566, "learning_rate": 3.359783913565426e-05, "loss": 0.6669, "mean_token_accuracy": 0.7932765066623688, "num_tokens": 426127611.0, "step": 41040 }, { "entropy": 0.6879898309707642, "epoch": 0.3284, "grad_norm": 4.534964561462402, "learning_rate": 3.359383753501401e-05, "loss": 0.687, "mean_token_accuracy": 0.8098128616809845, "num_tokens": 426165486.0, "step": 41050 }, { "entropy": 0.6714948952198029, "epoch": 0.32848, "grad_norm": 2.3919010162353516, "learning_rate": 3.358983593437375e-05, "loss": 0.671, "mean_token_accuracy": 0.7879457771778107, "num_tokens": 426329326.0, "step": 41060 }, { "entropy": 0.6699161648750305, "epoch": 0.32856, "grad_norm": 3.4517040252685547, "learning_rate": 3.3585834333733494e-05, "loss": 0.6606, "mean_token_accuracy": 0.808380538225174, "num_tokens": 426418376.0, "step": 41070 }, { "entropy": 0.67820503115654, "epoch": 0.32864, "grad_norm": 1.4246147871017456, "learning_rate": 3.358183273309324e-05, "loss": 0.6833, "mean_token_accuracy": 0.8046168625354767, "num_tokens": 426511673.0, "step": 41080 }, { "entropy": 0.7132906973361969, "epoch": 0.32872, "grad_norm": 2.421757459640503, "learning_rate": 3.357783113245298e-05, "loss": 0.7052, "mean_token_accuracy": 0.7882007002830506, "num_tokens": 426638518.0, "step": 41090 }, { "entropy": 0.6297648876905442, "epoch": 0.3288, "grad_norm": 5.311399459838867, "learning_rate": 3.3573829531812726e-05, "loss": 0.6252, "mean_token_accuracy": 0.8274990975856781, "num_tokens": 426672321.0, "step": 41100 }, { "entropy": 0.6586574792861939, "epoch": 0.32888, "grad_norm": 1.4909346103668213, "learning_rate": 3.356982793117247e-05, "loss": 0.6635, "mean_token_accuracy": 0.7936021327972412, "num_tokens": 426836013.0, "step": 41110 }, { "entropy": 0.6530662029981613, "epoch": 0.32896, "grad_norm": 3.3697450160980225, "learning_rate": 3.356582633053221e-05, "loss": 0.6369, "mean_token_accuracy": 0.8161238253116607, "num_tokens": 426914923.0, "step": 41120 }, { "entropy": 0.7318309009075165, "epoch": 0.32904, "grad_norm": 1.4615004062652588, "learning_rate": 3.3561824729891963e-05, "loss": 0.7296, "mean_token_accuracy": 0.7943034887313842, "num_tokens": 427007367.0, "step": 41130 }, { "entropy": 0.7623670518398284, "epoch": 0.32912, "grad_norm": 2.1462974548339844, "learning_rate": 3.35578231292517e-05, "loss": 0.7582, "mean_token_accuracy": 0.7770801782608032, "num_tokens": 427135369.0, "step": 41140 }, { "entropy": 0.6856754660606384, "epoch": 0.3292, "grad_norm": 5.048924446105957, "learning_rate": 3.3553821528611444e-05, "loss": 0.6982, "mean_token_accuracy": 0.8148654222488403, "num_tokens": 427169323.0, "step": 41150 }, { "entropy": 0.6909382581710816, "epoch": 0.32928, "grad_norm": 1.557726263999939, "learning_rate": 3.354981992797119e-05, "loss": 0.6852, "mean_token_accuracy": 0.7849597036838531, "num_tokens": 427333163.0, "step": 41160 }, { "entropy": 0.7031533688306808, "epoch": 0.32936, "grad_norm": 3.672713279724121, "learning_rate": 3.354581832733094e-05, "loss": 0.6966, "mean_token_accuracy": 0.7981010735034942, "num_tokens": 427424871.0, "step": 41170 }, { "entropy": 0.59080690741539, "epoch": 0.32944, "grad_norm": 1.7341581583023071, "learning_rate": 3.3541816726690675e-05, "loss": 0.5816, "mean_token_accuracy": 0.8256169259548187, "num_tokens": 427519048.0, "step": 41180 }, { "entropy": 0.6529023349285126, "epoch": 0.32952, "grad_norm": 2.9579899311065674, "learning_rate": 3.353781512605042e-05, "loss": 0.6542, "mean_token_accuracy": 0.7968460738658905, "num_tokens": 427651877.0, "step": 41190 }, { "entropy": 0.6917576849460602, "epoch": 0.3296, "grad_norm": 4.321138858795166, "learning_rate": 3.353381352541017e-05, "loss": 0.6783, "mean_token_accuracy": 0.8187558948993683, "num_tokens": 427690790.0, "step": 41200 }, { "entropy": 0.6831702828407288, "epoch": 0.32968, "grad_norm": 1.572921633720398, "learning_rate": 3.352981192476991e-05, "loss": 0.6873, "mean_token_accuracy": 0.7889106035232544, "num_tokens": 427854630.0, "step": 41210 }, { "entropy": 0.76200350522995, "epoch": 0.32976, "grad_norm": 3.717521905899048, "learning_rate": 3.352581032412965e-05, "loss": 0.7582, "mean_token_accuracy": 0.7864996910095214, "num_tokens": 427940640.0, "step": 41220 }, { "entropy": 0.7303453087806702, "epoch": 0.32984, "grad_norm": 1.4139748811721802, "learning_rate": 3.3521808723489394e-05, "loss": 0.7076, "mean_token_accuracy": 0.7962912440299987, "num_tokens": 428032485.0, "step": 41230 }, { "entropy": 0.630944448709488, "epoch": 0.32992, "grad_norm": 2.03342866897583, "learning_rate": 3.3517807122849144e-05, "loss": 0.6372, "mean_token_accuracy": 0.801886922121048, "num_tokens": 428176862.0, "step": 41240 }, { "entropy": 0.6575451701879501, "epoch": 0.33, "grad_norm": 4.654206275939941, "learning_rate": 3.351380552220889e-05, "loss": 0.6626, "mean_token_accuracy": 0.8192992985248566, "num_tokens": 428220436.0, "step": 41250 }, { "entropy": 0.6510414391756058, "epoch": 0.33008, "grad_norm": 2.391187906265259, "learning_rate": 3.3509803921568625e-05, "loss": 0.6471, "mean_token_accuracy": 0.7964826643466949, "num_tokens": 428384276.0, "step": 41260 }, { "entropy": 0.6178059339523315, "epoch": 0.33016, "grad_norm": 2.808314561843872, "learning_rate": 3.3505802320928375e-05, "loss": 0.6087, "mean_token_accuracy": 0.8171680867671967, "num_tokens": 428478378.0, "step": 41270 }, { "entropy": 0.7017308652400971, "epoch": 0.33024, "grad_norm": 1.8799705505371094, "learning_rate": 3.350180072028812e-05, "loss": 0.7057, "mean_token_accuracy": 0.7934721887111664, "num_tokens": 428571869.0, "step": 41280 }, { "entropy": 0.6687336444854737, "epoch": 0.33032, "grad_norm": 2.341564655303955, "learning_rate": 3.349779911964786e-05, "loss": 0.6638, "mean_token_accuracy": 0.7936128854751587, "num_tokens": 428708166.0, "step": 41290 }, { "entropy": 0.676423305273056, "epoch": 0.3304, "grad_norm": 4.032505512237549, "learning_rate": 3.34937975190076e-05, "loss": 0.6754, "mean_token_accuracy": 0.8166162729263305, "num_tokens": 428743340.0, "step": 41300 }, { "entropy": 0.6284830540418624, "epoch": 0.33048, "grad_norm": 1.739556908607483, "learning_rate": 3.348979591836735e-05, "loss": 0.6301, "mean_token_accuracy": 0.7993343949317933, "num_tokens": 428907180.0, "step": 41310 }, { "entropy": 0.71745565533638, "epoch": 0.33056, "grad_norm": 3.066009283065796, "learning_rate": 3.3485794317727094e-05, "loss": 0.7058, "mean_token_accuracy": 0.7965639412403107, "num_tokens": 428992480.0, "step": 41320 }, { "entropy": 0.641595846414566, "epoch": 0.33064, "grad_norm": 1.7551355361938477, "learning_rate": 3.348179271708684e-05, "loss": 0.6518, "mean_token_accuracy": 0.8113477230072021, "num_tokens": 429085232.0, "step": 41330 }, { "entropy": 0.6981523513793946, "epoch": 0.33072, "grad_norm": 2.1822140216827393, "learning_rate": 3.347779111644658e-05, "loss": 0.6991, "mean_token_accuracy": 0.7863575458526612, "num_tokens": 429219757.0, "step": 41340 }, { "entropy": 0.7047262072563172, "epoch": 0.3308, "grad_norm": 4.8710126876831055, "learning_rate": 3.3473789515806325e-05, "loss": 0.6961, "mean_token_accuracy": 0.8128913521766663, "num_tokens": 429257194.0, "step": 41350 }, { "entropy": 0.6474637925624848, "epoch": 0.33088, "grad_norm": 2.287619113922119, "learning_rate": 3.346978791516607e-05, "loss": 0.6501, "mean_token_accuracy": 0.7911211550235748, "num_tokens": 429421034.0, "step": 41360 }, { "entropy": 0.6188450992107392, "epoch": 0.33096, "grad_norm": 2.8905534744262695, "learning_rate": 3.346578631452581e-05, "loss": 0.6035, "mean_token_accuracy": 0.8247344970703125, "num_tokens": 429507105.0, "step": 41370 }, { "entropy": 0.6854870915412903, "epoch": 0.33104, "grad_norm": 1.361847996711731, "learning_rate": 3.3461784713885556e-05, "loss": 0.6697, "mean_token_accuracy": 0.8037718951702117, "num_tokens": 429600730.0, "step": 41380 }, { "entropy": 0.6269960403442383, "epoch": 0.33112, "grad_norm": 2.7981371879577637, "learning_rate": 3.34577831132453e-05, "loss": 0.6331, "mean_token_accuracy": 0.8055046498775482, "num_tokens": 429723570.0, "step": 41390 }, { "entropy": 0.7462003409862519, "epoch": 0.3312, "grad_norm": 5.667169570922852, "learning_rate": 3.3453781512605044e-05, "loss": 0.742, "mean_token_accuracy": 0.8055054306983948, "num_tokens": 429759711.0, "step": 41400 }, { "entropy": 0.6759621798992157, "epoch": 0.33128, "grad_norm": 1.6690762042999268, "learning_rate": 3.344977991196479e-05, "loss": 0.6703, "mean_token_accuracy": 0.7883960127830505, "num_tokens": 429920736.0, "step": 41410 }, { "entropy": 0.676715075969696, "epoch": 0.33136, "grad_norm": 3.4192724227905273, "learning_rate": 3.344577831132453e-05, "loss": 0.6822, "mean_token_accuracy": 0.8157636523246765, "num_tokens": 429986118.0, "step": 41420 }, { "entropy": 0.6873459756374359, "epoch": 0.33144, "grad_norm": 2.225433588027954, "learning_rate": 3.3441776710684275e-05, "loss": 0.6895, "mean_token_accuracy": 0.8086758494377136, "num_tokens": 430078141.0, "step": 41430 }, { "entropy": 0.7389681816101075, "epoch": 0.33152, "grad_norm": 2.1253411769866943, "learning_rate": 3.343777511004402e-05, "loss": 0.7302, "mean_token_accuracy": 0.781948059797287, "num_tokens": 430213443.0, "step": 41440 }, { "entropy": 0.7359329432249069, "epoch": 0.3316, "grad_norm": 4.118691921234131, "learning_rate": 3.343377350940376e-05, "loss": 0.7522, "mean_token_accuracy": 0.799442595243454, "num_tokens": 430250032.0, "step": 41450 }, { "entropy": 0.60589959025383, "epoch": 0.33168, "grad_norm": 1.4190949201583862, "learning_rate": 3.3429771908763506e-05, "loss": 0.6037, "mean_token_accuracy": 0.8059737682342529, "num_tokens": 430413817.0, "step": 41460 }, { "entropy": 0.6970402598381042, "epoch": 0.33176, "grad_norm": 4.281696319580078, "learning_rate": 3.342577030812325e-05, "loss": 0.6747, "mean_token_accuracy": 0.8091373145580292, "num_tokens": 430492300.0, "step": 41470 }, { "entropy": 0.7029081284999847, "epoch": 0.33184, "grad_norm": 2.9116430282592773, "learning_rate": 3.3421768707482993e-05, "loss": 0.7035, "mean_token_accuracy": 0.7994364023208618, "num_tokens": 430587407.0, "step": 41480 }, { "entropy": 0.7142933249473572, "epoch": 0.33192, "grad_norm": 2.923461437225342, "learning_rate": 3.341776710684274e-05, "loss": 0.7004, "mean_token_accuracy": 0.7885481715202332, "num_tokens": 430718798.0, "step": 41490 }, { "entropy": 0.7316471040248871, "epoch": 0.332, "grad_norm": 4.948895454406738, "learning_rate": 3.341376550620248e-05, "loss": 0.7301, "mean_token_accuracy": 0.8105418980121613, "num_tokens": 430756560.0, "step": 41500 }, { "entropy": 0.6604309916496277, "epoch": 0.33208, "grad_norm": 1.7346770763397217, "learning_rate": 3.3409763905562225e-05, "loss": 0.6587, "mean_token_accuracy": 0.7901073336601258, "num_tokens": 430919543.0, "step": 41510 }, { "entropy": 0.7768349528312684, "epoch": 0.33216, "grad_norm": 3.4875710010528564, "learning_rate": 3.3405762304921975e-05, "loss": 0.752, "mean_token_accuracy": 0.7844681799411773, "num_tokens": 431000669.0, "step": 41520 }, { "entropy": 0.6772354722023011, "epoch": 0.33224, "grad_norm": 1.6686627864837646, "learning_rate": 3.340176070428171e-05, "loss": 0.6665, "mean_token_accuracy": 0.8042261302471161, "num_tokens": 431093954.0, "step": 41530 }, { "entropy": 0.6844475448131562, "epoch": 0.33232, "grad_norm": 3.7322282791137695, "learning_rate": 3.3397759103641456e-05, "loss": 0.6915, "mean_token_accuracy": 0.7917014122009277, "num_tokens": 431222914.0, "step": 41540 }, { "entropy": 0.7433199167251587, "epoch": 0.3324, "grad_norm": 6.104884624481201, "learning_rate": 3.33937575030012e-05, "loss": 0.7483, "mean_token_accuracy": 0.804977285861969, "num_tokens": 431256407.0, "step": 41550 }, { "entropy": 0.6141562253236771, "epoch": 0.33248, "grad_norm": 2.265507936477661, "learning_rate": 3.338975590236095e-05, "loss": 0.6119, "mean_token_accuracy": 0.8023299038410187, "num_tokens": 431416680.0, "step": 41560 }, { "entropy": 0.6924970746040344, "epoch": 0.33256, "grad_norm": 3.403353691101074, "learning_rate": 3.338575430172069e-05, "loss": 0.6815, "mean_token_accuracy": 0.8035955011844635, "num_tokens": 431494407.0, "step": 41570 }, { "entropy": 0.6395078301429749, "epoch": 0.33264, "grad_norm": 1.349990963935852, "learning_rate": 3.338175270108043e-05, "loss": 0.6389, "mean_token_accuracy": 0.8081104516983032, "num_tokens": 431587293.0, "step": 41580 }, { "entropy": 0.7129738688468933, "epoch": 0.33272, "grad_norm": 2.7230372428894043, "learning_rate": 3.337775110044018e-05, "loss": 0.7087, "mean_token_accuracy": 0.7840873420238494, "num_tokens": 431730830.0, "step": 41590 }, { "entropy": 0.6808631181716919, "epoch": 0.3328, "grad_norm": 5.504279136657715, "learning_rate": 3.3373749499799925e-05, "loss": 0.6736, "mean_token_accuracy": 0.8197182893753052, "num_tokens": 431770595.0, "step": 41600 }, { "entropy": 0.6954917252063751, "epoch": 0.33288, "grad_norm": 2.092315435409546, "learning_rate": 3.336974789915966e-05, "loss": 0.6926, "mean_token_accuracy": 0.785660570859909, "num_tokens": 431926581.0, "step": 41610 }, { "entropy": 0.6303965836763382, "epoch": 0.33296, "grad_norm": 2.9645509719848633, "learning_rate": 3.3365746298519405e-05, "loss": 0.6213, "mean_token_accuracy": 0.8189868986606598, "num_tokens": 431995412.0, "step": 41620 }, { "entropy": 0.7052009165287018, "epoch": 0.33304, "grad_norm": 2.316502094268799, "learning_rate": 3.3361744697879156e-05, "loss": 0.6965, "mean_token_accuracy": 0.7959635078907012, "num_tokens": 432089764.0, "step": 41630 }, { "entropy": 0.6928629994392395, "epoch": 0.33312, "grad_norm": 3.339066505432129, "learning_rate": 3.33577430972389e-05, "loss": 0.6864, "mean_token_accuracy": 0.792396855354309, "num_tokens": 432219827.0, "step": 41640 }, { "entropy": 0.6484670042991638, "epoch": 0.3332, "grad_norm": 5.341368675231934, "learning_rate": 3.3353741496598637e-05, "loss": 0.6461, "mean_token_accuracy": 0.8243488788604736, "num_tokens": 432256145.0, "step": 41650 }, { "entropy": 0.6464949131011963, "epoch": 0.33328, "grad_norm": 1.8067784309387207, "learning_rate": 3.334973989595839e-05, "loss": 0.6445, "mean_token_accuracy": 0.7956521689891816, "num_tokens": 432419985.0, "step": 41660 }, { "entropy": 0.7002417147159576, "epoch": 0.33336, "grad_norm": 3.672006368637085, "learning_rate": 3.334573829531813e-05, "loss": 0.7014, "mean_token_accuracy": 0.8043440818786621, "num_tokens": 432503051.0, "step": 41670 }, { "entropy": 0.7665541768074036, "epoch": 0.33344, "grad_norm": 1.3858401775360107, "learning_rate": 3.3341736694677874e-05, "loss": 0.7613, "mean_token_accuracy": 0.7899492740631103, "num_tokens": 432596529.0, "step": 41680 }, { "entropy": 0.6831440567970276, "epoch": 0.33352, "grad_norm": 2.5418057441711426, "learning_rate": 3.333773509403761e-05, "loss": 0.6677, "mean_token_accuracy": 0.7949783146381378, "num_tokens": 432737145.0, "step": 41690 }, { "entropy": 0.6406568080186844, "epoch": 0.3336, "grad_norm": 3.995001792907715, "learning_rate": 3.333373349339736e-05, "loss": 0.6402, "mean_token_accuracy": 0.8271744549274445, "num_tokens": 432776850.0, "step": 41700 }, { "entropy": 0.6882316768169403, "epoch": 0.33368, "grad_norm": 1.3724063634872437, "learning_rate": 3.3329731892757106e-05, "loss": 0.6932, "mean_token_accuracy": 0.7874267220497131, "num_tokens": 432940690.0, "step": 41710 }, { "entropy": 0.7857242047786712, "epoch": 0.33376, "grad_norm": 3.566075563430786, "learning_rate": 3.332573029211685e-05, "loss": 0.776, "mean_token_accuracy": 0.7846605479717255, "num_tokens": 433022378.0, "step": 41720 }, { "entropy": 0.6724884450435639, "epoch": 0.33384, "grad_norm": 2.292620897293091, "learning_rate": 3.332172869147659e-05, "loss": 0.6751, "mean_token_accuracy": 0.8086085557937622, "num_tokens": 433114702.0, "step": 41730 }, { "entropy": 0.7388313829898834, "epoch": 0.33392, "grad_norm": 2.389716386795044, "learning_rate": 3.331772709083634e-05, "loss": 0.7417, "mean_token_accuracy": 0.7770920276641846, "num_tokens": 433257205.0, "step": 41740 }, { "entropy": 0.6756143629550934, "epoch": 0.334, "grad_norm": 5.060122489929199, "learning_rate": 3.331372549019608e-05, "loss": 0.6585, "mean_token_accuracy": 0.8244183003902436, "num_tokens": 433297715.0, "step": 41750 }, { "entropy": 0.6498380899429321, "epoch": 0.33408, "grad_norm": 1.8352395296096802, "learning_rate": 3.3309723889555824e-05, "loss": 0.643, "mean_token_accuracy": 0.7970228254795074, "num_tokens": 433460038.0, "step": 41760 }, { "entropy": 0.552009928226471, "epoch": 0.33416, "grad_norm": 3.0494892597198486, "learning_rate": 3.330572228891557e-05, "loss": 0.5522, "mean_token_accuracy": 0.8375332355499268, "num_tokens": 433529977.0, "step": 41770 }, { "entropy": 0.6974670886993408, "epoch": 0.33424, "grad_norm": 2.4154140949249268, "learning_rate": 3.330172068827531e-05, "loss": 0.711, "mean_token_accuracy": 0.8008683383464813, "num_tokens": 433623364.0, "step": 41780 }, { "entropy": 0.6680908560752868, "epoch": 0.33432, "grad_norm": 2.937993288040161, "learning_rate": 3.3297719087635055e-05, "loss": 0.6543, "mean_token_accuracy": 0.7962599277496338, "num_tokens": 433758646.0, "step": 41790 }, { "entropy": 0.6413437932729721, "epoch": 0.3344, "grad_norm": 4.2315497398376465, "learning_rate": 3.32937174869948e-05, "loss": 0.6263, "mean_token_accuracy": 0.826684957742691, "num_tokens": 433799911.0, "step": 41800 }, { "entropy": 0.6734307289123536, "epoch": 0.33448, "grad_norm": 1.5196532011032104, "learning_rate": 3.328971588635454e-05, "loss": 0.6774, "mean_token_accuracy": 0.785045200586319, "num_tokens": 433963751.0, "step": 41810 }, { "entropy": 0.6776940852403641, "epoch": 0.33456, "grad_norm": 4.535605430603027, "learning_rate": 3.3285714285714286e-05, "loss": 0.6724, "mean_token_accuracy": 0.8085277080535889, "num_tokens": 434059844.0, "step": 41820 }, { "entropy": 0.7246111810207367, "epoch": 0.33464, "grad_norm": 1.4102436304092407, "learning_rate": 3.328171268507403e-05, "loss": 0.7242, "mean_token_accuracy": 0.7910111486911774, "num_tokens": 434153831.0, "step": 41830 }, { "entropy": 0.6819873631000519, "epoch": 0.33472, "grad_norm": 3.757840394973755, "learning_rate": 3.3277711084433774e-05, "loss": 0.6724, "mean_token_accuracy": 0.7979904055595398, "num_tokens": 434295675.0, "step": 41840 }, { "entropy": 0.6404609978199005, "epoch": 0.3348, "grad_norm": 4.842682838439941, "learning_rate": 3.327370948379352e-05, "loss": 0.6214, "mean_token_accuracy": 0.8254824340343475, "num_tokens": 434334759.0, "step": 41850 }, { "entropy": 0.6233299314975739, "epoch": 0.33488, "grad_norm": 1.8376421928405762, "learning_rate": 3.326970788315326e-05, "loss": 0.6291, "mean_token_accuracy": 0.7993893623352051, "num_tokens": 434498599.0, "step": 41860 }, { "entropy": 0.733540153503418, "epoch": 0.33496, "grad_norm": 3.0722603797912598, "learning_rate": 3.326570628251301e-05, "loss": 0.7289, "mean_token_accuracy": 0.7923377335071564, "num_tokens": 434592163.0, "step": 41870 }, { "entropy": 0.6632908821105957, "epoch": 0.33504, "grad_norm": 2.0633068084716797, "learning_rate": 3.326170468187275e-05, "loss": 0.6584, "mean_token_accuracy": 0.8080450117588043, "num_tokens": 434685285.0, "step": 41880 }, { "entropy": 0.752543032169342, "epoch": 0.33512, "grad_norm": 2.4449081420898438, "learning_rate": 3.325770308123249e-05, "loss": 0.7494, "mean_token_accuracy": 0.7793180048465729, "num_tokens": 434823304.0, "step": 41890 }, { "entropy": 0.6764477252960205, "epoch": 0.3352, "grad_norm": 6.656118392944336, "learning_rate": 3.3253701480592236e-05, "loss": 0.6799, "mean_token_accuracy": 0.8212884962558746, "num_tokens": 434864662.0, "step": 41900 }, { "entropy": 0.6973893821239472, "epoch": 0.33528, "grad_norm": 2.2422075271606445, "learning_rate": 3.324969987995199e-05, "loss": 0.6917, "mean_token_accuracy": 0.788127887248993, "num_tokens": 435027142.0, "step": 41910 }, { "entropy": 0.7181272029876709, "epoch": 0.33536, "grad_norm": 3.371729850769043, "learning_rate": 3.3245698279311724e-05, "loss": 0.7104, "mean_token_accuracy": 0.8004611492156982, "num_tokens": 435095934.0, "step": 41920 }, { "entropy": 0.6874495089054108, "epoch": 0.33544, "grad_norm": 1.9465709924697876, "learning_rate": 3.324169667867147e-05, "loss": 0.6823, "mean_token_accuracy": 0.8038553535938263, "num_tokens": 435188769.0, "step": 41930 }, { "entropy": 0.6703935980796814, "epoch": 0.33552, "grad_norm": 2.635558843612671, "learning_rate": 3.323769507803122e-05, "loss": 0.6719, "mean_token_accuracy": 0.7931288778781891, "num_tokens": 435327737.0, "step": 41940 }, { "entropy": 0.6825263261795044, "epoch": 0.3356, "grad_norm": 5.3654069900512695, "learning_rate": 3.323369347739096e-05, "loss": 0.6841, "mean_token_accuracy": 0.8182884752750397, "num_tokens": 435365846.0, "step": 41950 }, { "entropy": 0.6680265307426453, "epoch": 0.33568, "grad_norm": 1.462246060371399, "learning_rate": 3.32296918767507e-05, "loss": 0.6642, "mean_token_accuracy": 0.7906998515129089, "num_tokens": 435529671.0, "step": 41960 }, { "entropy": 0.6065011739730835, "epoch": 0.33576, "grad_norm": 2.912172555923462, "learning_rate": 3.322569027611044e-05, "loss": 0.595, "mean_token_accuracy": 0.8267190754413605, "num_tokens": 435611711.0, "step": 41970 }, { "entropy": 0.6822307139635087, "epoch": 0.33584, "grad_norm": 1.6196471452713013, "learning_rate": 3.322168867547019e-05, "loss": 0.6829, "mean_token_accuracy": 0.8048795521259308, "num_tokens": 435705577.0, "step": 41980 }, { "entropy": 0.6973718285560608, "epoch": 0.33592, "grad_norm": 3.554694652557373, "learning_rate": 3.3217687074829936e-05, "loss": 0.695, "mean_token_accuracy": 0.7904314398765564, "num_tokens": 435842734.0, "step": 41990 }, { "entropy": 0.6746239602565766, "epoch": 0.336, "grad_norm": 3.988105058670044, "learning_rate": 3.321368547418967e-05, "loss": 0.6719, "mean_token_accuracy": 0.8227952480316162, "num_tokens": 435882001.0, "step": 42000 }, { "entropy": 0.6392443478107452, "epoch": 0.33608, "grad_norm": 2.1111180782318115, "learning_rate": 3.320968387354942e-05, "loss": 0.6307, "mean_token_accuracy": 0.7980764627456665, "num_tokens": 436045841.0, "step": 42010 }, { "entropy": 0.6055415004491806, "epoch": 0.33616, "grad_norm": 3.717542886734009, "learning_rate": 3.320568227290917e-05, "loss": 0.605, "mean_token_accuracy": 0.8276113212108612, "num_tokens": 436124509.0, "step": 42020 }, { "entropy": 0.6854827880859375, "epoch": 0.33624, "grad_norm": 1.8987892866134644, "learning_rate": 3.320168067226891e-05, "loss": 0.6826, "mean_token_accuracy": 0.8011731505393982, "num_tokens": 436217229.0, "step": 42030 }, { "entropy": 0.7288228273391724, "epoch": 0.33632, "grad_norm": 2.5359396934509277, "learning_rate": 3.319767907162865e-05, "loss": 0.7176, "mean_token_accuracy": 0.7841479480266571, "num_tokens": 436360785.0, "step": 42040 }, { "entropy": 0.6926351964473725, "epoch": 0.3364, "grad_norm": 5.137969017028809, "learning_rate": 3.31936774709884e-05, "loss": 0.6989, "mean_token_accuracy": 0.8167081534862518, "num_tokens": 436401094.0, "step": 42050 }, { "entropy": 0.6317420423030853, "epoch": 0.33648, "grad_norm": 1.8687753677368164, "learning_rate": 3.318967587034814e-05, "loss": 0.6271, "mean_token_accuracy": 0.8015289902687073, "num_tokens": 436564064.0, "step": 42060 }, { "entropy": 0.6514025598764419, "epoch": 0.33656, "grad_norm": 4.023927688598633, "learning_rate": 3.3185674269707886e-05, "loss": 0.64, "mean_token_accuracy": 0.8131969392299652, "num_tokens": 436645326.0, "step": 42070 }, { "entropy": 0.688832801580429, "epoch": 0.33664, "grad_norm": 1.9286141395568848, "learning_rate": 3.318167266906762e-05, "loss": 0.6947, "mean_token_accuracy": 0.7960639774799347, "num_tokens": 436739852.0, "step": 42080 }, { "entropy": 0.7281155824661255, "epoch": 0.33672, "grad_norm": 2.401782751083374, "learning_rate": 3.3177671068427374e-05, "loss": 0.7225, "mean_token_accuracy": 0.7803000450134278, "num_tokens": 436887785.0, "step": 42090 }, { "entropy": 0.671175217628479, "epoch": 0.3368, "grad_norm": 5.019808292388916, "learning_rate": 3.317366946778712e-05, "loss": 0.6672, "mean_token_accuracy": 0.8141637504100799, "num_tokens": 436932033.0, "step": 42100 }, { "entropy": 0.6179870188236236, "epoch": 0.33688, "grad_norm": 2.110612630844116, "learning_rate": 3.316966786714686e-05, "loss": 0.6135, "mean_token_accuracy": 0.8012335181236268, "num_tokens": 437095873.0, "step": 42110 }, { "entropy": 0.6013740658760071, "epoch": 0.33696, "grad_norm": 2.565356969833374, "learning_rate": 3.3165666266506605e-05, "loss": 0.6092, "mean_token_accuracy": 0.8205263555049896, "num_tokens": 437186474.0, "step": 42120 }, { "entropy": 0.7249571084976196, "epoch": 0.33704, "grad_norm": 1.5558546781539917, "learning_rate": 3.316166466586635e-05, "loss": 0.707, "mean_token_accuracy": 0.7993585407733917, "num_tokens": 437281148.0, "step": 42130 }, { "entropy": 0.6841598987579346, "epoch": 0.33712, "grad_norm": 2.110727548599243, "learning_rate": 3.315766306522609e-05, "loss": 0.6818, "mean_token_accuracy": 0.7888383388519287, "num_tokens": 437426979.0, "step": 42140 }, { "entropy": 0.683401933312416, "epoch": 0.3372, "grad_norm": 6.540309429168701, "learning_rate": 3.3153661464585836e-05, "loss": 0.6881, "mean_token_accuracy": 0.8148502826690673, "num_tokens": 437472042.0, "step": 42150 }, { "entropy": 0.6945206582546234, "epoch": 0.33728, "grad_norm": 2.437852144241333, "learning_rate": 3.314965986394558e-05, "loss": 0.696, "mean_token_accuracy": 0.7823089182376861, "num_tokens": 437631369.0, "step": 42160 }, { "entropy": 0.7655669450759888, "epoch": 0.33736, "grad_norm": 2.927426815032959, "learning_rate": 3.314565826330532e-05, "loss": 0.7541, "mean_token_accuracy": 0.7874592006206512, "num_tokens": 437706851.0, "step": 42170 }, { "entropy": 0.6875976681709289, "epoch": 0.33744, "grad_norm": 1.5474663972854614, "learning_rate": 3.314165666266507e-05, "loss": 0.6894, "mean_token_accuracy": 0.8050741195678711, "num_tokens": 437799838.0, "step": 42180 }, { "entropy": 0.7621619999408722, "epoch": 0.33752, "grad_norm": 2.182615280151367, "learning_rate": 3.313765506202481e-05, "loss": 0.763, "mean_token_accuracy": 0.7720290124416351, "num_tokens": 437951262.0, "step": 42190 }, { "entropy": 0.6943045645952225, "epoch": 0.3376, "grad_norm": 3.7402796745300293, "learning_rate": 3.3133653461384554e-05, "loss": 0.7047, "mean_token_accuracy": 0.8066803574562073, "num_tokens": 437997488.0, "step": 42200 }, { "entropy": 0.6735562086105347, "epoch": 0.33768, "grad_norm": 2.9792873859405518, "learning_rate": 3.31296518607443e-05, "loss": 0.6684, "mean_token_accuracy": 0.7931362986564636, "num_tokens": 438161328.0, "step": 42210 }, { "entropy": 0.6150065660476685, "epoch": 0.33776, "grad_norm": 4.333512783050537, "learning_rate": 3.312565026010404e-05, "loss": 0.6046, "mean_token_accuracy": 0.8214276790618896, "num_tokens": 438255618.0, "step": 42220 }, { "entropy": 0.6633391976356506, "epoch": 0.33784, "grad_norm": 1.8568928241729736, "learning_rate": 3.3121648659463786e-05, "loss": 0.6713, "mean_token_accuracy": 0.8058452725410461, "num_tokens": 438349770.0, "step": 42230 }, { "entropy": 0.708874398469925, "epoch": 0.33792, "grad_norm": 2.306572198867798, "learning_rate": 3.311764705882353e-05, "loss": 0.6991, "mean_token_accuracy": 0.7874574542045594, "num_tokens": 438503939.0, "step": 42240 }, { "entropy": 0.6739913046360015, "epoch": 0.338, "grad_norm": 6.648380756378174, "learning_rate": 3.311364545818327e-05, "loss": 0.6866, "mean_token_accuracy": 0.8118830025196075, "num_tokens": 438558545.0, "step": 42250 }, { "entropy": 0.6496886134147644, "epoch": 0.33808, "grad_norm": 2.198982000350952, "learning_rate": 3.3109643857543023e-05, "loss": 0.6449, "mean_token_accuracy": 0.7945730149745941, "num_tokens": 438722359.0, "step": 42260 }, { "entropy": 0.6168760567903518, "epoch": 0.33816, "grad_norm": 2.8217263221740723, "learning_rate": 3.310564225690276e-05, "loss": 0.6164, "mean_token_accuracy": 0.8190497815608978, "num_tokens": 438817280.0, "step": 42270 }, { "entropy": 0.7789792239665985, "epoch": 0.33824, "grad_norm": 1.8748043775558472, "learning_rate": 3.3101640656262504e-05, "loss": 0.7784, "mean_token_accuracy": 0.7819106996059417, "num_tokens": 438912576.0, "step": 42280 }, { "entropy": 0.717975789308548, "epoch": 0.33832, "grad_norm": 3.9660284519195557, "learning_rate": 3.309763905562225e-05, "loss": 0.7122, "mean_token_accuracy": 0.7849754571914673, "num_tokens": 439048137.0, "step": 42290 }, { "entropy": 0.613767784833908, "epoch": 0.3384, "grad_norm": 3.888786792755127, "learning_rate": 3.3093637454982e-05, "loss": 0.6036, "mean_token_accuracy": 0.8341653704643249, "num_tokens": 439085669.0, "step": 42300 }, { "entropy": 0.6387864470481872, "epoch": 0.33848, "grad_norm": 2.1227190494537354, "learning_rate": 3.3089635854341735e-05, "loss": 0.6372, "mean_token_accuracy": 0.7957987308502197, "num_tokens": 439249509.0, "step": 42310 }, { "entropy": 0.6831529974937439, "epoch": 0.33856, "grad_norm": 2.8637373447418213, "learning_rate": 3.308563425370148e-05, "loss": 0.6717, "mean_token_accuracy": 0.8025990962982178, "num_tokens": 439341588.0, "step": 42320 }, { "entropy": 0.6636392593383789, "epoch": 0.33864, "grad_norm": 1.8304972648620605, "learning_rate": 3.308163265306123e-05, "loss": 0.6703, "mean_token_accuracy": 0.8067161858081817, "num_tokens": 439436008.0, "step": 42330 }, { "entropy": 0.5739502489566803, "epoch": 0.33872, "grad_norm": 3.1129934787750244, "learning_rate": 3.307763105242097e-05, "loss": 0.5653, "mean_token_accuracy": 0.8209488928318024, "num_tokens": 439579467.0, "step": 42340 }, { "entropy": 0.6446910619735717, "epoch": 0.3388, "grad_norm": 4.6415605545043945, "learning_rate": 3.307362945178071e-05, "loss": 0.6532, "mean_token_accuracy": 0.824363249540329, "num_tokens": 439619569.0, "step": 42350 }, { "entropy": 0.64803546667099, "epoch": 0.33888, "grad_norm": 2.1037917137145996, "learning_rate": 3.3069627851140454e-05, "loss": 0.6417, "mean_token_accuracy": 0.795232629776001, "num_tokens": 439783359.0, "step": 42360 }, { "entropy": 0.6832642942667008, "epoch": 0.33896, "grad_norm": 3.394268035888672, "learning_rate": 3.3065626250500204e-05, "loss": 0.6782, "mean_token_accuracy": 0.8088128864765167, "num_tokens": 439866771.0, "step": 42370 }, { "entropy": 0.6621807217597961, "epoch": 0.33904, "grad_norm": 1.2892835140228271, "learning_rate": 3.306162464985995e-05, "loss": 0.6728, "mean_token_accuracy": 0.8071506142616272, "num_tokens": 439961362.0, "step": 42380 }, { "entropy": 0.6511604011058807, "epoch": 0.33912, "grad_norm": 3.2669339179992676, "learning_rate": 3.3057623049219685e-05, "loss": 0.6431, "mean_token_accuracy": 0.802441680431366, "num_tokens": 440096256.0, "step": 42390 }, { "entropy": 0.7631279051303863, "epoch": 0.3392, "grad_norm": 4.646073341369629, "learning_rate": 3.3053621448579435e-05, "loss": 0.7598, "mean_token_accuracy": 0.8020580828189849, "num_tokens": 440138418.0, "step": 42400 }, { "entropy": 0.7516949832439422, "epoch": 0.33928, "grad_norm": 3.493028163909912, "learning_rate": 3.304961984793918e-05, "loss": 0.7526, "mean_token_accuracy": 0.770127022266388, "num_tokens": 440302258.0, "step": 42410 }, { "entropy": 0.7015589147806167, "epoch": 0.33936, "grad_norm": 3.0043439865112305, "learning_rate": 3.304561824729892e-05, "loss": 0.6988, "mean_token_accuracy": 0.7980788230895997, "num_tokens": 440393561.0, "step": 42420 }, { "entropy": 0.6776282787322998, "epoch": 0.33944, "grad_norm": 1.485468864440918, "learning_rate": 3.304161664665866e-05, "loss": 0.6789, "mean_token_accuracy": 0.8045651853084564, "num_tokens": 440487774.0, "step": 42430 }, { "entropy": 0.7508145689964294, "epoch": 0.33952, "grad_norm": 2.995363712310791, "learning_rate": 3.303761504601841e-05, "loss": 0.7292, "mean_token_accuracy": 0.7790767550468445, "num_tokens": 440633977.0, "step": 42440 }, { "entropy": 0.7287271916866302, "epoch": 0.3396, "grad_norm": 4.404946804046631, "learning_rate": 3.3033613445378154e-05, "loss": 0.7292, "mean_token_accuracy": 0.805517452955246, "num_tokens": 440678148.0, "step": 42450 }, { "entropy": 0.5883499681949615, "epoch": 0.33968, "grad_norm": 1.5519975423812866, "learning_rate": 3.30296118447379e-05, "loss": 0.59, "mean_token_accuracy": 0.809598731994629, "num_tokens": 440841440.0, "step": 42460 }, { "entropy": 0.6368323534727096, "epoch": 0.33976, "grad_norm": 2.9337892532348633, "learning_rate": 3.302561024409764e-05, "loss": 0.6292, "mean_token_accuracy": 0.8194755852222443, "num_tokens": 440916175.0, "step": 42470 }, { "entropy": 0.7048662662506103, "epoch": 0.33984, "grad_norm": 1.6281139850616455, "learning_rate": 3.3021608643457385e-05, "loss": 0.7225, "mean_token_accuracy": 0.79386225938797, "num_tokens": 441009721.0, "step": 42480 }, { "entropy": 0.6793629825115204, "epoch": 0.33992, "grad_norm": 2.018080472946167, "learning_rate": 3.301760704281713e-05, "loss": 0.665, "mean_token_accuracy": 0.7944580614566803, "num_tokens": 441153408.0, "step": 42490 }, { "entropy": 0.7685471296310424, "epoch": 0.34, "grad_norm": 4.863678932189941, "learning_rate": 3.301360544217687e-05, "loss": 0.7626, "mean_token_accuracy": 0.7978982865810395, "num_tokens": 441198527.0, "step": 42500 }, { "entropy": 0.6213638931512833, "epoch": 0.34008, "grad_norm": 1.9546267986297607, "learning_rate": 3.3009603841536616e-05, "loss": 0.6241, "mean_token_accuracy": 0.7986260414123535, "num_tokens": 441362367.0, "step": 42510 }, { "entropy": 0.6487293422222138, "epoch": 0.34016, "grad_norm": 2.6005890369415283, "learning_rate": 3.300560224089636e-05, "loss": 0.6337, "mean_token_accuracy": 0.8126892924308777, "num_tokens": 441447534.0, "step": 42520 }, { "entropy": 0.6679217636585235, "epoch": 0.34024, "grad_norm": 1.537379503250122, "learning_rate": 3.3001600640256104e-05, "loss": 0.6717, "mean_token_accuracy": 0.8045731604099273, "num_tokens": 441540571.0, "step": 42530 }, { "entropy": 0.6614271938800812, "epoch": 0.34032, "grad_norm": 2.0073344707489014, "learning_rate": 3.299759903961585e-05, "loss": 0.6568, "mean_token_accuracy": 0.792987459897995, "num_tokens": 441688540.0, "step": 42540 }, { "entropy": 0.7499577611684799, "epoch": 0.3404, "grad_norm": 5.33182430267334, "learning_rate": 3.299359743897559e-05, "loss": 0.7323, "mean_token_accuracy": 0.8075461626052857, "num_tokens": 441732584.0, "step": 42550 }, { "entropy": 0.6599862813949585, "epoch": 0.34048, "grad_norm": 2.1469290256500244, "learning_rate": 3.2989595838335335e-05, "loss": 0.6582, "mean_token_accuracy": 0.7929816603660583, "num_tokens": 441894608.0, "step": 42560 }, { "entropy": 0.6614422708749771, "epoch": 0.34056, "grad_norm": 4.503615856170654, "learning_rate": 3.298559423769508e-05, "loss": 0.6532, "mean_token_accuracy": 0.81646928191185, "num_tokens": 441966827.0, "step": 42570 }, { "entropy": 0.7850607633590698, "epoch": 0.34064, "grad_norm": 1.445708155632019, "learning_rate": 3.298159263705482e-05, "loss": 0.7832, "mean_token_accuracy": 0.7825464069843292, "num_tokens": 442059758.0, "step": 42580 }, { "entropy": 0.6414844930171967, "epoch": 0.34072, "grad_norm": 2.739818572998047, "learning_rate": 3.2977591036414566e-05, "loss": 0.6462, "mean_token_accuracy": 0.8024498462677002, "num_tokens": 442193455.0, "step": 42590 }, { "entropy": 0.6220035523176193, "epoch": 0.3408, "grad_norm": 5.081754207611084, "learning_rate": 3.297358943577431e-05, "loss": 0.6126, "mean_token_accuracy": 0.8299427092075348, "num_tokens": 442228912.0, "step": 42600 }, { "entropy": 0.6416154026985168, "epoch": 0.34088, "grad_norm": 4.1951398849487305, "learning_rate": 3.2969587835134053e-05, "loss": 0.6374, "mean_token_accuracy": 0.8000305414199829, "num_tokens": 442392752.0, "step": 42610 }, { "entropy": 0.605635541677475, "epoch": 0.34096, "grad_norm": 3.5990045070648193, "learning_rate": 3.29655862344938e-05, "loss": 0.6006, "mean_token_accuracy": 0.8256738424301148, "num_tokens": 442472469.0, "step": 42620 }, { "entropy": 0.721391761302948, "epoch": 0.34104, "grad_norm": 1.734176516532898, "learning_rate": 3.296158463385354e-05, "loss": 0.7278, "mean_token_accuracy": 0.7925459682941437, "num_tokens": 442566003.0, "step": 42630 }, { "entropy": 0.6718544125556946, "epoch": 0.34112, "grad_norm": 2.1046509742736816, "learning_rate": 3.2957583033213285e-05, "loss": 0.6673, "mean_token_accuracy": 0.7937187254428864, "num_tokens": 442702371.0, "step": 42640 }, { "entropy": 0.6239781320095062, "epoch": 0.3412, "grad_norm": 4.939766883850098, "learning_rate": 3.2953581432573035e-05, "loss": 0.6146, "mean_token_accuracy": 0.8354046404361725, "num_tokens": 442738221.0, "step": 42650 }, { "entropy": 0.6862532675266266, "epoch": 0.34128, "grad_norm": 1.9390864372253418, "learning_rate": 3.294957983193277e-05, "loss": 0.682, "mean_token_accuracy": 0.7879824101924896, "num_tokens": 442902061.0, "step": 42660 }, { "entropy": 0.5979673504829407, "epoch": 0.34136, "grad_norm": 3.740574836730957, "learning_rate": 3.2945578231292516e-05, "loss": 0.5918, "mean_token_accuracy": 0.8241738736629486, "num_tokens": 442983871.0, "step": 42670 }, { "entropy": 0.6943708717823028, "epoch": 0.34144, "grad_norm": 1.6528599262237549, "learning_rate": 3.294157663065226e-05, "loss": 0.688, "mean_token_accuracy": 0.8048372805118561, "num_tokens": 443075859.0, "step": 42680 }, { "entropy": 0.7173841297626495, "epoch": 0.34152, "grad_norm": 2.016489267349243, "learning_rate": 3.293757503001201e-05, "loss": 0.7116, "mean_token_accuracy": 0.7809992134571075, "num_tokens": 443221347.0, "step": 42690 }, { "entropy": 0.703176599740982, "epoch": 0.3416, "grad_norm": 4.784998893737793, "learning_rate": 3.293357342937175e-05, "loss": 0.7001, "mean_token_accuracy": 0.8083385765552521, "num_tokens": 443263741.0, "step": 42700 }, { "entropy": 0.6719812452793121, "epoch": 0.34168, "grad_norm": 1.753061056137085, "learning_rate": 3.292957182873149e-05, "loss": 0.6725, "mean_token_accuracy": 0.7906421065330506, "num_tokens": 443425467.0, "step": 42710 }, { "entropy": 0.6462839901447296, "epoch": 0.34176, "grad_norm": 3.461167097091675, "learning_rate": 3.292557022809124e-05, "loss": 0.6449, "mean_token_accuracy": 0.8149786174297333, "num_tokens": 443496819.0, "step": 42720 }, { "entropy": 0.7387010455131531, "epoch": 0.34184, "grad_norm": 1.547845721244812, "learning_rate": 3.2921568627450985e-05, "loss": 0.7289, "mean_token_accuracy": 0.7923835813999176, "num_tokens": 443589274.0, "step": 42730 }, { "entropy": 0.7253835618495941, "epoch": 0.34192, "grad_norm": 2.366208076477051, "learning_rate": 3.291756702681072e-05, "loss": 0.7305, "mean_token_accuracy": 0.7773471713066101, "num_tokens": 443732142.0, "step": 42740 }, { "entropy": 0.6660663604736328, "epoch": 0.342, "grad_norm": 6.138154029846191, "learning_rate": 3.2913565426170465e-05, "loss": 0.6518, "mean_token_accuracy": 0.823762321472168, "num_tokens": 443775109.0, "step": 42750 }, { "entropy": 0.6412768304347992, "epoch": 0.34208, "grad_norm": 1.6818419694900513, "learning_rate": 3.2909563825530216e-05, "loss": 0.6453, "mean_token_accuracy": 0.7967513501644135, "num_tokens": 443938949.0, "step": 42760 }, { "entropy": 0.7321756243705749, "epoch": 0.34216, "grad_norm": 2.750931739807129, "learning_rate": 3.290556222488996e-05, "loss": 0.723, "mean_token_accuracy": 0.7933711111545563, "num_tokens": 444037269.0, "step": 42770 }, { "entropy": 0.667201018333435, "epoch": 0.34224, "grad_norm": 2.3364570140838623, "learning_rate": 3.2901560624249697e-05, "loss": 0.6784, "mean_token_accuracy": 0.8041021406650544, "num_tokens": 444131857.0, "step": 42780 }, { "entropy": 0.6867633044719696, "epoch": 0.34232, "grad_norm": 2.7549655437469482, "learning_rate": 3.289755902360945e-05, "loss": 0.68, "mean_token_accuracy": 0.7920513451099396, "num_tokens": 444269220.0, "step": 42790 }, { "entropy": 0.6353570342063903, "epoch": 0.3424, "grad_norm": 6.395461082458496, "learning_rate": 3.289355742296919e-05, "loss": 0.6231, "mean_token_accuracy": 0.8287099301815033, "num_tokens": 444308626.0, "step": 42800 }, { "entropy": 0.65511474609375, "epoch": 0.34248, "grad_norm": 1.6383442878723145, "learning_rate": 3.2889555822328934e-05, "loss": 0.6533, "mean_token_accuracy": 0.7941927194595337, "num_tokens": 444472466.0, "step": 42810 }, { "entropy": 0.6136079847812652, "epoch": 0.34256, "grad_norm": 3.0245378017425537, "learning_rate": 3.288555422168867e-05, "loss": 0.6123, "mean_token_accuracy": 0.8178657233715058, "num_tokens": 444564706.0, "step": 42820 }, { "entropy": 0.7755627691745758, "epoch": 0.34264, "grad_norm": 2.1284732818603516, "learning_rate": 3.288155262104842e-05, "loss": 0.7795, "mean_token_accuracy": 0.7826891779899597, "num_tokens": 444658316.0, "step": 42830 }, { "entropy": 0.65278200507164, "epoch": 0.34272, "grad_norm": 2.9831883907318115, "learning_rate": 3.2877551020408166e-05, "loss": 0.6548, "mean_token_accuracy": 0.7972146153450013, "num_tokens": 444800126.0, "step": 42840 }, { "entropy": 0.6334793567657471, "epoch": 0.3428, "grad_norm": 5.869292259216309, "learning_rate": 3.287354941976791e-05, "loss": 0.6276, "mean_token_accuracy": 0.8262249886989593, "num_tokens": 444839911.0, "step": 42850 }, { "entropy": 0.65914888381958, "epoch": 0.34288, "grad_norm": 1.8901492357254028, "learning_rate": 3.286954781912765e-05, "loss": 0.6618, "mean_token_accuracy": 0.7883060574531555, "num_tokens": 445003751.0, "step": 42860 }, { "entropy": 0.6444944500923157, "epoch": 0.34296, "grad_norm": 3.10563588142395, "learning_rate": 3.28655462184874e-05, "loss": 0.6329, "mean_token_accuracy": 0.8146685540676117, "num_tokens": 445089391.0, "step": 42870 }, { "entropy": 0.6848049998283386, "epoch": 0.34304, "grad_norm": 1.348793387413025, "learning_rate": 3.286154461784714e-05, "loss": 0.6807, "mean_token_accuracy": 0.8019946098327637, "num_tokens": 445183011.0, "step": 42880 }, { "entropy": 0.6899725317955017, "epoch": 0.34312, "grad_norm": 2.6250157356262207, "learning_rate": 3.2857543017206884e-05, "loss": 0.683, "mean_token_accuracy": 0.7927283465862274, "num_tokens": 445314072.0, "step": 42890 }, { "entropy": 0.7074708610773086, "epoch": 0.3432, "grad_norm": 5.462912082672119, "learning_rate": 3.285354141656663e-05, "loss": 0.7162, "mean_token_accuracy": 0.8084740757942199, "num_tokens": 445350177.0, "step": 42900 }, { "entropy": 0.6609443724155426, "epoch": 0.34328, "grad_norm": 1.3665543794631958, "learning_rate": 3.284953981592637e-05, "loss": 0.6631, "mean_token_accuracy": 0.7886785566806793, "num_tokens": 445514017.0, "step": 42910 }, { "entropy": 0.7421562969684601, "epoch": 0.34336, "grad_norm": 3.2222044467926025, "learning_rate": 3.2845538215286115e-05, "loss": 0.7311, "mean_token_accuracy": 0.7917923092842102, "num_tokens": 445598703.0, "step": 42920 }, { "entropy": 0.615149587392807, "epoch": 0.34344, "grad_norm": 1.8409351110458374, "learning_rate": 3.2841536614645866e-05, "loss": 0.6043, "mean_token_accuracy": 0.8223752558231354, "num_tokens": 445694392.0, "step": 42930 }, { "entropy": 0.737680858373642, "epoch": 0.34352, "grad_norm": 2.34262752532959, "learning_rate": 3.28375350140056e-05, "loss": 0.729, "mean_token_accuracy": 0.7834593713283539, "num_tokens": 445839879.0, "step": 42940 }, { "entropy": 0.6996616244316101, "epoch": 0.3436, "grad_norm": 4.502718448638916, "learning_rate": 3.2833533413365346e-05, "loss": 0.7011, "mean_token_accuracy": 0.8127973318099976, "num_tokens": 445881799.0, "step": 42950 }, { "entropy": 0.6830228507518769, "epoch": 0.34368, "grad_norm": 1.7180036306381226, "learning_rate": 3.282953181272509e-05, "loss": 0.6825, "mean_token_accuracy": 0.791917234659195, "num_tokens": 446044725.0, "step": 42960 }, { "entropy": 0.6931262910366058, "epoch": 0.34376, "grad_norm": 3.114461660385132, "learning_rate": 3.282553021208484e-05, "loss": 0.688, "mean_token_accuracy": 0.8040809214115143, "num_tokens": 446122909.0, "step": 42970 }, { "entropy": 0.6709923505783081, "epoch": 0.34384, "grad_norm": 1.6973580121994019, "learning_rate": 3.282152861144458e-05, "loss": 0.6603, "mean_token_accuracy": 0.8056463897228241, "num_tokens": 446216826.0, "step": 42980 }, { "entropy": 0.6656126976013184, "epoch": 0.34392, "grad_norm": 2.5410547256469727, "learning_rate": 3.281752701080432e-05, "loss": 0.6639, "mean_token_accuracy": 0.8004610657691955, "num_tokens": 446352899.0, "step": 42990 }, { "entropy": 0.6796181410551071, "epoch": 0.344, "grad_norm": 4.683195114135742, "learning_rate": 3.281352541016407e-05, "loss": 0.6667, "mean_token_accuracy": 0.8232348322868347, "num_tokens": 446388127.0, "step": 43000 }, { "entropy": 0.7101831495761871, "epoch": 0.34408, "grad_norm": 1.8333340883255005, "learning_rate": 3.2809523809523815e-05, "loss": 0.7155, "mean_token_accuracy": 0.7813726603984833, "num_tokens": 446545310.0, "step": 43010 }, { "entropy": 0.7316343009471893, "epoch": 0.34416, "grad_norm": 3.611999988555908, "learning_rate": 3.280552220888355e-05, "loss": 0.7227, "mean_token_accuracy": 0.8002341151237488, "num_tokens": 446604186.0, "step": 43020 }, { "entropy": 0.742469334602356, "epoch": 0.34424, "grad_norm": 1.8303660154342651, "learning_rate": 3.2801520608243296e-05, "loss": 0.7387, "mean_token_accuracy": 0.7905532777309418, "num_tokens": 446695865.0, "step": 43030 }, { "entropy": 0.6999586820602417, "epoch": 0.34432, "grad_norm": 3.4704749584198, "learning_rate": 3.279751900760305e-05, "loss": 0.7041, "mean_token_accuracy": 0.7866924285888672, "num_tokens": 446834754.0, "step": 43040 }, { "entropy": 0.6943661212921143, "epoch": 0.3444, "grad_norm": 4.636980056762695, "learning_rate": 3.279351740696279e-05, "loss": 0.6912, "mean_token_accuracy": 0.8167681932449341, "num_tokens": 446874969.0, "step": 43050 }, { "entropy": 0.6291429162025451, "epoch": 0.34448, "grad_norm": 2.2603814601898193, "learning_rate": 3.278951580632253e-05, "loss": 0.6222, "mean_token_accuracy": 0.7998839795589447, "num_tokens": 447038809.0, "step": 43060 }, { "entropy": 0.6984606117010117, "epoch": 0.34456, "grad_norm": 2.580277442932129, "learning_rate": 3.278551420568227e-05, "loss": 0.6953, "mean_token_accuracy": 0.7995408535003662, "num_tokens": 447123193.0, "step": 43070 }, { "entropy": 0.6613956093788147, "epoch": 0.34464, "grad_norm": 1.8699911832809448, "learning_rate": 3.278151260504202e-05, "loss": 0.652, "mean_token_accuracy": 0.8076871991157532, "num_tokens": 447217071.0, "step": 43080 }, { "entropy": 0.6812968671321868, "epoch": 0.34472, "grad_norm": 3.1095893383026123, "learning_rate": 3.2777511004401765e-05, "loss": 0.6818, "mean_token_accuracy": 0.7866366624832153, "num_tokens": 447370294.0, "step": 43090 }, { "entropy": 0.6514096677303314, "epoch": 0.3448, "grad_norm": 4.3834547996521, "learning_rate": 3.27735094037615e-05, "loss": 0.6659, "mean_token_accuracy": 0.8193212866783142, "num_tokens": 447411455.0, "step": 43100 }, { "entropy": 0.6570982575416565, "epoch": 0.34488, "grad_norm": 1.488563060760498, "learning_rate": 3.276950780312125e-05, "loss": 0.6462, "mean_token_accuracy": 0.7933500289916993, "num_tokens": 447575295.0, "step": 43110 }, { "entropy": 0.6731982082128525, "epoch": 0.34496, "grad_norm": 3.0243263244628906, "learning_rate": 3.2765506202480996e-05, "loss": 0.6658, "mean_token_accuracy": 0.8038355827331543, "num_tokens": 447666723.0, "step": 43120 }, { "entropy": 0.6765221178531646, "epoch": 0.34504, "grad_norm": 2.2272043228149414, "learning_rate": 3.276150460184074e-05, "loss": 0.6832, "mean_token_accuracy": 0.7990160942077636, "num_tokens": 447762247.0, "step": 43130 }, { "entropy": 0.698837724328041, "epoch": 0.34512, "grad_norm": 2.2608063220977783, "learning_rate": 3.275750300120048e-05, "loss": 0.6861, "mean_token_accuracy": 0.7967319369316102, "num_tokens": 447883613.0, "step": 43140 }, { "entropy": 0.7656076312065124, "epoch": 0.3452, "grad_norm": 5.095448970794678, "learning_rate": 3.275350140056023e-05, "loss": 0.7719, "mean_token_accuracy": 0.8013746857643127, "num_tokens": 447918263.0, "step": 43150 }, { "entropy": 0.6513091385364532, "epoch": 0.34528, "grad_norm": 2.3176934719085693, "learning_rate": 3.274949979991997e-05, "loss": 0.6479, "mean_token_accuracy": 0.7938996076583862, "num_tokens": 448082103.0, "step": 43160 }, { "entropy": 0.5701131016016007, "epoch": 0.34536, "grad_norm": 3.268982410430908, "learning_rate": 3.2745498199279715e-05, "loss": 0.5583, "mean_token_accuracy": 0.8331372320652009, "num_tokens": 448174134.0, "step": 43170 }, { "entropy": 0.6301431238651276, "epoch": 0.34544, "grad_norm": 1.8855925798416138, "learning_rate": 3.274149659863946e-05, "loss": 0.6447, "mean_token_accuracy": 0.8100700080394745, "num_tokens": 448268263.0, "step": 43180 }, { "entropy": 0.7200656354427337, "epoch": 0.34552, "grad_norm": 2.1806931495666504, "learning_rate": 3.27374949979992e-05, "loss": 0.7131, "mean_token_accuracy": 0.7817758858203888, "num_tokens": 448418773.0, "step": 43190 }, { "entropy": 0.6185190051794052, "epoch": 0.3456, "grad_norm": 4.292586326599121, "learning_rate": 3.2733493397358946e-05, "loss": 0.6135, "mean_token_accuracy": 0.8305230617523194, "num_tokens": 448464829.0, "step": 43200 }, { "entropy": 0.6333631873130798, "epoch": 0.34568, "grad_norm": 1.3432538509368896, "learning_rate": 3.272949179671869e-05, "loss": 0.6311, "mean_token_accuracy": 0.7994076788425446, "num_tokens": 448628669.0, "step": 43210 }, { "entropy": 0.6867225050926209, "epoch": 0.34576, "grad_norm": 3.818568468093872, "learning_rate": 3.2725490196078433e-05, "loss": 0.68, "mean_token_accuracy": 0.8041228950023651, "num_tokens": 448709745.0, "step": 43220 }, { "entropy": 0.7282233774662018, "epoch": 0.34584, "grad_norm": 1.988319754600525, "learning_rate": 3.272148859543818e-05, "loss": 0.7112, "mean_token_accuracy": 0.7963241517543793, "num_tokens": 448803500.0, "step": 43230 }, { "entropy": 0.6897161185741425, "epoch": 0.34592, "grad_norm": 3.421736240386963, "learning_rate": 3.271748699479792e-05, "loss": 0.6902, "mean_token_accuracy": 0.7883034169673919, "num_tokens": 448949378.0, "step": 43240 }, { "entropy": 0.7124107897281646, "epoch": 0.346, "grad_norm": 5.489914417266846, "learning_rate": 3.2713485394157665e-05, "loss": 0.7082, "mean_token_accuracy": 0.8147594630718231, "num_tokens": 448984134.0, "step": 43250 }, { "entropy": 0.6520423352718353, "epoch": 0.34608, "grad_norm": 1.7418544292449951, "learning_rate": 3.270948379351741e-05, "loss": 0.6487, "mean_token_accuracy": 0.7965864717960358, "num_tokens": 449147974.0, "step": 43260 }, { "entropy": 0.6198710381984711, "epoch": 0.34616, "grad_norm": 3.370708703994751, "learning_rate": 3.270548219287715e-05, "loss": 0.6091, "mean_token_accuracy": 0.8199663400650025, "num_tokens": 449248016.0, "step": 43270 }, { "entropy": 0.6393252968788147, "epoch": 0.34624, "grad_norm": 1.4801234006881714, "learning_rate": 3.2701480592236896e-05, "loss": 0.6402, "mean_token_accuracy": 0.8130701899528503, "num_tokens": 449344193.0, "step": 43280 }, { "entropy": 0.6768830358982086, "epoch": 0.34632, "grad_norm": 2.637465238571167, "learning_rate": 3.269747899159664e-05, "loss": 0.6732, "mean_token_accuracy": 0.7923321545124054, "num_tokens": 449482348.0, "step": 43290 }, { "entropy": 0.6928452372550964, "epoch": 0.3464, "grad_norm": 5.408182144165039, "learning_rate": 3.269347739095638e-05, "loss": 0.6931, "mean_token_accuracy": 0.816123628616333, "num_tokens": 449519174.0, "step": 43300 }, { "entropy": 0.6603492438793183, "epoch": 0.34648, "grad_norm": 1.561178207397461, "learning_rate": 3.268947579031613e-05, "loss": 0.6575, "mean_token_accuracy": 0.789643383026123, "num_tokens": 449683014.0, "step": 43310 }, { "entropy": 0.6493176221847534, "epoch": 0.34656, "grad_norm": 3.364264726638794, "learning_rate": 3.268547418967588e-05, "loss": 0.643, "mean_token_accuracy": 0.812283456325531, "num_tokens": 449773733.0, "step": 43320 }, { "entropy": 0.7184621572494507, "epoch": 0.34664, "grad_norm": 1.768973708152771, "learning_rate": 3.2681472589035614e-05, "loss": 0.7028, "mean_token_accuracy": 0.8040735483169555, "num_tokens": 449868584.0, "step": 43330 }, { "entropy": 0.6564631640911103, "epoch": 0.34672, "grad_norm": 2.9172487258911133, "learning_rate": 3.267747098839536e-05, "loss": 0.6583, "mean_token_accuracy": 0.7976446330547333, "num_tokens": 450008188.0, "step": 43340 }, { "entropy": 0.8160139858722687, "epoch": 0.3468, "grad_norm": 8.2091703414917, "learning_rate": 3.26734693877551e-05, "loss": 0.8094, "mean_token_accuracy": 0.788637924194336, "num_tokens": 450043618.0, "step": 43350 }, { "entropy": 0.6835364997386932, "epoch": 0.34688, "grad_norm": 2.146077871322632, "learning_rate": 3.266946778711485e-05, "loss": 0.6811, "mean_token_accuracy": 0.7858329355716706, "num_tokens": 450207458.0, "step": 43360 }, { "entropy": 0.6433830738067627, "epoch": 0.34696, "grad_norm": 3.18760085105896, "learning_rate": 3.266546618647459e-05, "loss": 0.6361, "mean_token_accuracy": 0.8122335493564605, "num_tokens": 450302113.0, "step": 43370 }, { "entropy": 0.7417049586772919, "epoch": 0.34704, "grad_norm": 2.585707187652588, "learning_rate": 3.266146458583433e-05, "loss": 0.7569, "mean_token_accuracy": 0.788035124540329, "num_tokens": 450397539.0, "step": 43380 }, { "entropy": 0.6833308339118958, "epoch": 0.34712, "grad_norm": 2.787621259689331, "learning_rate": 3.265746298519408e-05, "loss": 0.67, "mean_token_accuracy": 0.8019178986549378, "num_tokens": 450519535.0, "step": 43390 }, { "entropy": 0.7228344500064849, "epoch": 0.3472, "grad_norm": 4.334185600280762, "learning_rate": 3.265346138455383e-05, "loss": 0.7229, "mean_token_accuracy": 0.8110022962093353, "num_tokens": 450553145.0, "step": 43400 }, { "entropy": 0.6911342978477478, "epoch": 0.34728, "grad_norm": 1.5888952016830444, "learning_rate": 3.2649459783913564e-05, "loss": 0.6906, "mean_token_accuracy": 0.7845444619655609, "num_tokens": 450716985.0, "step": 43410 }, { "entropy": 0.6866464436054229, "epoch": 0.34736, "grad_norm": 4.635396957397461, "learning_rate": 3.264545818327331e-05, "loss": 0.6746, "mean_token_accuracy": 0.8065112292766571, "num_tokens": 450801594.0, "step": 43420 }, { "entropy": 0.6859981715679169, "epoch": 0.34744, "grad_norm": 1.5568199157714844, "learning_rate": 3.264145658263306e-05, "loss": 0.7009, "mean_token_accuracy": 0.7989930033683776, "num_tokens": 450894378.0, "step": 43430 }, { "entropy": 0.777531236410141, "epoch": 0.34752, "grad_norm": 2.617558002471924, "learning_rate": 3.26374549819928e-05, "loss": 0.779, "mean_token_accuracy": 0.7730464160442352, "num_tokens": 451017609.0, "step": 43440 }, { "entropy": 0.8213249385356903, "epoch": 0.3476, "grad_norm": 4.726444244384766, "learning_rate": 3.263345338135254e-05, "loss": 0.8185, "mean_token_accuracy": 0.7890470921993256, "num_tokens": 451050343.0, "step": 43450 }, { "entropy": 0.6477029979228973, "epoch": 0.34768, "grad_norm": 1.9279093742370605, "learning_rate": 3.262945178071229e-05, "loss": 0.6411, "mean_token_accuracy": 0.7963727414608002, "num_tokens": 451214183.0, "step": 43460 }, { "entropy": 0.6717117249965667, "epoch": 0.34776, "grad_norm": 3.2040817737579346, "learning_rate": 3.262545018007203e-05, "loss": 0.6652, "mean_token_accuracy": 0.8043190002441406, "num_tokens": 451306961.0, "step": 43470 }, { "entropy": 0.6960487604141236, "epoch": 0.34784, "grad_norm": 1.6580891609191895, "learning_rate": 3.262144857943178e-05, "loss": 0.6959, "mean_token_accuracy": 0.8019130170345307, "num_tokens": 451401202.0, "step": 43480 }, { "entropy": 0.6872906625270844, "epoch": 0.34792, "grad_norm": 2.5802161693573, "learning_rate": 3.2617446978791514e-05, "loss": 0.6864, "mean_token_accuracy": 0.7897855579853058, "num_tokens": 451539740.0, "step": 43490 }, { "entropy": 0.7380066573619842, "epoch": 0.348, "grad_norm": 4.021237850189209, "learning_rate": 3.2613445378151264e-05, "loss": 0.7343, "mean_token_accuracy": 0.8003056287765503, "num_tokens": 451580056.0, "step": 43500 }, { "entropy": 0.5988789439201355, "epoch": 0.34808, "grad_norm": 1.5744820833206177, "learning_rate": 3.260944377751101e-05, "loss": 0.595, "mean_token_accuracy": 0.8075537383556366, "num_tokens": 451743896.0, "step": 43510 }, { "entropy": 0.7394289791584014, "epoch": 0.34816, "grad_norm": 3.848562479019165, "learning_rate": 3.260544217687075e-05, "loss": 0.7268, "mean_token_accuracy": 0.7924438059329987, "num_tokens": 451839659.0, "step": 43520 }, { "entropy": 0.7509140551090241, "epoch": 0.34824, "grad_norm": 2.5499470233917236, "learning_rate": 3.2601440576230495e-05, "loss": 0.7439, "mean_token_accuracy": 0.7909041285514832, "num_tokens": 451932828.0, "step": 43530 }, { "entropy": 0.6740683555603028, "epoch": 0.34832, "grad_norm": 2.9055275917053223, "learning_rate": 3.259743897559024e-05, "loss": 0.6791, "mean_token_accuracy": 0.7931153357028962, "num_tokens": 452079899.0, "step": 43540 }, { "entropy": 0.7257159650325775, "epoch": 0.3484, "grad_norm": 3.695627450942993, "learning_rate": 3.259343737494998e-05, "loss": 0.7173, "mean_token_accuracy": 0.8069205760955811, "num_tokens": 452123292.0, "step": 43550 }, { "entropy": 0.6786695182323456, "epoch": 0.34848, "grad_norm": 1.4686012268066406, "learning_rate": 3.2589435774309727e-05, "loss": 0.6772, "mean_token_accuracy": 0.7838544309139251, "num_tokens": 452287132.0, "step": 43560 }, { "entropy": 0.6652124494314193, "epoch": 0.34856, "grad_norm": 2.365098714828491, "learning_rate": 3.258543417366947e-05, "loss": 0.6624, "mean_token_accuracy": 0.8035508930683136, "num_tokens": 452398659.0, "step": 43570 }, { "entropy": 0.7212514221668244, "epoch": 0.34864, "grad_norm": 1.7738227844238281, "learning_rate": 3.2581432573029214e-05, "loss": 0.7205, "mean_token_accuracy": 0.7996475994586945, "num_tokens": 452495238.0, "step": 43580 }, { "entropy": 0.6590900391340255, "epoch": 0.34872, "grad_norm": 2.4837746620178223, "learning_rate": 3.257743097238896e-05, "loss": 0.6626, "mean_token_accuracy": 0.7983704626560211, "num_tokens": 452625455.0, "step": 43590 }, { "entropy": 0.7477512419223785, "epoch": 0.3488, "grad_norm": 4.681305885314941, "learning_rate": 3.25734293717487e-05, "loss": 0.7518, "mean_token_accuracy": 0.8070102810859681, "num_tokens": 452658957.0, "step": 43600 }, { "entropy": 0.6717670530080795, "epoch": 0.34888, "grad_norm": 2.3037919998168945, "learning_rate": 3.2569427771108445e-05, "loss": 0.6595, "mean_token_accuracy": 0.7917996942996979, "num_tokens": 452820398.0, "step": 43610 }, { "entropy": 0.7152568161487579, "epoch": 0.34896, "grad_norm": 3.618093252182007, "learning_rate": 3.256542617046819e-05, "loss": 0.7035, "mean_token_accuracy": 0.8004279732704163, "num_tokens": 452901795.0, "step": 43620 }, { "entropy": 0.6670766234397888, "epoch": 0.34904, "grad_norm": 2.23457670211792, "learning_rate": 3.256142456982793e-05, "loss": 0.6669, "mean_token_accuracy": 0.8064318239688874, "num_tokens": 452997889.0, "step": 43630 }, { "entropy": 0.6654040217399597, "epoch": 0.34912, "grad_norm": 2.4699900150299072, "learning_rate": 3.2557422969187676e-05, "loss": 0.6601, "mean_token_accuracy": 0.7964942812919616, "num_tokens": 453146562.0, "step": 43640 }, { "entropy": 0.6349775493144989, "epoch": 0.3492, "grad_norm": 4.203359603881836, "learning_rate": 3.255342136854742e-05, "loss": 0.6335, "mean_token_accuracy": 0.8263629019260407, "num_tokens": 453187830.0, "step": 43650 }, { "entropy": 0.6536146193742752, "epoch": 0.34928, "grad_norm": 1.3576698303222656, "learning_rate": 3.2549419767907164e-05, "loss": 0.6624, "mean_token_accuracy": 0.7917684435844421, "num_tokens": 453351670.0, "step": 43660 }, { "entropy": 0.793049818277359, "epoch": 0.34936, "grad_norm": 3.3791391849517822, "learning_rate": 3.254541816726691e-05, "loss": 0.7806, "mean_token_accuracy": 0.786422199010849, "num_tokens": 453430316.0, "step": 43670 }, { "entropy": 0.6688565731048584, "epoch": 0.34944, "grad_norm": 1.9275867938995361, "learning_rate": 3.254141656662665e-05, "loss": 0.6707, "mean_token_accuracy": 0.8106909453868866, "num_tokens": 453522852.0, "step": 43680 }, { "entropy": 0.7262610822916031, "epoch": 0.34952, "grad_norm": 2.703155279159546, "learning_rate": 3.2537414965986395e-05, "loss": 0.7207, "mean_token_accuracy": 0.7819111704826355, "num_tokens": 453669535.0, "step": 43690 }, { "entropy": 0.6576280385255814, "epoch": 0.3496, "grad_norm": 4.892335891723633, "learning_rate": 3.253341336534614e-05, "loss": 0.6443, "mean_token_accuracy": 0.8217665195465088, "num_tokens": 453714755.0, "step": 43700 }, { "entropy": 0.6468155145645141, "epoch": 0.34968, "grad_norm": 1.8153996467590332, "learning_rate": 3.252941176470589e-05, "loss": 0.6472, "mean_token_accuracy": 0.7944614052772522, "num_tokens": 453878595.0, "step": 43710 }, { "entropy": 0.6410993814468384, "epoch": 0.34976, "grad_norm": 3.039710760116577, "learning_rate": 3.2525410164065626e-05, "loss": 0.6372, "mean_token_accuracy": 0.8191541969776154, "num_tokens": 453966556.0, "step": 43720 }, { "entropy": 0.7135389268398284, "epoch": 0.34984, "grad_norm": 2.662668466567993, "learning_rate": 3.252140856342537e-05, "loss": 0.7057, "mean_token_accuracy": 0.791243314743042, "num_tokens": 454061528.0, "step": 43730 }, { "entropy": 0.7196810483932495, "epoch": 0.34992, "grad_norm": 3.074716329574585, "learning_rate": 3.251740696278511e-05, "loss": 0.7209, "mean_token_accuracy": 0.7846421003341675, "num_tokens": 454198135.0, "step": 43740 }, { "entropy": 0.7269715487957, "epoch": 0.35, "grad_norm": 5.478509426116943, "learning_rate": 3.2513405362144864e-05, "loss": 0.7208, "mean_token_accuracy": 0.808404016494751, "num_tokens": 454237063.0, "step": 43750 }, { "entropy": 0.6692389369010925, "epoch": 0.35008, "grad_norm": 1.6199219226837158, "learning_rate": 3.25094037615046e-05, "loss": 0.6677, "mean_token_accuracy": 0.7909432888031006, "num_tokens": 454400511.0, "step": 43760 }, { "entropy": 0.6842547833919526, "epoch": 0.35016, "grad_norm": 3.1422157287597656, "learning_rate": 3.2505402160864344e-05, "loss": 0.6814, "mean_token_accuracy": 0.8085704386234284, "num_tokens": 454479311.0, "step": 43770 }, { "entropy": 0.6804643869400024, "epoch": 0.35024, "grad_norm": 1.716996431350708, "learning_rate": 3.2501400560224095e-05, "loss": 0.6744, "mean_token_accuracy": 0.803778064250946, "num_tokens": 454573969.0, "step": 43780 }, { "entropy": 0.6920981705188751, "epoch": 0.35032, "grad_norm": 3.415592908859253, "learning_rate": 3.249739895958384e-05, "loss": 0.6851, "mean_token_accuracy": 0.7885104894638062, "num_tokens": 454717192.0, "step": 43790 }, { "entropy": 0.6838442951440811, "epoch": 0.3504, "grad_norm": 4.009577751159668, "learning_rate": 3.2493397358943576e-05, "loss": 0.6876, "mean_token_accuracy": 0.8231146931648254, "num_tokens": 454755089.0, "step": 43800 }, { "entropy": 0.6566973567008972, "epoch": 0.35048, "grad_norm": 1.94611656665802, "learning_rate": 3.248939575830332e-05, "loss": 0.6516, "mean_token_accuracy": 0.7960460722446442, "num_tokens": 454917471.0, "step": 43810 }, { "entropy": 0.7443434715270996, "epoch": 0.35056, "grad_norm": 3.024561882019043, "learning_rate": 3.248539415766307e-05, "loss": 0.7381, "mean_token_accuracy": 0.7949434757232666, "num_tokens": 454990937.0, "step": 43820 }, { "entropy": 0.7488255500793457, "epoch": 0.35064, "grad_norm": 2.5717380046844482, "learning_rate": 3.2481392557022814e-05, "loss": 0.7539, "mean_token_accuracy": 0.789767736196518, "num_tokens": 455083000.0, "step": 43830 }, { "entropy": 0.656521451473236, "epoch": 0.35072, "grad_norm": 3.3498470783233643, "learning_rate": 3.247739095638255e-05, "loss": 0.6552, "mean_token_accuracy": 0.7999741554260253, "num_tokens": 455219829.0, "step": 43840 }, { "entropy": 0.7286197543144226, "epoch": 0.3508, "grad_norm": 5.393115043640137, "learning_rate": 3.24733893557423e-05, "loss": 0.7298, "mean_token_accuracy": 0.8121844589710235, "num_tokens": 455261757.0, "step": 43850 }, { "entropy": 0.673205417394638, "epoch": 0.35088, "grad_norm": 1.38970947265625, "learning_rate": 3.2469387755102045e-05, "loss": 0.6722, "mean_token_accuracy": 0.7876115083694458, "num_tokens": 455425223.0, "step": 43860 }, { "entropy": 0.6414714574813842, "epoch": 0.35096, "grad_norm": 4.085360050201416, "learning_rate": 3.246538615446179e-05, "loss": 0.6317, "mean_token_accuracy": 0.8151678562164306, "num_tokens": 455505728.0, "step": 43870 }, { "entropy": 0.6704039096832275, "epoch": 0.35104, "grad_norm": 2.193186044692993, "learning_rate": 3.2461384553821525e-05, "loss": 0.6757, "mean_token_accuracy": 0.8043546199798584, "num_tokens": 455600679.0, "step": 43880 }, { "entropy": 0.6607210338115692, "epoch": 0.35112, "grad_norm": 3.178318738937378, "learning_rate": 3.2457382953181276e-05, "loss": 0.6497, "mean_token_accuracy": 0.7988711178302765, "num_tokens": 455737273.0, "step": 43890 }, { "entropy": 0.7314483970403671, "epoch": 0.3512, "grad_norm": 4.484324932098389, "learning_rate": 3.245338135254102e-05, "loss": 0.7286, "mean_token_accuracy": 0.8054883420467377, "num_tokens": 455769557.0, "step": 43900 }, { "entropy": 0.6859910428524018, "epoch": 0.35128, "grad_norm": 2.159787654876709, "learning_rate": 3.244937975190076e-05, "loss": 0.6886, "mean_token_accuracy": 0.7852157294750214, "num_tokens": 455930916.0, "step": 43910 }, { "entropy": 0.6666322886943817, "epoch": 0.35136, "grad_norm": 3.6745426654815674, "learning_rate": 3.244537815126051e-05, "loss": 0.6477, "mean_token_accuracy": 0.8167730808258057, "num_tokens": 456008005.0, "step": 43920 }, { "entropy": 0.7024967312812805, "epoch": 0.35144, "grad_norm": 2.0379202365875244, "learning_rate": 3.244137655062025e-05, "loss": 0.6926, "mean_token_accuracy": 0.7988697052001953, "num_tokens": 456101595.0, "step": 43930 }, { "entropy": 0.6487274289131164, "epoch": 0.35152, "grad_norm": 2.9859087467193604, "learning_rate": 3.2437374949979994e-05, "loss": 0.6458, "mean_token_accuracy": 0.8015651106834412, "num_tokens": 456232226.0, "step": 43940 }, { "entropy": 0.6504885703325272, "epoch": 0.3516, "grad_norm": 3.874951124191284, "learning_rate": 3.243337334933974e-05, "loss": 0.6395, "mean_token_accuracy": 0.8224169433116912, "num_tokens": 456270813.0, "step": 43950 }, { "entropy": 0.6671985924243927, "epoch": 0.35168, "grad_norm": 2.7552080154418945, "learning_rate": 3.242937174869948e-05, "loss": 0.6686, "mean_token_accuracy": 0.7887737214565277, "num_tokens": 456434440.0, "step": 43960 }, { "entropy": 0.6892648160457611, "epoch": 0.35176, "grad_norm": 3.1037306785583496, "learning_rate": 3.2425370148059226e-05, "loss": 0.6784, "mean_token_accuracy": 0.8082211554050446, "num_tokens": 456515377.0, "step": 43970 }, { "entropy": 0.7445252478122711, "epoch": 0.35184, "grad_norm": 2.607933759689331, "learning_rate": 3.242136854741897e-05, "loss": 0.7419, "mean_token_accuracy": 0.791281932592392, "num_tokens": 456610196.0, "step": 43980 }, { "entropy": 0.7384205400943756, "epoch": 0.35192, "grad_norm": 2.0459868907928467, "learning_rate": 3.241736694677871e-05, "loss": 0.7374, "mean_token_accuracy": 0.7809876263141632, "num_tokens": 456743522.0, "step": 43990 }, { "entropy": 0.657304972410202, "epoch": 0.352, "grad_norm": 5.155731201171875, "learning_rate": 3.241336534613846e-05, "loss": 0.6587, "mean_token_accuracy": 0.8185275137424469, "num_tokens": 456779056.0, "step": 44000 }, { "entropy": 0.7085324227809906, "epoch": 0.35208, "grad_norm": 2.4830567836761475, "learning_rate": 3.24093637454982e-05, "loss": 0.7096, "mean_token_accuracy": 0.7832399904727936, "num_tokens": 456942334.0, "step": 44010 }, { "entropy": 0.7026634573936462, "epoch": 0.35216, "grad_norm": 3.0089666843414307, "learning_rate": 3.2405362144857944e-05, "loss": 0.6959, "mean_token_accuracy": 0.8060937285423279, "num_tokens": 457014939.0, "step": 44020 }, { "entropy": 0.6878491878509522, "epoch": 0.35224, "grad_norm": 2.249444007873535, "learning_rate": 3.240136054421769e-05, "loss": 0.6684, "mean_token_accuracy": 0.8042283713817596, "num_tokens": 457108050.0, "step": 44030 }, { "entropy": 0.7073160588741303, "epoch": 0.35232, "grad_norm": 2.3881607055664062, "learning_rate": 3.239735894357743e-05, "loss": 0.7023, "mean_token_accuracy": 0.7903549432754516, "num_tokens": 457244065.0, "step": 44040 }, { "entropy": 0.6757444143295288, "epoch": 0.3524, "grad_norm": 5.8967604637146, "learning_rate": 3.2393357342937175e-05, "loss": 0.6837, "mean_token_accuracy": 0.8137237906455994, "num_tokens": 457288264.0, "step": 44050 }, { "entropy": 0.6041391283273697, "epoch": 0.35248, "grad_norm": 1.6015516519546509, "learning_rate": 3.2389355742296926e-05, "loss": 0.6025, "mean_token_accuracy": 0.8067476809024811, "num_tokens": 457452104.0, "step": 44060 }, { "entropy": 0.6852901816368103, "epoch": 0.35256, "grad_norm": 3.9044787883758545, "learning_rate": 3.238535414165666e-05, "loss": 0.6719, "mean_token_accuracy": 0.8061717987060547, "num_tokens": 457536358.0, "step": 44070 }, { "entropy": 0.8068065702915191, "epoch": 0.35264, "grad_norm": 1.9369356632232666, "learning_rate": 3.2381352541016406e-05, "loss": 0.8009, "mean_token_accuracy": 0.7738951921463013, "num_tokens": 457630830.0, "step": 44080 }, { "entropy": 0.7816553592681885, "epoch": 0.35272, "grad_norm": 2.4236037731170654, "learning_rate": 3.237735094037615e-05, "loss": 0.7672, "mean_token_accuracy": 0.7739278972148895, "num_tokens": 457764961.0, "step": 44090 }, { "entropy": 0.6978663682937623, "epoch": 0.3528, "grad_norm": 4.556518077850342, "learning_rate": 3.23733493397359e-05, "loss": 0.7129, "mean_token_accuracy": 0.806897759437561, "num_tokens": 457803926.0, "step": 44100 }, { "entropy": 0.6247132360935211, "epoch": 0.35288, "grad_norm": 1.7294594049453735, "learning_rate": 3.236934773909564e-05, "loss": 0.6197, "mean_token_accuracy": 0.8009465098381042, "num_tokens": 457967766.0, "step": 44110 }, { "entropy": 0.6908395826816559, "epoch": 0.35296, "grad_norm": 2.4588894844055176, "learning_rate": 3.236534613845538e-05, "loss": 0.6897, "mean_token_accuracy": 0.7959310054779053, "num_tokens": 458062214.0, "step": 44120 }, { "entropy": 0.6362902313470841, "epoch": 0.35304, "grad_norm": 1.9126152992248535, "learning_rate": 3.236134453781513e-05, "loss": 0.6343, "mean_token_accuracy": 0.8161371171474456, "num_tokens": 458157530.0, "step": 44130 }, { "entropy": 0.632801903784275, "epoch": 0.35312, "grad_norm": 2.3499555587768555, "learning_rate": 3.2357342937174875e-05, "loss": 0.6214, "mean_token_accuracy": 0.8105327308177948, "num_tokens": 458289757.0, "step": 44140 }, { "entropy": 0.629379665851593, "epoch": 0.3532, "grad_norm": 5.598528861999512, "learning_rate": 3.235334133653461e-05, "loss": 0.6251, "mean_token_accuracy": 0.8285751640796661, "num_tokens": 458324781.0, "step": 44150 }, { "entropy": 0.6453611612319946, "epoch": 0.35328, "grad_norm": 2.42568039894104, "learning_rate": 3.2349339735894356e-05, "loss": 0.6458, "mean_token_accuracy": 0.7966963410377502, "num_tokens": 458484717.0, "step": 44160 }, { "entropy": 0.6538681387901306, "epoch": 0.35336, "grad_norm": 4.279183387756348, "learning_rate": 3.2345338135254107e-05, "loss": 0.6497, "mean_token_accuracy": 0.8169251322746277, "num_tokens": 458544829.0, "step": 44170 }, { "entropy": 0.7197427570819854, "epoch": 0.35344, "grad_norm": 1.795357584953308, "learning_rate": 3.234133653461385e-05, "loss": 0.7272, "mean_token_accuracy": 0.7925461590290069, "num_tokens": 458636082.0, "step": 44180 }, { "entropy": 0.6691724061965942, "epoch": 0.35352, "grad_norm": 2.5300607681274414, "learning_rate": 3.233733493397359e-05, "loss": 0.6686, "mean_token_accuracy": 0.795178908109665, "num_tokens": 458776519.0, "step": 44190 }, { "entropy": 0.7103238224983215, "epoch": 0.3536, "grad_norm": 4.459480285644531, "learning_rate": 3.233333333333333e-05, "loss": 0.6967, "mean_token_accuracy": 0.8164365351200104, "num_tokens": 458813355.0, "step": 44200 }, { "entropy": 0.683168214559555, "epoch": 0.35368, "grad_norm": 1.5724912881851196, "learning_rate": 3.232933173269308e-05, "loss": 0.6901, "mean_token_accuracy": 0.7866387665271759, "num_tokens": 458976950.0, "step": 44210 }, { "entropy": 0.7101587295532227, "epoch": 0.35376, "grad_norm": 3.164057970046997, "learning_rate": 3.2325330132052825e-05, "loss": 0.7014, "mean_token_accuracy": 0.8057584345340729, "num_tokens": 459046174.0, "step": 44220 }, { "entropy": 0.7273330867290497, "epoch": 0.35384, "grad_norm": 2.28837251663208, "learning_rate": 3.232132853141256e-05, "loss": 0.7242, "mean_token_accuracy": 0.7918701291084289, "num_tokens": 459139049.0, "step": 44230 }, { "entropy": 0.7151725471019745, "epoch": 0.35392, "grad_norm": 1.9954490661621094, "learning_rate": 3.231732693077231e-05, "loss": 0.7102, "mean_token_accuracy": 0.7841633677482605, "num_tokens": 459278034.0, "step": 44240 }, { "entropy": 0.6783519178628922, "epoch": 0.354, "grad_norm": 4.143818378448486, "learning_rate": 3.2313325330132056e-05, "loss": 0.6808, "mean_token_accuracy": 0.819880485534668, "num_tokens": 459318255.0, "step": 44250 }, { "entropy": 0.6954020678997039, "epoch": 0.35408, "grad_norm": 2.8950674533843994, "learning_rate": 3.23093237294918e-05, "loss": 0.6968, "mean_token_accuracy": 0.7787921488285064, "num_tokens": 459482095.0, "step": 44260 }, { "entropy": 0.6484593331813813, "epoch": 0.35416, "grad_norm": 3.810166358947754, "learning_rate": 3.230532212885154e-05, "loss": 0.635, "mean_token_accuracy": 0.812953644990921, "num_tokens": 459573376.0, "step": 44270 }, { "entropy": 0.659238874912262, "epoch": 0.35424, "grad_norm": 2.107288122177124, "learning_rate": 3.230132052821129e-05, "loss": 0.6572, "mean_token_accuracy": 0.8109182476997375, "num_tokens": 459668337.0, "step": 44280 }, { "entropy": 0.7047430574893951, "epoch": 0.35432, "grad_norm": 3.2193994522094727, "learning_rate": 3.229731892757103e-05, "loss": 0.6967, "mean_token_accuracy": 0.7865885972976685, "num_tokens": 459826015.0, "step": 44290 }, { "entropy": 0.6480253666639328, "epoch": 0.3544, "grad_norm": 4.390379428863525, "learning_rate": 3.2293317326930775e-05, "loss": 0.6424, "mean_token_accuracy": 0.8236777722835541, "num_tokens": 459878754.0, "step": 44300 }, { "entropy": 0.6669659852981568, "epoch": 0.35448, "grad_norm": 2.3863370418548584, "learning_rate": 3.228931572629052e-05, "loss": 0.6668, "mean_token_accuracy": 0.7893502652645111, "num_tokens": 460042594.0, "step": 44310 }, { "entropy": 0.6612113177776336, "epoch": 0.35456, "grad_norm": 3.3503024578094482, "learning_rate": 3.228531412565026e-05, "loss": 0.6498, "mean_token_accuracy": 0.8136255860328674, "num_tokens": 460123504.0, "step": 44320 }, { "entropy": 0.6486345201730728, "epoch": 0.35464, "grad_norm": 1.960463285446167, "learning_rate": 3.2281312525010006e-05, "loss": 0.647, "mean_token_accuracy": 0.8086575627326965, "num_tokens": 460216619.0, "step": 44330 }, { "entropy": 0.6709145903587341, "epoch": 0.35472, "grad_norm": 3.6039161682128906, "learning_rate": 3.227731092436975e-05, "loss": 0.6662, "mean_token_accuracy": 0.7964960277080536, "num_tokens": 460346752.0, "step": 44340 }, { "entropy": 0.7035406410694123, "epoch": 0.3548, "grad_norm": 3.922393560409546, "learning_rate": 3.2273309323729493e-05, "loss": 0.6851, "mean_token_accuracy": 0.8151979386806488, "num_tokens": 460386291.0, "step": 44350 }, { "entropy": 0.654712799191475, "epoch": 0.35488, "grad_norm": 1.8913501501083374, "learning_rate": 3.226930772308924e-05, "loss": 0.66, "mean_token_accuracy": 0.7920554459095002, "num_tokens": 460550131.0, "step": 44360 }, { "entropy": 0.6379908174276352, "epoch": 0.35496, "grad_norm": 4.476908206939697, "learning_rate": 3.226530612244898e-05, "loss": 0.6387, "mean_token_accuracy": 0.8140453577041626, "num_tokens": 460635616.0, "step": 44370 }, { "entropy": 0.7142340064048767, "epoch": 0.35504, "grad_norm": 2.7831876277923584, "learning_rate": 3.2261304521808725e-05, "loss": 0.7093, "mean_token_accuracy": 0.7976178884506225, "num_tokens": 460728461.0, "step": 44380 }, { "entropy": 0.6976864397525787, "epoch": 0.35512, "grad_norm": 2.171734571456909, "learning_rate": 3.225730292116847e-05, "loss": 0.7024, "mean_token_accuracy": 0.7874964416027069, "num_tokens": 460860005.0, "step": 44390 }, { "entropy": 0.6414459109306335, "epoch": 0.3552, "grad_norm": 5.600785732269287, "learning_rate": 3.225330132052821e-05, "loss": 0.6408, "mean_token_accuracy": 0.8288912117481232, "num_tokens": 460899567.0, "step": 44400 }, { "entropy": 0.6949476540088654, "epoch": 0.35528, "grad_norm": 1.4443448781967163, "learning_rate": 3.2249299719887956e-05, "loss": 0.6977, "mean_token_accuracy": 0.7911272644996643, "num_tokens": 461063407.0, "step": 44410 }, { "entropy": 0.6179213672876358, "epoch": 0.35536, "grad_norm": 3.7282893657684326, "learning_rate": 3.22452981192477e-05, "loss": 0.6127, "mean_token_accuracy": 0.8184221088886261, "num_tokens": 461144449.0, "step": 44420 }, { "entropy": 0.7007597863674164, "epoch": 0.35544, "grad_norm": 1.3776803016662598, "learning_rate": 3.224129651860744e-05, "loss": 0.6872, "mean_token_accuracy": 0.806399142742157, "num_tokens": 461238403.0, "step": 44430 }, { "entropy": 0.6857496201992035, "epoch": 0.35552, "grad_norm": 2.002079486846924, "learning_rate": 3.223729491796719e-05, "loss": 0.6852, "mean_token_accuracy": 0.7890484035015106, "num_tokens": 461375724.0, "step": 44440 }, { "entropy": 0.6453664153814316, "epoch": 0.3556, "grad_norm": 4.441694259643555, "learning_rate": 3.223329331732694e-05, "loss": 0.6446, "mean_token_accuracy": 0.8333254754543304, "num_tokens": 461413260.0, "step": 44450 }, { "entropy": 0.636895477771759, "epoch": 0.35568, "grad_norm": 1.8811681270599365, "learning_rate": 3.2229291716686674e-05, "loss": 0.6322, "mean_token_accuracy": 0.800090742111206, "num_tokens": 461576895.0, "step": 44460 }, { "entropy": 0.6953371942043305, "epoch": 0.35576, "grad_norm": 3.675203323364258, "learning_rate": 3.222529011604642e-05, "loss": 0.6934, "mean_token_accuracy": 0.8036539554595947, "num_tokens": 461657107.0, "step": 44470 }, { "entropy": 0.6939321458339691, "epoch": 0.35584, "grad_norm": 1.852442741394043, "learning_rate": 3.222128851540616e-05, "loss": 0.6926, "mean_token_accuracy": 0.7990390002727509, "num_tokens": 461749880.0, "step": 44480 }, { "entropy": 0.6633140087127686, "epoch": 0.35592, "grad_norm": 3.4768335819244385, "learning_rate": 3.221728691476591e-05, "loss": 0.6608, "mean_token_accuracy": 0.7981104016304016, "num_tokens": 461893935.0, "step": 44490 }, { "entropy": 0.6131524741649628, "epoch": 0.356, "grad_norm": 5.3057475090026855, "learning_rate": 3.221328531412565e-05, "loss": 0.6068, "mean_token_accuracy": 0.8330170691013337, "num_tokens": 461934812.0, "step": 44500 }, { "entropy": 0.6701534509658813, "epoch": 0.35608, "grad_norm": 1.8971806764602661, "learning_rate": 3.220928371348539e-05, "loss": 0.6649, "mean_token_accuracy": 0.7901127576828003, "num_tokens": 462098592.0, "step": 44510 }, { "entropy": 0.6798520147800445, "epoch": 0.35616, "grad_norm": 3.9414126873016357, "learning_rate": 3.220528211284514e-05, "loss": 0.6746, "mean_token_accuracy": 0.8054875731468201, "num_tokens": 462190072.0, "step": 44520 }, { "entropy": 0.6654811143875122, "epoch": 0.35624, "grad_norm": 1.620293378829956, "learning_rate": 3.220128051220489e-05, "loss": 0.6725, "mean_token_accuracy": 0.8050237178802491, "num_tokens": 462285325.0, "step": 44530 }, { "entropy": 0.7707163870334626, "epoch": 0.35632, "grad_norm": 2.4944095611572266, "learning_rate": 3.2197278911564624e-05, "loss": 0.7555, "mean_token_accuracy": 0.7761540353298187, "num_tokens": 462415727.0, "step": 44540 }, { "entropy": 0.6898195087909699, "epoch": 0.3564, "grad_norm": 4.491970062255859, "learning_rate": 3.219327731092437e-05, "loss": 0.6819, "mean_token_accuracy": 0.8182611644268036, "num_tokens": 462452386.0, "step": 44550 }, { "entropy": 0.6589364111423492, "epoch": 0.35648, "grad_norm": 2.0946829319000244, "learning_rate": 3.218927571028412e-05, "loss": 0.66, "mean_token_accuracy": 0.7882510960102082, "num_tokens": 462616226.0, "step": 44560 }, { "entropy": 0.7256138443946838, "epoch": 0.35656, "grad_norm": 3.2814176082611084, "learning_rate": 3.218527410964386e-05, "loss": 0.7241, "mean_token_accuracy": 0.7944461762905121, "num_tokens": 462706544.0, "step": 44570 }, { "entropy": 0.7118258357048035, "epoch": 0.35664, "grad_norm": 1.5032143592834473, "learning_rate": 3.21812725090036e-05, "loss": 0.6923, "mean_token_accuracy": 0.7991929829120636, "num_tokens": 462800102.0, "step": 44580 }, { "entropy": 0.7132286548614502, "epoch": 0.35672, "grad_norm": 2.3106789588928223, "learning_rate": 3.217727090836335e-05, "loss": 0.7063, "mean_token_accuracy": 0.7894313335418701, "num_tokens": 462930202.0, "step": 44590 }, { "entropy": 0.6898552477359772, "epoch": 0.3568, "grad_norm": 4.6095967292785645, "learning_rate": 3.217326930772309e-05, "loss": 0.6943, "mean_token_accuracy": 0.8147911131381989, "num_tokens": 462964493.0, "step": 44600 }, { "entropy": 0.641590741276741, "epoch": 0.35688, "grad_norm": 1.5416042804718018, "learning_rate": 3.216926770708284e-05, "loss": 0.6414, "mean_token_accuracy": 0.8006584167480468, "num_tokens": 463128332.0, "step": 44610 }, { "entropy": 0.6644994616508484, "epoch": 0.35696, "grad_norm": 3.1294355392456055, "learning_rate": 3.2165266106442574e-05, "loss": 0.6708, "mean_token_accuracy": 0.8093185067176819, "num_tokens": 463213662.0, "step": 44620 }, { "entropy": 0.7438838064670563, "epoch": 0.35704, "grad_norm": 1.681151032447815, "learning_rate": 3.2161264505802324e-05, "loss": 0.7369, "mean_token_accuracy": 0.7896659135818481, "num_tokens": 463308884.0, "step": 44630 }, { "entropy": 0.6864937007427215, "epoch": 0.35712, "grad_norm": 2.1461503505706787, "learning_rate": 3.215726290516207e-05, "loss": 0.7074, "mean_token_accuracy": 0.7858795523643494, "num_tokens": 463460420.0, "step": 44640 }, { "entropy": 0.6465192407369613, "epoch": 0.3572, "grad_norm": 4.418943405151367, "learning_rate": 3.215326130452181e-05, "loss": 0.6321, "mean_token_accuracy": 0.8269768953323364, "num_tokens": 463507009.0, "step": 44650 }, { "entropy": 0.6317429006099701, "epoch": 0.35728, "grad_norm": 1.3430362939834595, "learning_rate": 3.2149259703881555e-05, "loss": 0.6286, "mean_token_accuracy": 0.7974719107151031, "num_tokens": 463670849.0, "step": 44660 }, { "entropy": 0.7019365310668946, "epoch": 0.35736, "grad_norm": 5.192697525024414, "learning_rate": 3.21452581032413e-05, "loss": 0.7045, "mean_token_accuracy": 0.8019921720027924, "num_tokens": 463746916.0, "step": 44670 }, { "entropy": 0.6753624498844146, "epoch": 0.35744, "grad_norm": 1.604204773902893, "learning_rate": 3.214125650260104e-05, "loss": 0.6639, "mean_token_accuracy": 0.8070875465869903, "num_tokens": 463841042.0, "step": 44680 }, { "entropy": 0.7044304221868515, "epoch": 0.35752, "grad_norm": 2.2352592945098877, "learning_rate": 3.2137254901960786e-05, "loss": 0.6964, "mean_token_accuracy": 0.7870619416236877, "num_tokens": 463983350.0, "step": 44690 }, { "entropy": 0.6671429872512817, "epoch": 0.3576, "grad_norm": 4.760208606719971, "learning_rate": 3.213325330132053e-05, "loss": 0.6754, "mean_token_accuracy": 0.8191865921020508, "num_tokens": 464018373.0, "step": 44700 }, { "entropy": 0.6776389360427857, "epoch": 0.35768, "grad_norm": 2.213040828704834, "learning_rate": 3.2129251700680274e-05, "loss": 0.6784, "mean_token_accuracy": 0.788043487071991, "num_tokens": 464182213.0, "step": 44710 }, { "entropy": 0.7388453483581543, "epoch": 0.35776, "grad_norm": 3.2734501361846924, "learning_rate": 3.212525010004002e-05, "loss": 0.7281, "mean_token_accuracy": 0.7939410209655762, "num_tokens": 464264342.0, "step": 44720 }, { "entropy": 0.6737010598182678, "epoch": 0.35784, "grad_norm": 1.4395129680633545, "learning_rate": 3.212124849939976e-05, "loss": 0.6788, "mean_token_accuracy": 0.8033208608627319, "num_tokens": 464356683.0, "step": 44730 }, { "entropy": 0.7689559757709503, "epoch": 0.35792, "grad_norm": 2.585561513900757, "learning_rate": 3.2117246898759505e-05, "loss": 0.7646, "mean_token_accuracy": 0.7729054510593414, "num_tokens": 464489190.0, "step": 44740 }, { "entropy": 0.699128520488739, "epoch": 0.358, "grad_norm": 5.676526069641113, "learning_rate": 3.211324529811925e-05, "loss": 0.7006, "mean_token_accuracy": 0.8150096654891967, "num_tokens": 464526707.0, "step": 44750 }, { "entropy": 0.6411950886249542, "epoch": 0.35808, "grad_norm": 1.7655119895935059, "learning_rate": 3.210924369747899e-05, "loss": 0.6373, "mean_token_accuracy": 0.7944857776165009, "num_tokens": 464690528.0, "step": 44760 }, { "entropy": 0.6568853914737701, "epoch": 0.35816, "grad_norm": 4.426377773284912, "learning_rate": 3.2105242096838736e-05, "loss": 0.6517, "mean_token_accuracy": 0.8113075077533722, "num_tokens": 464772643.0, "step": 44770 }, { "entropy": 0.7283334732055664, "epoch": 0.35824, "grad_norm": 3.029571771621704, "learning_rate": 3.210124049619848e-05, "loss": 0.7221, "mean_token_accuracy": 0.7968691170215607, "num_tokens": 464866203.0, "step": 44780 }, { "entropy": 0.7361651599407196, "epoch": 0.35832, "grad_norm": 2.290057897567749, "learning_rate": 3.2097238895558224e-05, "loss": 0.7321, "mean_token_accuracy": 0.7773764669895172, "num_tokens": 465002193.0, "step": 44790 }, { "entropy": 0.7256452023983002, "epoch": 0.3584, "grad_norm": 4.3069915771484375, "learning_rate": 3.209323729491797e-05, "loss": 0.7134, "mean_token_accuracy": 0.8126459658145905, "num_tokens": 465043069.0, "step": 44800 }, { "entropy": 0.6596769571304322, "epoch": 0.35848, "grad_norm": 1.585774540901184, "learning_rate": 3.208923569427771e-05, "loss": 0.6627, "mean_token_accuracy": 0.7916688919067383, "num_tokens": 465206054.0, "step": 44810 }, { "entropy": 0.7160611748695374, "epoch": 0.35856, "grad_norm": 3.516375780105591, "learning_rate": 3.2085234093637455e-05, "loss": 0.7089, "mean_token_accuracy": 0.802764892578125, "num_tokens": 465275799.0, "step": 44820 }, { "entropy": 0.6391971707344055, "epoch": 0.35864, "grad_norm": 2.004920244216919, "learning_rate": 3.20812324929972e-05, "loss": 0.6386, "mean_token_accuracy": 0.8125634729862213, "num_tokens": 465368014.0, "step": 44830 }, { "entropy": 0.6469950258731842, "epoch": 0.35872, "grad_norm": 2.51785945892334, "learning_rate": 3.207723089235695e-05, "loss": 0.6454, "mean_token_accuracy": 0.8003439784049988, "num_tokens": 465497834.0, "step": 44840 }, { "entropy": 0.7022814989089966, "epoch": 0.3588, "grad_norm": 4.7196550369262695, "learning_rate": 3.2073229291716686e-05, "loss": 0.6851, "mean_token_accuracy": 0.8126578867435456, "num_tokens": 465537761.0, "step": 44850 }, { "entropy": 0.6349909365177154, "epoch": 0.35888, "grad_norm": 1.4982484579086304, "learning_rate": 3.206922769107643e-05, "loss": 0.6337, "mean_token_accuracy": 0.7999817788600921, "num_tokens": 465701312.0, "step": 44860 }, { "entropy": 0.6434086561203003, "epoch": 0.35896, "grad_norm": 3.9923553466796875, "learning_rate": 3.206522609043617e-05, "loss": 0.6396, "mean_token_accuracy": 0.816442996263504, "num_tokens": 465777762.0, "step": 44870 }, { "entropy": 0.7224907636642456, "epoch": 0.35904, "grad_norm": 2.437248706817627, "learning_rate": 3.2061224489795924e-05, "loss": 0.7223, "mean_token_accuracy": 0.7969660699367523, "num_tokens": 465871021.0, "step": 44880 }, { "entropy": 0.6679006159305573, "epoch": 0.35912, "grad_norm": 3.225172758102417, "learning_rate": 3.205722288915566e-05, "loss": 0.6619, "mean_token_accuracy": 0.7957839131355285, "num_tokens": 466017276.0, "step": 44890 }, { "entropy": 0.6778702080249787, "epoch": 0.3592, "grad_norm": 5.3030595779418945, "learning_rate": 3.2053221288515404e-05, "loss": 0.6831, "mean_token_accuracy": 0.816403865814209, "num_tokens": 466061703.0, "step": 44900 }, { "entropy": 0.6899653315544129, "epoch": 0.35928, "grad_norm": 1.8255648612976074, "learning_rate": 3.2049219687875155e-05, "loss": 0.686, "mean_token_accuracy": 0.7831820666790008, "num_tokens": 466222865.0, "step": 44910 }, { "entropy": 0.7020414650440217, "epoch": 0.35936, "grad_norm": 3.3943030834198, "learning_rate": 3.20452180872349e-05, "loss": 0.6866, "mean_token_accuracy": 0.8090151846408844, "num_tokens": 466291655.0, "step": 44920 }, { "entropy": 0.666763710975647, "epoch": 0.35944, "grad_norm": 2.239281177520752, "learning_rate": 3.2041216486594636e-05, "loss": 0.6741, "mean_token_accuracy": 0.8033583343029023, "num_tokens": 466383282.0, "step": 44930 }, { "entropy": 0.6708672881126404, "epoch": 0.35952, "grad_norm": 3.1932384967803955, "learning_rate": 3.203721488595438e-05, "loss": 0.6633, "mean_token_accuracy": 0.797940319776535, "num_tokens": 466518413.0, "step": 44940 }, { "entropy": 0.7324462294578552, "epoch": 0.3596, "grad_norm": 4.63865327835083, "learning_rate": 3.203321328531413e-05, "loss": 0.7242, "mean_token_accuracy": 0.8127978682518006, "num_tokens": 466553024.0, "step": 44950 }, { "entropy": 0.6627133131027222, "epoch": 0.35968, "grad_norm": 2.7451443672180176, "learning_rate": 3.2029211684673874e-05, "loss": 0.6605, "mean_token_accuracy": 0.793172937631607, "num_tokens": 466716864.0, "step": 44960 }, { "entropy": 0.7195792227983475, "epoch": 0.35976, "grad_norm": 3.4200236797332764, "learning_rate": 3.202521008403361e-05, "loss": 0.7229, "mean_token_accuracy": 0.7932877659797668, "num_tokens": 466815777.0, "step": 44970 }, { "entropy": 0.716311514377594, "epoch": 0.35984, "grad_norm": 2.782804489135742, "learning_rate": 3.202120848339336e-05, "loss": 0.7162, "mean_token_accuracy": 0.7969492256641388, "num_tokens": 466912219.0, "step": 44980 }, { "entropy": 0.7028019189834595, "epoch": 0.35992, "grad_norm": 2.6774215698242188, "learning_rate": 3.2017206882753105e-05, "loss": 0.6988, "mean_token_accuracy": 0.7855830550193786, "num_tokens": 467058294.0, "step": 44990 }, { "entropy": 0.6353817403316497, "epoch": 0.36, "grad_norm": 5.0214924812316895, "learning_rate": 3.201320528211285e-05, "loss": 0.6238, "mean_token_accuracy": 0.8265964031219483, "num_tokens": 467102220.0, "step": 45000 }, { "entropy": 0.6364384710788726, "epoch": 0.36008, "grad_norm": 1.6752135753631592, "learning_rate": 3.2009203681472585e-05, "loss": 0.6376, "mean_token_accuracy": 0.7982002973556519, "num_tokens": 467264954.0, "step": 45010 }, { "entropy": 0.7709848940372467, "epoch": 0.36016, "grad_norm": 3.0837316513061523, "learning_rate": 3.2005202080832336e-05, "loss": 0.7606, "mean_token_accuracy": 0.7946932077407837, "num_tokens": 467342111.0, "step": 45020 }, { "entropy": 0.7131686866283417, "epoch": 0.36024, "grad_norm": 1.9288781881332397, "learning_rate": 3.200120048019208e-05, "loss": 0.7081, "mean_token_accuracy": 0.8017375826835632, "num_tokens": 467434056.0, "step": 45030 }, { "entropy": 0.6786032736301422, "epoch": 0.36032, "grad_norm": 2.5751616954803467, "learning_rate": 3.199719887955182e-05, "loss": 0.6832, "mean_token_accuracy": 0.7900064051151275, "num_tokens": 467577277.0, "step": 45040 }, { "entropy": 0.7300406038761139, "epoch": 0.3604, "grad_norm": 4.8770060539245605, "learning_rate": 3.199319727891157e-05, "loss": 0.7265, "mean_token_accuracy": 0.8077235877513885, "num_tokens": 467616273.0, "step": 45050 }, { "entropy": 0.6165395557880402, "epoch": 0.36048, "grad_norm": 2.8666913509368896, "learning_rate": 3.198919567827131e-05, "loss": 0.6068, "mean_token_accuracy": 0.8041464686393738, "num_tokens": 467778893.0, "step": 45060 }, { "entropy": 0.6167364299297333, "epoch": 0.36056, "grad_norm": 3.376110076904297, "learning_rate": 3.1985194077631054e-05, "loss": 0.6111, "mean_token_accuracy": 0.8205816447734833, "num_tokens": 467848861.0, "step": 45070 }, { "entropy": 0.7646422266960144, "epoch": 0.36064, "grad_norm": 2.623906135559082, "learning_rate": 3.19811924769908e-05, "loss": 0.7674, "mean_token_accuracy": 0.78862966299057, "num_tokens": 467942519.0, "step": 45080 }, { "entropy": 0.6890443801879883, "epoch": 0.36072, "grad_norm": 2.0603697299957275, "learning_rate": 3.197719087635054e-05, "loss": 0.6841, "mean_token_accuracy": 0.791387927532196, "num_tokens": 468083515.0, "step": 45090 }, { "entropy": 0.6525414437055588, "epoch": 0.3608, "grad_norm": 4.689109802246094, "learning_rate": 3.1973189275710285e-05, "loss": 0.6414, "mean_token_accuracy": 0.8246653735637665, "num_tokens": 468125413.0, "step": 45100 }, { "entropy": 0.6859546720981597, "epoch": 0.36088, "grad_norm": 2.0976455211639404, "learning_rate": 3.196918767507003e-05, "loss": 0.6825, "mean_token_accuracy": 0.7879879951477051, "num_tokens": 468288975.0, "step": 45110 }, { "entropy": 0.6682309955358505, "epoch": 0.36096, "grad_norm": 3.182284355163574, "learning_rate": 3.196518607442977e-05, "loss": 0.6553, "mean_token_accuracy": 0.8146282076835633, "num_tokens": 468359318.0, "step": 45120 }, { "entropy": 0.6902442812919617, "epoch": 0.36104, "grad_norm": 1.9586106538772583, "learning_rate": 3.196118447378952e-05, "loss": 0.68, "mean_token_accuracy": 0.8048744618892669, "num_tokens": 468453064.0, "step": 45130 }, { "entropy": 0.6730538725852966, "epoch": 0.36112, "grad_norm": 2.469818353652954, "learning_rate": 3.195718287314926e-05, "loss": 0.6802, "mean_token_accuracy": 0.7921754539012908, "num_tokens": 468594156.0, "step": 45140 }, { "entropy": 0.6986342191696167, "epoch": 0.3612, "grad_norm": 4.503365993499756, "learning_rate": 3.1953181272509004e-05, "loss": 0.6825, "mean_token_accuracy": 0.8167340338230134, "num_tokens": 468632784.0, "step": 45150 }, { "entropy": 0.6780755937099456, "epoch": 0.36128, "grad_norm": 2.773483991622925, "learning_rate": 3.194917967186875e-05, "loss": 0.675, "mean_token_accuracy": 0.7887640416622161, "num_tokens": 468796624.0, "step": 45160 }, { "entropy": 0.5455418407917023, "epoch": 0.36136, "grad_norm": 3.4734432697296143, "learning_rate": 3.194517807122849e-05, "loss": 0.5314, "mean_token_accuracy": 0.8369139671325684, "num_tokens": 468882295.0, "step": 45170 }, { "entropy": 0.6388763010501861, "epoch": 0.36144, "grad_norm": 1.6017295122146606, "learning_rate": 3.1941176470588235e-05, "loss": 0.6472, "mean_token_accuracy": 0.8103324174880981, "num_tokens": 468975946.0, "step": 45180 }, { "entropy": 0.6645319700241089, "epoch": 0.36152, "grad_norm": 1.8826559782028198, "learning_rate": 3.1937174869947986e-05, "loss": 0.6594, "mean_token_accuracy": 0.796282821893692, "num_tokens": 469118994.0, "step": 45190 }, { "entropy": 0.7566877484321595, "epoch": 0.3616, "grad_norm": 5.622277736663818, "learning_rate": 3.193317326930772e-05, "loss": 0.7569, "mean_token_accuracy": 0.8031009197235107, "num_tokens": 469155766.0, "step": 45200 }, { "entropy": 0.6627835512161255, "epoch": 0.36168, "grad_norm": 2.2331368923187256, "learning_rate": 3.1929171668667466e-05, "loss": 0.6626, "mean_token_accuracy": 0.7894907295703888, "num_tokens": 469319606.0, "step": 45210 }, { "entropy": 0.6542768806219101, "epoch": 0.36176, "grad_norm": 3.383273124694824, "learning_rate": 3.192517006802721e-05, "loss": 0.65, "mean_token_accuracy": 0.8131427884101867, "num_tokens": 469397042.0, "step": 45220 }, { "entropy": 0.7114742815494537, "epoch": 0.36184, "grad_norm": 1.8911943435668945, "learning_rate": 3.192116846738696e-05, "loss": 0.6928, "mean_token_accuracy": 0.798816692829132, "num_tokens": 469489044.0, "step": 45230 }, { "entropy": 0.7077245712280273, "epoch": 0.36192, "grad_norm": 2.26700496673584, "learning_rate": 3.19171668667467e-05, "loss": 0.7038, "mean_token_accuracy": 0.7851408302783967, "num_tokens": 469625903.0, "step": 45240 }, { "entropy": 0.6889564573764801, "epoch": 0.362, "grad_norm": 4.065036773681641, "learning_rate": 3.191316526610644e-05, "loss": 0.6838, "mean_token_accuracy": 0.8173484325408935, "num_tokens": 469663467.0, "step": 45250 }, { "entropy": 0.6990552186965943, "epoch": 0.36208, "grad_norm": 1.8329991102218628, "learning_rate": 3.190916366546619e-05, "loss": 0.6887, "mean_token_accuracy": 0.7856454968452453, "num_tokens": 469826722.0, "step": 45260 }, { "entropy": 0.6249801218509674, "epoch": 0.36216, "grad_norm": 3.213376998901367, "learning_rate": 3.1905162064825935e-05, "loss": 0.6281, "mean_token_accuracy": 0.8154550313949585, "num_tokens": 469908474.0, "step": 45270 }, { "entropy": 0.7041952073574066, "epoch": 0.36224, "grad_norm": 2.5981943607330322, "learning_rate": 3.190116046418567e-05, "loss": 0.7073, "mean_token_accuracy": 0.7957584381103515, "num_tokens": 470004050.0, "step": 45280 }, { "entropy": 0.738406628370285, "epoch": 0.36232, "grad_norm": 1.8320420980453491, "learning_rate": 3.1897158863545416e-05, "loss": 0.7321, "mean_token_accuracy": 0.7837781012058258, "num_tokens": 470145912.0, "step": 45290 }, { "entropy": 0.6559513151645661, "epoch": 0.3624, "grad_norm": 4.90236759185791, "learning_rate": 3.1893157262905167e-05, "loss": 0.6499, "mean_token_accuracy": 0.8215591073036194, "num_tokens": 470187164.0, "step": 45300 }, { "entropy": 0.6213062018156051, "epoch": 0.36248, "grad_norm": 1.3126698732376099, "learning_rate": 3.188915566226491e-05, "loss": 0.6187, "mean_token_accuracy": 0.803486806154251, "num_tokens": 470351004.0, "step": 45310 }, { "entropy": 0.7345186173915863, "epoch": 0.36256, "grad_norm": 3.0509262084960938, "learning_rate": 3.188515406162465e-05, "loss": 0.7367, "mean_token_accuracy": 0.7942331075668335, "num_tokens": 470445194.0, "step": 45320 }, { "entropy": 0.7025833606719971, "epoch": 0.36264, "grad_norm": 1.5595018863677979, "learning_rate": 3.188115246098439e-05, "loss": 0.7113, "mean_token_accuracy": 0.7938463270664216, "num_tokens": 470539682.0, "step": 45330 }, { "entropy": 0.7112751364707947, "epoch": 0.36272, "grad_norm": 2.259882926940918, "learning_rate": 3.187715086034414e-05, "loss": 0.694, "mean_token_accuracy": 0.7887637138366699, "num_tokens": 470678465.0, "step": 45340 }, { "entropy": 0.6906847208738327, "epoch": 0.3628, "grad_norm": 5.078019142150879, "learning_rate": 3.1873149259703885e-05, "loss": 0.689, "mean_token_accuracy": 0.816045093536377, "num_tokens": 470714246.0, "step": 45350 }, { "entropy": 0.6439386487007142, "epoch": 0.36288, "grad_norm": 1.8248168230056763, "learning_rate": 3.186914765906362e-05, "loss": 0.6413, "mean_token_accuracy": 0.7991695165634155, "num_tokens": 470878086.0, "step": 45360 }, { "entropy": 0.6888016700744629, "epoch": 0.36296, "grad_norm": 3.3727078437805176, "learning_rate": 3.186514605842337e-05, "loss": 0.6858, "mean_token_accuracy": 0.8074071645736695, "num_tokens": 470955122.0, "step": 45370 }, { "entropy": 0.7182723939418793, "epoch": 0.36304, "grad_norm": 2.203989028930664, "learning_rate": 3.1861144457783116e-05, "loss": 0.7272, "mean_token_accuracy": 0.7949142217636108, "num_tokens": 471047306.0, "step": 45380 }, { "entropy": 0.6586539268493652, "epoch": 0.36312, "grad_norm": 3.049260139465332, "learning_rate": 3.185714285714286e-05, "loss": 0.6472, "mean_token_accuracy": 0.7978297948837281, "num_tokens": 471194248.0, "step": 45390 }, { "entropy": 0.6875478863716126, "epoch": 0.3632, "grad_norm": 5.550519943237305, "learning_rate": 3.18531412565026e-05, "loss": 0.68, "mean_token_accuracy": 0.8221526443958282, "num_tokens": 471236482.0, "step": 45400 }, { "entropy": 0.7152871072292328, "epoch": 0.36328, "grad_norm": 1.408777117729187, "learning_rate": 3.184913965586235e-05, "loss": 0.7163, "mean_token_accuracy": 0.7798513352870942, "num_tokens": 471400129.0, "step": 45410 }, { "entropy": 0.6395212322473526, "epoch": 0.36336, "grad_norm": 3.8052594661712646, "learning_rate": 3.184513805522209e-05, "loss": 0.6443, "mean_token_accuracy": 0.8175150752067566, "num_tokens": 471473779.0, "step": 45420 }, { "entropy": 0.7034120321273803, "epoch": 0.36344, "grad_norm": 2.359116554260254, "learning_rate": 3.1841136454581835e-05, "loss": 0.7017, "mean_token_accuracy": 0.7985696911811828, "num_tokens": 471566218.0, "step": 45430 }, { "entropy": 0.6676263034343719, "epoch": 0.36352, "grad_norm": 2.759840488433838, "learning_rate": 3.183713485394158e-05, "loss": 0.6567, "mean_token_accuracy": 0.7970960676670075, "num_tokens": 471700590.0, "step": 45440 }, { "entropy": 0.6653890877962112, "epoch": 0.3636, "grad_norm": 4.679049015045166, "learning_rate": 3.183313325330132e-05, "loss": 0.6613, "mean_token_accuracy": 0.8182582437992096, "num_tokens": 471736994.0, "step": 45450 }, { "entropy": 0.6508447647094726, "epoch": 0.36368, "grad_norm": 1.7949296236038208, "learning_rate": 3.1829131652661066e-05, "loss": 0.6508, "mean_token_accuracy": 0.79427210688591, "num_tokens": 471900834.0, "step": 45460 }, { "entropy": 0.693162277340889, "epoch": 0.36376, "grad_norm": 3.461536407470703, "learning_rate": 3.182513005202081e-05, "loss": 0.6855, "mean_token_accuracy": 0.8035815715789795, "num_tokens": 471992179.0, "step": 45470 }, { "entropy": 0.7297193825244903, "epoch": 0.36384, "grad_norm": 1.5784056186676025, "learning_rate": 3.1821128451380553e-05, "loss": 0.7257, "mean_token_accuracy": 0.7917249977588654, "num_tokens": 472085674.0, "step": 45480 }, { "entropy": 0.6483953356742859, "epoch": 0.36392, "grad_norm": 3.1963486671447754, "learning_rate": 3.18171268507403e-05, "loss": 0.6456, "mean_token_accuracy": 0.7987158596515656, "num_tokens": 472229746.0, "step": 45490 }, { "entropy": 0.6619321227073669, "epoch": 0.364, "grad_norm": 5.454357147216797, "learning_rate": 3.181312525010004e-05, "loss": 0.6523, "mean_token_accuracy": 0.819201534986496, "num_tokens": 472274735.0, "step": 45500 }, { "entropy": 0.6131232023239136, "epoch": 0.36408, "grad_norm": 1.90065598487854, "learning_rate": 3.1809123649459785e-05, "loss": 0.6098, "mean_token_accuracy": 0.8028894066810608, "num_tokens": 472438253.0, "step": 45510 }, { "entropy": 0.7462464451789856, "epoch": 0.36416, "grad_norm": 2.925076723098755, "learning_rate": 3.180512204881953e-05, "loss": 0.7423, "mean_token_accuracy": 0.7937355756759643, "num_tokens": 472517231.0, "step": 45520 }, { "entropy": 0.6623136579990387, "epoch": 0.36424, "grad_norm": 2.0061097145080566, "learning_rate": 3.180112044817927e-05, "loss": 0.6674, "mean_token_accuracy": 0.8037448823451996, "num_tokens": 472610339.0, "step": 45530 }, { "entropy": 0.723106324672699, "epoch": 0.36432, "grad_norm": 2.6370253562927246, "learning_rate": 3.1797118847539016e-05, "loss": 0.7123, "mean_token_accuracy": 0.7825429320335389, "num_tokens": 472748501.0, "step": 45540 }, { "entropy": 0.7430795073509217, "epoch": 0.3644, "grad_norm": 4.620197772979736, "learning_rate": 3.179311724689876e-05, "loss": 0.7269, "mean_token_accuracy": 0.8101366102695465, "num_tokens": 472784770.0, "step": 45550 }, { "entropy": 0.680181235074997, "epoch": 0.36448, "grad_norm": 1.4909542798995972, "learning_rate": 3.17891156462585e-05, "loss": 0.6818, "mean_token_accuracy": 0.786376416683197, "num_tokens": 472948610.0, "step": 45560 }, { "entropy": 0.664658111333847, "epoch": 0.36456, "grad_norm": 3.4442052841186523, "learning_rate": 3.178511404561825e-05, "loss": 0.6655, "mean_token_accuracy": 0.8049159705638885, "num_tokens": 473035294.0, "step": 45570 }, { "entropy": 0.6981842815876007, "epoch": 0.36464, "grad_norm": 1.3800749778747559, "learning_rate": 3.1781112444978e-05, "loss": 0.6948, "mean_token_accuracy": 0.8002778232097626, "num_tokens": 473127809.0, "step": 45580 }, { "entropy": 0.6694527208805084, "epoch": 0.36472, "grad_norm": 2.0011239051818848, "learning_rate": 3.1777110844337734e-05, "loss": 0.6637, "mean_token_accuracy": 0.7948454082012176, "num_tokens": 473275965.0, "step": 45590 }, { "entropy": 0.6801258802413941, "epoch": 0.3648, "grad_norm": 5.638204574584961, "learning_rate": 3.177310924369748e-05, "loss": 0.6758, "mean_token_accuracy": 0.8176331400871277, "num_tokens": 473319569.0, "step": 45600 }, { "entropy": 0.6764623105525971, "epoch": 0.36488, "grad_norm": 2.3631441593170166, "learning_rate": 3.176910764305722e-05, "loss": 0.682, "mean_token_accuracy": 0.7858373045921325, "num_tokens": 473481450.0, "step": 45610 }, { "entropy": 0.7035014033317566, "epoch": 0.36496, "grad_norm": 4.651878356933594, "learning_rate": 3.176510604241697e-05, "loss": 0.6909, "mean_token_accuracy": 0.8051041960716248, "num_tokens": 473556306.0, "step": 45620 }, { "entropy": 0.7123866558074952, "epoch": 0.36504, "grad_norm": 2.122969627380371, "learning_rate": 3.176110444177671e-05, "loss": 0.7057, "mean_token_accuracy": 0.8001438438892364, "num_tokens": 473649963.0, "step": 45630 }, { "entropy": 0.7105086028575898, "epoch": 0.36512, "grad_norm": 2.3206417560577393, "learning_rate": 3.175710284113645e-05, "loss": 0.7059, "mean_token_accuracy": 0.7882544755935669, "num_tokens": 473787464.0, "step": 45640 }, { "entropy": 0.7620268762111664, "epoch": 0.3652, "grad_norm": 6.2536773681640625, "learning_rate": 3.17531012404962e-05, "loss": 0.7564, "mean_token_accuracy": 0.7980055689811707, "num_tokens": 473827680.0, "step": 45650 }, { "entropy": 0.6295276284217834, "epoch": 0.36528, "grad_norm": 1.7098274230957031, "learning_rate": 3.174909963985595e-05, "loss": 0.6289, "mean_token_accuracy": 0.7971307754516601, "num_tokens": 473989709.0, "step": 45660 }, { "entropy": 0.6264082878828049, "epoch": 0.36536, "grad_norm": 3.180365800857544, "learning_rate": 3.1745098039215684e-05, "loss": 0.6202, "mean_token_accuracy": 0.8182412385940552, "num_tokens": 474064941.0, "step": 45670 }, { "entropy": 0.668610680103302, "epoch": 0.36544, "grad_norm": 1.4492682218551636, "learning_rate": 3.174109643857543e-05, "loss": 0.6583, "mean_token_accuracy": 0.809865415096283, "num_tokens": 474157417.0, "step": 45680 }, { "entropy": 0.6779583334922791, "epoch": 0.36552, "grad_norm": 4.099092483520508, "learning_rate": 3.173709483793518e-05, "loss": 0.6783, "mean_token_accuracy": 0.795347797870636, "num_tokens": 474292732.0, "step": 45690 }, { "entropy": 0.7078100502490997, "epoch": 0.3656, "grad_norm": 4.661482334136963, "learning_rate": 3.173309323729492e-05, "loss": 0.7022, "mean_token_accuracy": 0.8117845296859741, "num_tokens": 474328135.0, "step": 45700 }, { "entropy": 0.6391501665115357, "epoch": 0.36568, "grad_norm": 1.5929149389266968, "learning_rate": 3.172909163665466e-05, "loss": 0.6342, "mean_token_accuracy": 0.7983378469944, "num_tokens": 474491311.0, "step": 45710 }, { "entropy": 0.6488377839326859, "epoch": 0.36576, "grad_norm": 3.142111301422119, "learning_rate": 3.172509003601441e-05, "loss": 0.6459, "mean_token_accuracy": 0.8128696203231811, "num_tokens": 474564032.0, "step": 45720 }, { "entropy": 0.804606556892395, "epoch": 0.36584, "grad_norm": 1.8943109512329102, "learning_rate": 3.172108843537415e-05, "loss": 0.7991, "mean_token_accuracy": 0.7797929644584656, "num_tokens": 474657674.0, "step": 45730 }, { "entropy": 0.7085259199142456, "epoch": 0.36592, "grad_norm": 2.1942708492279053, "learning_rate": 3.17170868347339e-05, "loss": 0.7085, "mean_token_accuracy": 0.7847232103347779, "num_tokens": 474800089.0, "step": 45740 }, { "entropy": 0.6689802765846252, "epoch": 0.366, "grad_norm": 5.123831272125244, "learning_rate": 3.1713085234093634e-05, "loss": 0.6754, "mean_token_accuracy": 0.8140792071819305, "num_tokens": 474838819.0, "step": 45750 }, { "entropy": 0.6482818067073822, "epoch": 0.36608, "grad_norm": 1.6201857328414917, "learning_rate": 3.1709083633453384e-05, "loss": 0.6442, "mean_token_accuracy": 0.7950758159160614, "num_tokens": 475002132.0, "step": 45760 }, { "entropy": 0.6462069928646088, "epoch": 0.36616, "grad_norm": 2.6691651344299316, "learning_rate": 3.170508203281313e-05, "loss": 0.6415, "mean_token_accuracy": 0.8129720747470855, "num_tokens": 475082103.0, "step": 45770 }, { "entropy": 0.6903126060962677, "epoch": 0.36624, "grad_norm": 1.7191218137741089, "learning_rate": 3.170108043217287e-05, "loss": 0.682, "mean_token_accuracy": 0.8032392263412476, "num_tokens": 475175574.0, "step": 45780 }, { "entropy": 0.661971926689148, "epoch": 0.36632, "grad_norm": 2.948401689529419, "learning_rate": 3.1697078831532615e-05, "loss": 0.6647, "mean_token_accuracy": 0.7959568202495575, "num_tokens": 475309611.0, "step": 45790 }, { "entropy": 0.702874356508255, "epoch": 0.3664, "grad_norm": 5.540886402130127, "learning_rate": 3.169307723089236e-05, "loss": 0.6926, "mean_token_accuracy": 0.8109195828437805, "num_tokens": 475347069.0, "step": 45800 }, { "entropy": 0.625390213727951, "epoch": 0.36648, "grad_norm": 1.6832599639892578, "learning_rate": 3.16890756302521e-05, "loss": 0.6161, "mean_token_accuracy": 0.7996030747890472, "num_tokens": 475510909.0, "step": 45810 }, { "entropy": 0.6925829946994781, "epoch": 0.36656, "grad_norm": 3.374199151992798, "learning_rate": 3.1685074029611846e-05, "loss": 0.6869, "mean_token_accuracy": 0.8054333806037903, "num_tokens": 475603235.0, "step": 45820 }, { "entropy": 0.6685332149267197, "epoch": 0.36664, "grad_norm": 1.5899064540863037, "learning_rate": 3.168107242897159e-05, "loss": 0.6697, "mean_token_accuracy": 0.8063900947570801, "num_tokens": 475697260.0, "step": 45830 }, { "entropy": 0.6883288979530334, "epoch": 0.36672, "grad_norm": 1.9162869453430176, "learning_rate": 3.1677070828331334e-05, "loss": 0.6905, "mean_token_accuracy": 0.7879442393779754, "num_tokens": 475833606.0, "step": 45840 }, { "entropy": 0.7819944858551026, "epoch": 0.3668, "grad_norm": 4.412511348724365, "learning_rate": 3.167306922769108e-05, "loss": 0.7695, "mean_token_accuracy": 0.7994127690792083, "num_tokens": 475868577.0, "step": 45850 }, { "entropy": 0.6299939155578613, "epoch": 0.36688, "grad_norm": 2.2600338459014893, "learning_rate": 3.166906762705082e-05, "loss": 0.6255, "mean_token_accuracy": 0.7997324287891387, "num_tokens": 476031640.0, "step": 45860 }, { "entropy": 0.6900869578123092, "epoch": 0.36696, "grad_norm": 3.0821104049682617, "learning_rate": 3.1665066026410565e-05, "loss": 0.6876, "mean_token_accuracy": 0.8081527709960937, "num_tokens": 476115480.0, "step": 45870 }, { "entropy": 0.8017792880535126, "epoch": 0.36704, "grad_norm": 1.5225473642349243, "learning_rate": 3.166106442577031e-05, "loss": 0.8011, "mean_token_accuracy": 0.777313506603241, "num_tokens": 476209215.0, "step": 45880 }, { "entropy": 0.6819039583206177, "epoch": 0.36712, "grad_norm": 3.5199074745178223, "learning_rate": 3.165706282513005e-05, "loss": 0.6792, "mean_token_accuracy": 0.7910673022270203, "num_tokens": 476347281.0, "step": 45890 }, { "entropy": 0.7547084987163544, "epoch": 0.3672, "grad_norm": 4.689651012420654, "learning_rate": 3.1653061224489796e-05, "loss": 0.7489, "mean_token_accuracy": 0.8038954138755798, "num_tokens": 476384830.0, "step": 45900 }, { "entropy": 0.6481045335531235, "epoch": 0.36728, "grad_norm": 1.5971171855926514, "learning_rate": 3.164905962384954e-05, "loss": 0.6497, "mean_token_accuracy": 0.797603166103363, "num_tokens": 476548630.0, "step": 45910 }, { "entropy": 0.6840682029724121, "epoch": 0.36736, "grad_norm": 2.706991672515869, "learning_rate": 3.1645058023209284e-05, "loss": 0.6778, "mean_token_accuracy": 0.8049509227275848, "num_tokens": 476632816.0, "step": 45920 }, { "entropy": 0.6750412464141846, "epoch": 0.36744, "grad_norm": 1.9831023216247559, "learning_rate": 3.164105642256903e-05, "loss": 0.6633, "mean_token_accuracy": 0.8074833154678345, "num_tokens": 476727150.0, "step": 45930 }, { "entropy": 0.6689211070537567, "epoch": 0.36752, "grad_norm": 2.3172762393951416, "learning_rate": 3.163705482192877e-05, "loss": 0.6663, "mean_token_accuracy": 0.7970334231853485, "num_tokens": 476857822.0, "step": 45940 }, { "entropy": 0.7632852494716644, "epoch": 0.3676, "grad_norm": 4.716324806213379, "learning_rate": 3.1633053221288515e-05, "loss": 0.7676, "mean_token_accuracy": 0.7962851107120514, "num_tokens": 476893971.0, "step": 45950 }, { "entropy": 0.7650377452373505, "epoch": 0.36768, "grad_norm": 1.5254621505737305, "learning_rate": 3.162905162064826e-05, "loss": 0.7635, "mean_token_accuracy": 0.7719643771648407, "num_tokens": 477057707.0, "step": 45960 }, { "entropy": 0.6313802063465118, "epoch": 0.36776, "grad_norm": 3.3801379203796387, "learning_rate": 3.162505002000801e-05, "loss": 0.6211, "mean_token_accuracy": 0.8205712020397187, "num_tokens": 477128998.0, "step": 45970 }, { "entropy": 0.6623766183853149, "epoch": 0.36784, "grad_norm": 2.1364989280700684, "learning_rate": 3.1621048419367746e-05, "loss": 0.6664, "mean_token_accuracy": 0.805543577671051, "num_tokens": 477222111.0, "step": 45980 }, { "entropy": 0.7410064220428467, "epoch": 0.36792, "grad_norm": 2.345475435256958, "learning_rate": 3.161704681872749e-05, "loss": 0.7375, "mean_token_accuracy": 0.7758409023284912, "num_tokens": 477365987.0, "step": 45990 }, { "entropy": 0.6560814410448075, "epoch": 0.368, "grad_norm": 5.86719274520874, "learning_rate": 3.161304521808723e-05, "loss": 0.6525, "mean_token_accuracy": 0.8239133656024933, "num_tokens": 477404768.0, "step": 46000 }, { "entropy": 0.7055656135082244, "epoch": 0.36808, "grad_norm": 1.4920899868011475, "learning_rate": 3.1609043617446984e-05, "loss": 0.7012, "mean_token_accuracy": 0.7815255165100098, "num_tokens": 477568572.0, "step": 46010 }, { "entropy": 0.6715831965208053, "epoch": 0.36816, "grad_norm": 2.9688570499420166, "learning_rate": 3.160504201680672e-05, "loss": 0.667, "mean_token_accuracy": 0.8055655002593994, "num_tokens": 477650142.0, "step": 46020 }, { "entropy": 0.7116151034832001, "epoch": 0.36824, "grad_norm": 2.0876052379608154, "learning_rate": 3.1601040416166464e-05, "loss": 0.7067, "mean_token_accuracy": 0.7989785254001618, "num_tokens": 477743935.0, "step": 46030 }, { "entropy": 0.6757172673940659, "epoch": 0.36832, "grad_norm": 2.524012327194214, "learning_rate": 3.1597038815526215e-05, "loss": 0.6774, "mean_token_accuracy": 0.8000106573104858, "num_tokens": 477874611.0, "step": 46040 }, { "entropy": 0.720261961221695, "epoch": 0.3684, "grad_norm": 5.350050449371338, "learning_rate": 3.159303721488596e-05, "loss": 0.7033, "mean_token_accuracy": 0.8186730206012726, "num_tokens": 477912988.0, "step": 46050 }, { "entropy": 0.6348325967788696, "epoch": 0.36848, "grad_norm": 1.766335368156433, "learning_rate": 3.1589035614245696e-05, "loss": 0.6339, "mean_token_accuracy": 0.7976184666156769, "num_tokens": 478076828.0, "step": 46060 }, { "entropy": 0.6296625405550003, "epoch": 0.36856, "grad_norm": 2.720677375793457, "learning_rate": 3.158503401360544e-05, "loss": 0.617, "mean_token_accuracy": 0.8209921061992645, "num_tokens": 478165931.0, "step": 46070 }, { "entropy": 0.6800826996564865, "epoch": 0.36864, "grad_norm": 2.2502851486206055, "learning_rate": 3.158103241296519e-05, "loss": 0.6807, "mean_token_accuracy": 0.8046640038490296, "num_tokens": 478259994.0, "step": 46080 }, { "entropy": 0.6560854375362396, "epoch": 0.36872, "grad_norm": 3.302813768386841, "learning_rate": 3.1577030812324933e-05, "loss": 0.6541, "mean_token_accuracy": 0.7975993275642395, "num_tokens": 478397266.0, "step": 46090 }, { "entropy": 0.6415486186742783, "epoch": 0.3688, "grad_norm": 4.920510292053223, "learning_rate": 3.157302921168467e-05, "loss": 0.6365, "mean_token_accuracy": 0.8261005520820618, "num_tokens": 478435492.0, "step": 46100 }, { "entropy": 0.670471864938736, "epoch": 0.36888, "grad_norm": 2.413632869720459, "learning_rate": 3.156902761104442e-05, "loss": 0.6613, "mean_token_accuracy": 0.791823399066925, "num_tokens": 478599332.0, "step": 46110 }, { "entropy": 0.6611346781253815, "epoch": 0.36896, "grad_norm": 2.924926280975342, "learning_rate": 3.1565026010404165e-05, "loss": 0.6627, "mean_token_accuracy": 0.8076856672763825, "num_tokens": 478690046.0, "step": 46120 }, { "entropy": 0.6642916798591614, "epoch": 0.36904, "grad_norm": 1.2858946323394775, "learning_rate": 3.156102440976391e-05, "loss": 0.6648, "mean_token_accuracy": 0.8083497524261475, "num_tokens": 478785252.0, "step": 46130 }, { "entropy": 0.7160345315933228, "epoch": 0.36912, "grad_norm": 3.2886087894439697, "learning_rate": 3.1557022809123645e-05, "loss": 0.7151, "mean_token_accuracy": 0.7868100225925445, "num_tokens": 478922497.0, "step": 46140 }, { "entropy": 0.7241332560777665, "epoch": 0.3692, "grad_norm": 5.764484882354736, "learning_rate": 3.1553021208483396e-05, "loss": 0.7177, "mean_token_accuracy": 0.8070785701274872, "num_tokens": 478961226.0, "step": 46150 }, { "entropy": 0.6259029567241668, "epoch": 0.36928, "grad_norm": 2.4587156772613525, "learning_rate": 3.154901960784314e-05, "loss": 0.6256, "mean_token_accuracy": 0.801380068063736, "num_tokens": 479125066.0, "step": 46160 }, { "entropy": 0.7240453988313675, "epoch": 0.36936, "grad_norm": 4.53654670715332, "learning_rate": 3.154501800720288e-05, "loss": 0.713, "mean_token_accuracy": 0.800387841463089, "num_tokens": 479210852.0, "step": 46170 }, { "entropy": 0.7781402111053467, "epoch": 0.36944, "grad_norm": 1.4354772567749023, "learning_rate": 3.154101640656263e-05, "loss": 0.7806, "mean_token_accuracy": 0.7852657556533813, "num_tokens": 479304958.0, "step": 46180 }, { "entropy": 0.6911553919315339, "epoch": 0.36952, "grad_norm": 2.628218650817871, "learning_rate": 3.153701480592237e-05, "loss": 0.6922, "mean_token_accuracy": 0.7891784965991974, "num_tokens": 479443426.0, "step": 46190 }, { "entropy": 0.6886953294277192, "epoch": 0.3696, "grad_norm": 4.820003032684326, "learning_rate": 3.1533013205282114e-05, "loss": 0.6696, "mean_token_accuracy": 0.8190112888813019, "num_tokens": 479481044.0, "step": 46200 }, { "entropy": 0.6695211887359619, "epoch": 0.36968, "grad_norm": 1.7275046110153198, "learning_rate": 3.152901160464186e-05, "loss": 0.6614, "mean_token_accuracy": 0.7944805383682251, "num_tokens": 479643955.0, "step": 46210 }, { "entropy": 0.6086727976799011, "epoch": 0.36976, "grad_norm": 3.249387502670288, "learning_rate": 3.15250100040016e-05, "loss": 0.6033, "mean_token_accuracy": 0.8246913731098175, "num_tokens": 479722519.0, "step": 46220 }, { "entropy": 0.738269704580307, "epoch": 0.36984, "grad_norm": 2.0134522914886475, "learning_rate": 3.1521008403361345e-05, "loss": 0.7428, "mean_token_accuracy": 0.7918832540512085, "num_tokens": 479815496.0, "step": 46230 }, { "entropy": 0.6934969127178192, "epoch": 0.36992, "grad_norm": 2.9257426261901855, "learning_rate": 3.151700680272109e-05, "loss": 0.6854, "mean_token_accuracy": 0.7886762917041779, "num_tokens": 479955796.0, "step": 46240 }, { "entropy": 0.671113395690918, "epoch": 0.37, "grad_norm": 5.285338401794434, "learning_rate": 3.151300520208084e-05, "loss": 0.6634, "mean_token_accuracy": 0.8187917113304138, "num_tokens": 479999011.0, "step": 46250 }, { "entropy": 0.6529689729213715, "epoch": 0.37008, "grad_norm": 1.258394479751587, "learning_rate": 3.1509003601440577e-05, "loss": 0.6554, "mean_token_accuracy": 0.7967574596405029, "num_tokens": 480162851.0, "step": 46260 }, { "entropy": 0.7003046274185181, "epoch": 0.37016, "grad_norm": 3.231081962585449, "learning_rate": 3.150500200080032e-05, "loss": 0.706, "mean_token_accuracy": 0.7990997195243835, "num_tokens": 480253169.0, "step": 46270 }, { "entropy": 0.7288196980953217, "epoch": 0.37024, "grad_norm": 2.0564255714416504, "learning_rate": 3.1501000400160064e-05, "loss": 0.7214, "mean_token_accuracy": 0.7924377679824829, "num_tokens": 480346962.0, "step": 46280 }, { "entropy": 0.6285056114196778, "epoch": 0.37032, "grad_norm": 2.852418899536133, "learning_rate": 3.1496998799519815e-05, "loss": 0.6212, "mean_token_accuracy": 0.8078755497932434, "num_tokens": 480490849.0, "step": 46290 }, { "entropy": 0.6469433605670929, "epoch": 0.3704, "grad_norm": 4.722484111785889, "learning_rate": 3.149299719887955e-05, "loss": 0.6475, "mean_token_accuracy": 0.8232754230499267, "num_tokens": 480527342.0, "step": 46300 }, { "entropy": 0.6264242112636567, "epoch": 0.37048, "grad_norm": 1.5624195337295532, "learning_rate": 3.1488995598239295e-05, "loss": 0.6264, "mean_token_accuracy": 0.7979482173919678, "num_tokens": 480691182.0, "step": 46310 }, { "entropy": 0.752746707201004, "epoch": 0.37056, "grad_norm": 3.061124563217163, "learning_rate": 3.1484993997599046e-05, "loss": 0.7416, "mean_token_accuracy": 0.79422048330307, "num_tokens": 480776946.0, "step": 46320 }, { "entropy": 0.6546071767807007, "epoch": 0.37064, "grad_norm": 1.5957846641540527, "learning_rate": 3.148099239695879e-05, "loss": 0.666, "mean_token_accuracy": 0.8088646233081818, "num_tokens": 480870293.0, "step": 46330 }, { "entropy": 0.72525435090065, "epoch": 0.37072, "grad_norm": 2.244076728820801, "learning_rate": 3.1476990796318526e-05, "loss": 0.7183, "mean_token_accuracy": 0.7860584020614624, "num_tokens": 481005367.0, "step": 46340 }, { "entropy": 0.643096747994423, "epoch": 0.3708, "grad_norm": 4.854743480682373, "learning_rate": 3.147298919567827e-05, "loss": 0.624, "mean_token_accuracy": 0.8334752976894378, "num_tokens": 481045168.0, "step": 46350 }, { "entropy": 0.6246553957462311, "epoch": 0.37088, "grad_norm": 2.1475634574890137, "learning_rate": 3.146898759503802e-05, "loss": 0.6315, "mean_token_accuracy": 0.7998839735984802, "num_tokens": 481209008.0, "step": 46360 }, { "entropy": 0.6651377320289612, "epoch": 0.37096, "grad_norm": 4.490220069885254, "learning_rate": 3.1464985994397764e-05, "loss": 0.6497, "mean_token_accuracy": 0.8100491285324096, "num_tokens": 481296596.0, "step": 46370 }, { "entropy": 0.6671867191791534, "epoch": 0.37104, "grad_norm": 1.5790674686431885, "learning_rate": 3.14609843937575e-05, "loss": 0.6626, "mean_token_accuracy": 0.8116686582565308, "num_tokens": 481390080.0, "step": 46380 }, { "entropy": 0.7563291430473328, "epoch": 0.37112, "grad_norm": 2.305227756500244, "learning_rate": 3.145698279311725e-05, "loss": 0.7598, "mean_token_accuracy": 0.7771957635879516, "num_tokens": 481521442.0, "step": 46390 }, { "entropy": 0.7426235318183899, "epoch": 0.3712, "grad_norm": 4.497000217437744, "learning_rate": 3.1452981192476995e-05, "loss": 0.7216, "mean_token_accuracy": 0.8095278859138488, "num_tokens": 481555450.0, "step": 46400 }, { "entropy": 0.6587612509727478, "epoch": 0.37128, "grad_norm": 2.496840000152588, "learning_rate": 3.144897959183674e-05, "loss": 0.6545, "mean_token_accuracy": 0.7931057751178742, "num_tokens": 481719290.0, "step": 46410 }, { "entropy": 0.6694984138011932, "epoch": 0.37136, "grad_norm": 3.4281647205352783, "learning_rate": 3.1444977991196476e-05, "loss": 0.6797, "mean_token_accuracy": 0.8051099956035614, "num_tokens": 481808831.0, "step": 46420 }, { "entropy": 0.7065285861492157, "epoch": 0.37144, "grad_norm": 2.3260834217071533, "learning_rate": 3.1440976390556227e-05, "loss": 0.7133, "mean_token_accuracy": 0.7943634390830994, "num_tokens": 481903178.0, "step": 46430 }, { "entropy": 0.7072576284408569, "epoch": 0.37152, "grad_norm": 3.513394832611084, "learning_rate": 3.143697478991597e-05, "loss": 0.6991, "mean_token_accuracy": 0.7882398068904877, "num_tokens": 482042914.0, "step": 46440 }, { "entropy": 0.6764666020870209, "epoch": 0.3716, "grad_norm": 4.549623966217041, "learning_rate": 3.1432973189275714e-05, "loss": 0.6744, "mean_token_accuracy": 0.8210007190704346, "num_tokens": 482082721.0, "step": 46450 }, { "entropy": 0.6654201745986938, "epoch": 0.37168, "grad_norm": 2.0229594707489014, "learning_rate": 3.142897158863545e-05, "loss": 0.6663, "mean_token_accuracy": 0.7909746050834656, "num_tokens": 482246561.0, "step": 46460 }, { "entropy": 0.5856992214918136, "epoch": 0.37176, "grad_norm": 2.9711179733276367, "learning_rate": 3.14249699879952e-05, "loss": 0.5753, "mean_token_accuracy": 0.8268712222576141, "num_tokens": 482334843.0, "step": 46470 }, { "entropy": 0.675640732049942, "epoch": 0.37184, "grad_norm": 1.3886842727661133, "learning_rate": 3.1420968387354945e-05, "loss": 0.6863, "mean_token_accuracy": 0.799291855096817, "num_tokens": 482428039.0, "step": 46480 }, { "entropy": 0.6778597712516785, "epoch": 0.37192, "grad_norm": 2.885442018508911, "learning_rate": 3.141696678671469e-05, "loss": 0.6689, "mean_token_accuracy": 0.7989411056041718, "num_tokens": 482563398.0, "step": 46490 }, { "entropy": 0.6913679003715515, "epoch": 0.372, "grad_norm": 3.8197975158691406, "learning_rate": 3.141296518607443e-05, "loss": 0.6807, "mean_token_accuracy": 0.8145268678665161, "num_tokens": 482604126.0, "step": 46500 }, { "entropy": 0.6433812230825424, "epoch": 0.37208, "grad_norm": 1.573021411895752, "learning_rate": 3.1408963585434176e-05, "loss": 0.6445, "mean_token_accuracy": 0.7982474327087402, "num_tokens": 482767966.0, "step": 46510 }, { "entropy": 0.6374940991401672, "epoch": 0.37216, "grad_norm": 2.6044187545776367, "learning_rate": 3.140496198479392e-05, "loss": 0.6322, "mean_token_accuracy": 0.8105538427829743, "num_tokens": 482864231.0, "step": 46520 }, { "entropy": 0.7484820425510407, "epoch": 0.37224, "grad_norm": 1.709248423576355, "learning_rate": 3.1400960384153664e-05, "loss": 0.7654, "mean_token_accuracy": 0.7894065856933594, "num_tokens": 482957514.0, "step": 46530 }, { "entropy": 0.7407938122749329, "epoch": 0.37232, "grad_norm": 2.3910861015319824, "learning_rate": 3.139695878351341e-05, "loss": 0.7208, "mean_token_accuracy": 0.7807225525379181, "num_tokens": 483099424.0, "step": 46540 }, { "entropy": 0.6283759385347366, "epoch": 0.3724, "grad_norm": 5.040219783782959, "learning_rate": 3.139295718287315e-05, "loss": 0.6355, "mean_token_accuracy": 0.8290241837501526, "num_tokens": 483141147.0, "step": 46550 }, { "entropy": 0.6394010841846466, "epoch": 0.37248, "grad_norm": 1.4317271709442139, "learning_rate": 3.1388955582232895e-05, "loss": 0.6421, "mean_token_accuracy": 0.7949743509292603, "num_tokens": 483304987.0, "step": 46560 }, { "entropy": 0.6908891439437866, "epoch": 0.37256, "grad_norm": 2.779021978378296, "learning_rate": 3.138495398159264e-05, "loss": 0.6604, "mean_token_accuracy": 0.8094664871692657, "num_tokens": 483389844.0, "step": 46570 }, { "entropy": 0.6810031950473785, "epoch": 0.37264, "grad_norm": 1.994199275970459, "learning_rate": 3.138095238095238e-05, "loss": 0.6947, "mean_token_accuracy": 0.8010411500930786, "num_tokens": 483484381.0, "step": 46580 }, { "entropy": 0.689424866437912, "epoch": 0.37272, "grad_norm": 2.255185127258301, "learning_rate": 3.1376950780312126e-05, "loss": 0.6841, "mean_token_accuracy": 0.792910361289978, "num_tokens": 483626062.0, "step": 46590 }, { "entropy": 0.663945484161377, "epoch": 0.3728, "grad_norm": 5.1062517166137695, "learning_rate": 3.137294917967187e-05, "loss": 0.6677, "mean_token_accuracy": 0.8168999016284942, "num_tokens": 483664596.0, "step": 46600 }, { "entropy": 0.6392981052398682, "epoch": 0.37288, "grad_norm": 1.5961247682571411, "learning_rate": 3.136894757903161e-05, "loss": 0.631, "mean_token_accuracy": 0.8004702031612396, "num_tokens": 483828436.0, "step": 46610 }, { "entropy": 0.584288826584816, "epoch": 0.37296, "grad_norm": 3.507877826690674, "learning_rate": 3.136494597839136e-05, "loss": 0.5844, "mean_token_accuracy": 0.8237371683120728, "num_tokens": 483914153.0, "step": 46620 }, { "entropy": 0.7374466538429261, "epoch": 0.37304, "grad_norm": 2.0799453258514404, "learning_rate": 3.13609443777511e-05, "loss": 0.739, "mean_token_accuracy": 0.7960696816444397, "num_tokens": 484007599.0, "step": 46630 }, { "entropy": 0.7030897021293641, "epoch": 0.37312, "grad_norm": 3.0795202255249023, "learning_rate": 3.135694277711085e-05, "loss": 0.6951, "mean_token_accuracy": 0.7884317398071289, "num_tokens": 484139857.0, "step": 46640 }, { "entropy": 0.703346973657608, "epoch": 0.3732, "grad_norm": 5.882081031799316, "learning_rate": 3.135294117647059e-05, "loss": 0.6906, "mean_token_accuracy": 0.815533459186554, "num_tokens": 484174307.0, "step": 46650 }, { "entropy": 0.6148872613906861, "epoch": 0.37328, "grad_norm": 2.3856201171875, "learning_rate": 3.134893957583033e-05, "loss": 0.6201, "mean_token_accuracy": 0.8012895107269287, "num_tokens": 484338108.0, "step": 46660 }, { "entropy": 0.6622496128082276, "epoch": 0.37336, "grad_norm": 3.3766939640045166, "learning_rate": 3.1344937975190076e-05, "loss": 0.654, "mean_token_accuracy": 0.8124814808368683, "num_tokens": 484416716.0, "step": 46670 }, { "entropy": 0.7475467979907989, "epoch": 0.37344, "grad_norm": 2.171870470046997, "learning_rate": 3.1340936374549826e-05, "loss": 0.7489, "mean_token_accuracy": 0.7865680634975434, "num_tokens": 484510368.0, "step": 46680 }, { "entropy": 0.6642130196094513, "epoch": 0.37352, "grad_norm": 2.091278076171875, "learning_rate": 3.133693477390956e-05, "loss": 0.6598, "mean_token_accuracy": 0.7986777901649476, "num_tokens": 484654713.0, "step": 46690 }, { "entropy": 0.714363020658493, "epoch": 0.3736, "grad_norm": 4.669675350189209, "learning_rate": 3.133293317326931e-05, "loss": 0.7181, "mean_token_accuracy": 0.8083028435707093, "num_tokens": 484693109.0, "step": 46700 }, { "entropy": 0.698563426733017, "epoch": 0.37368, "grad_norm": 2.305337905883789, "learning_rate": 3.132893157262906e-05, "loss": 0.6942, "mean_token_accuracy": 0.7828508734703064, "num_tokens": 484856759.0, "step": 46710 }, { "entropy": 0.7021685659885406, "epoch": 0.37376, "grad_norm": 3.108444929122925, "learning_rate": 3.13249299719888e-05, "loss": 0.689, "mean_token_accuracy": 0.8017630398273468, "num_tokens": 484934970.0, "step": 46720 }, { "entropy": 0.7111922979354859, "epoch": 0.37384, "grad_norm": 1.5912621021270752, "learning_rate": 3.132092837134854e-05, "loss": 0.7212, "mean_token_accuracy": 0.7971585035324097, "num_tokens": 485028320.0, "step": 46730 }, { "entropy": 0.6060227572917938, "epoch": 0.37392, "grad_norm": 2.136159658432007, "learning_rate": 3.131692677070828e-05, "loss": 0.5983, "mean_token_accuracy": 0.8111918449401856, "num_tokens": 485158938.0, "step": 46740 }, { "entropy": 0.6825856685638427, "epoch": 0.374, "grad_norm": 4.663960933685303, "learning_rate": 3.131292517006803e-05, "loss": 0.6707, "mean_token_accuracy": 0.8211650073528289, "num_tokens": 485193862.0, "step": 46750 }, { "entropy": 0.6347122550010681, "epoch": 0.37408, "grad_norm": 1.3470327854156494, "learning_rate": 3.1308923569427776e-05, "loss": 0.6416, "mean_token_accuracy": 0.7944919407367707, "num_tokens": 485357702.0, "step": 46760 }, { "entropy": 0.659668555855751, "epoch": 0.37416, "grad_norm": 3.2929744720458984, "learning_rate": 3.130492196878751e-05, "loss": 0.6474, "mean_token_accuracy": 0.8098491787910461, "num_tokens": 485452366.0, "step": 46770 }, { "entropy": 0.7113860487937927, "epoch": 0.37424, "grad_norm": 1.410309076309204, "learning_rate": 3.130092036814726e-05, "loss": 0.7096, "mean_token_accuracy": 0.7938165426254272, "num_tokens": 485545484.0, "step": 46780 }, { "entropy": 0.7031586349010468, "epoch": 0.37432, "grad_norm": 3.4076380729675293, "learning_rate": 3.129691876750701e-05, "loss": 0.7034, "mean_token_accuracy": 0.7875605404376984, "num_tokens": 485686540.0, "step": 46790 }, { "entropy": 0.6842304229736328, "epoch": 0.3744, "grad_norm": 4.804588794708252, "learning_rate": 3.129291716686675e-05, "loss": 0.6848, "mean_token_accuracy": 0.8152009069919586, "num_tokens": 485729661.0, "step": 46800 }, { "entropy": 0.6544094026088715, "epoch": 0.37448, "grad_norm": 2.1830496788024902, "learning_rate": 3.128891556622649e-05, "loss": 0.6484, "mean_token_accuracy": 0.798094779253006, "num_tokens": 485893501.0, "step": 46810 }, { "entropy": 0.6985646247863769, "epoch": 0.37456, "grad_norm": 3.6903231143951416, "learning_rate": 3.128491396558624e-05, "loss": 0.6964, "mean_token_accuracy": 0.796992403268814, "num_tokens": 485988095.0, "step": 46820 }, { "entropy": 0.6760083973407746, "epoch": 0.37464, "grad_norm": 1.6579335927963257, "learning_rate": 3.128091236494598e-05, "loss": 0.6602, "mean_token_accuracy": 0.8151793301105499, "num_tokens": 486083195.0, "step": 46830 }, { "entropy": 0.6565777838230134, "epoch": 0.37472, "grad_norm": 3.3023018836975098, "learning_rate": 3.1276910764305726e-05, "loss": 0.6548, "mean_token_accuracy": 0.8019704639911651, "num_tokens": 486207783.0, "step": 46840 }, { "entropy": 0.6966630488634109, "epoch": 0.3748, "grad_norm": 4.100432872772217, "learning_rate": 3.127290916366547e-05, "loss": 0.6929, "mean_token_accuracy": 0.8154433310031891, "num_tokens": 486244392.0, "step": 46850 }, { "entropy": 0.6607856512069702, "epoch": 0.37488, "grad_norm": 1.372153639793396, "learning_rate": 3.126890756302521e-05, "loss": 0.6624, "mean_token_accuracy": 0.7910403609275818, "num_tokens": 486408040.0, "step": 46860 }, { "entropy": 0.6983886361122131, "epoch": 0.37496, "grad_norm": 2.9662060737609863, "learning_rate": 3.126490596238496e-05, "loss": 0.672, "mean_token_accuracy": 0.8063107788562774, "num_tokens": 486485930.0, "step": 46870 }, { "entropy": 0.6320427000522614, "epoch": 0.37504, "grad_norm": 1.5999484062194824, "learning_rate": 3.12609043617447e-05, "loss": 0.6484, "mean_token_accuracy": 0.8117096126079559, "num_tokens": 486578680.0, "step": 46880 }, { "entropy": 0.7966085076332092, "epoch": 0.37512, "grad_norm": 2.9645252227783203, "learning_rate": 3.1256902761104444e-05, "loss": 0.7967, "mean_token_accuracy": 0.7666524291038513, "num_tokens": 486711933.0, "step": 46890 }, { "entropy": 0.6750315815210343, "epoch": 0.3752, "grad_norm": 4.126388072967529, "learning_rate": 3.125290116046419e-05, "loss": 0.6493, "mean_token_accuracy": 0.8283603966236115, "num_tokens": 486751262.0, "step": 46900 }, { "entropy": 0.6657049477100372, "epoch": 0.37528, "grad_norm": 2.478982925415039, "learning_rate": 3.124889955982393e-05, "loss": 0.6657, "mean_token_accuracy": 0.7917440176010132, "num_tokens": 486915102.0, "step": 46910 }, { "entropy": 0.7444516390562057, "epoch": 0.37536, "grad_norm": 3.3818836212158203, "learning_rate": 3.1244897959183675e-05, "loss": 0.741, "mean_token_accuracy": 0.7916403532028198, "num_tokens": 486998035.0, "step": 46920 }, { "entropy": 0.7290327727794648, "epoch": 0.37544, "grad_norm": 1.462597370147705, "learning_rate": 3.124089635854342e-05, "loss": 0.7224, "mean_token_accuracy": 0.7930607438087464, "num_tokens": 487091023.0, "step": 46930 }, { "entropy": 0.7451871991157532, "epoch": 0.37552, "grad_norm": 2.354675769805908, "learning_rate": 3.123689475790316e-05, "loss": 0.7472, "mean_token_accuracy": 0.7761639654636383, "num_tokens": 487217470.0, "step": 46940 }, { "entropy": 0.6075649321079254, "epoch": 0.3756, "grad_norm": 5.276937961578369, "learning_rate": 3.1232893157262906e-05, "loss": 0.595, "mean_token_accuracy": 0.8364163696765899, "num_tokens": 487254054.0, "step": 46950 }, { "entropy": 0.615480899810791, "epoch": 0.37568, "grad_norm": 1.7519066333770752, "learning_rate": 3.122889155662265e-05, "loss": 0.6114, "mean_token_accuracy": 0.8036028385162354, "num_tokens": 487417894.0, "step": 46960 }, { "entropy": 0.6964720606803894, "epoch": 0.37576, "grad_norm": 3.374393939971924, "learning_rate": 3.1224889955982394e-05, "loss": 0.6952, "mean_token_accuracy": 0.7997594714164734, "num_tokens": 487503667.0, "step": 46970 }, { "entropy": 0.7635971248149872, "epoch": 0.37584, "grad_norm": 2.1621932983398438, "learning_rate": 3.122088835534214e-05, "loss": 0.7584, "mean_token_accuracy": 0.7853862583637238, "num_tokens": 487598629.0, "step": 46980 }, { "entropy": 0.7337959706783295, "epoch": 0.37592, "grad_norm": 2.106546401977539, "learning_rate": 3.121688675470188e-05, "loss": 0.7273, "mean_token_accuracy": 0.7780763924121856, "num_tokens": 487746798.0, "step": 46990 }, { "entropy": 0.6459152966737747, "epoch": 0.376, "grad_norm": 7.911811828613281, "learning_rate": 3.1212885154061625e-05, "loss": 0.6618, "mean_token_accuracy": 0.8222403883934021, "num_tokens": 487791539.0, "step": 47000 }, { "entropy": 0.6449272572994232, "epoch": 0.37608, "grad_norm": 1.736029863357544, "learning_rate": 3.120888355342137e-05, "loss": 0.6418, "mean_token_accuracy": 0.7947362065315247, "num_tokens": 487955379.0, "step": 47010 }, { "entropy": 0.7559600651264191, "epoch": 0.37616, "grad_norm": 5.093323707580566, "learning_rate": 3.120488195278111e-05, "loss": 0.7477, "mean_token_accuracy": 0.7901746809482575, "num_tokens": 488041828.0, "step": 47020 }, { "entropy": 0.6920288860797882, "epoch": 0.37624, "grad_norm": 1.9290560483932495, "learning_rate": 3.120088035214086e-05, "loss": 0.6951, "mean_token_accuracy": 0.7986896276473999, "num_tokens": 488135635.0, "step": 47030 }, { "entropy": 0.7141017317771912, "epoch": 0.37632, "grad_norm": 4.340318202972412, "learning_rate": 3.11968787515006e-05, "loss": 0.7005, "mean_token_accuracy": 0.7841826438903808, "num_tokens": 488271219.0, "step": 47040 }, { "entropy": 0.6509219318628311, "epoch": 0.3764, "grad_norm": 7.33812952041626, "learning_rate": 3.1192877150860344e-05, "loss": 0.6547, "mean_token_accuracy": 0.826807153224945, "num_tokens": 488308574.0, "step": 47050 }, { "entropy": 0.6720033705234527, "epoch": 0.37648, "grad_norm": 2.2931129932403564, "learning_rate": 3.118887555022009e-05, "loss": 0.6758, "mean_token_accuracy": 0.7922243714332581, "num_tokens": 488465736.0, "step": 47060 }, { "entropy": 0.6357047170400619, "epoch": 0.37656, "grad_norm": 4.497596740722656, "learning_rate": 3.118487394957984e-05, "loss": 0.6272, "mean_token_accuracy": 0.8170304417610168, "num_tokens": 488535722.0, "step": 47070 }, { "entropy": 0.6932304620742797, "epoch": 0.37664, "grad_norm": 1.4688531160354614, "learning_rate": 3.1180872348939575e-05, "loss": 0.6736, "mean_token_accuracy": 0.8051877558231354, "num_tokens": 488629181.0, "step": 47080 }, { "entropy": 0.6584198176860809, "epoch": 0.37672, "grad_norm": 2.9240822792053223, "learning_rate": 3.117687074829932e-05, "loss": 0.6537, "mean_token_accuracy": 0.7979111969470978, "num_tokens": 488767512.0, "step": 47090 }, { "entropy": 0.6793704867362976, "epoch": 0.3768, "grad_norm": 5.897231101989746, "learning_rate": 3.117286914765907e-05, "loss": 0.67, "mean_token_accuracy": 0.8199732899665833, "num_tokens": 488811359.0, "step": 47100 }, { "entropy": 0.6233887672424316, "epoch": 0.37688, "grad_norm": 2.0613980293273926, "learning_rate": 3.116886754701881e-05, "loss": 0.6196, "mean_token_accuracy": 0.8030288338661193, "num_tokens": 488975199.0, "step": 47110 }, { "entropy": 0.6563793361186981, "epoch": 0.37696, "grad_norm": 4.035504341125488, "learning_rate": 3.116486594637855e-05, "loss": 0.6518, "mean_token_accuracy": 0.8096658408641815, "num_tokens": 489062035.0, "step": 47120 }, { "entropy": 0.7330701470375061, "epoch": 0.37704, "grad_norm": 1.6729059219360352, "learning_rate": 3.116086434573829e-05, "loss": 0.7308, "mean_token_accuracy": 0.7929204940795899, "num_tokens": 489155774.0, "step": 47130 }, { "entropy": 0.6436300992965698, "epoch": 0.37712, "grad_norm": 2.3211207389831543, "learning_rate": 3.1156862745098044e-05, "loss": 0.634, "mean_token_accuracy": 0.8029018580913544, "num_tokens": 489293034.0, "step": 47140 }, { "entropy": 0.7098048448562622, "epoch": 0.3772, "grad_norm": 4.722684383392334, "learning_rate": 3.115286114445779e-05, "loss": 0.7074, "mean_token_accuracy": 0.8144096672534943, "num_tokens": 489334466.0, "step": 47150 }, { "entropy": 0.6692732989788055, "epoch": 0.37728, "grad_norm": 1.5045866966247559, "learning_rate": 3.1148859543817524e-05, "loss": 0.6732, "mean_token_accuracy": 0.7889755427837372, "num_tokens": 489497723.0, "step": 47160 }, { "entropy": 0.6355509221553802, "epoch": 0.37736, "grad_norm": 3.309433937072754, "learning_rate": 3.1144857943177275e-05, "loss": 0.6231, "mean_token_accuracy": 0.8173560619354248, "num_tokens": 489575686.0, "step": 47170 }, { "entropy": 0.7319754481315612, "epoch": 0.37744, "grad_norm": 2.543571949005127, "learning_rate": 3.114085634253702e-05, "loss": 0.7417, "mean_token_accuracy": 0.7948467671871186, "num_tokens": 489669284.0, "step": 47180 }, { "entropy": 0.7040830373764038, "epoch": 0.37752, "grad_norm": 2.5455334186553955, "learning_rate": 3.113685474189676e-05, "loss": 0.697, "mean_token_accuracy": 0.7909080982208252, "num_tokens": 489807076.0, "step": 47190 }, { "entropy": 0.6737659096717834, "epoch": 0.3776, "grad_norm": 8.308879852294922, "learning_rate": 3.11328531412565e-05, "loss": 0.6593, "mean_token_accuracy": 0.8200942277908325, "num_tokens": 489847332.0, "step": 47200 }, { "entropy": 0.6340750873088836, "epoch": 0.37768, "grad_norm": 1.8411058187484741, "learning_rate": 3.112885154061625e-05, "loss": 0.6363, "mean_token_accuracy": 0.7964764297008514, "num_tokens": 490007322.0, "step": 47210 }, { "entropy": 0.681373804807663, "epoch": 0.37776, "grad_norm": 4.934355735778809, "learning_rate": 3.1124849939975993e-05, "loss": 0.6745, "mean_token_accuracy": 0.8073569297790527, "num_tokens": 490074190.0, "step": 47220 }, { "entropy": 0.6574989378452301, "epoch": 0.37784, "grad_norm": 1.8218109607696533, "learning_rate": 3.112084833933574e-05, "loss": 0.6522, "mean_token_accuracy": 0.8095172822475434, "num_tokens": 490167267.0, "step": 47230 }, { "entropy": 0.6641658842563629, "epoch": 0.37792, "grad_norm": 2.188572406768799, "learning_rate": 3.111684673869548e-05, "loss": 0.6717, "mean_token_accuracy": 0.7964535892009735, "num_tokens": 490298809.0, "step": 47240 }, { "entropy": 0.675881564617157, "epoch": 0.378, "grad_norm": 5.044878959655762, "learning_rate": 3.1112845138055225e-05, "loss": 0.6634, "mean_token_accuracy": 0.821911895275116, "num_tokens": 490335320.0, "step": 47250 }, { "entropy": 0.6638094305992126, "epoch": 0.37808, "grad_norm": 1.7657814025878906, "learning_rate": 3.110884353741497e-05, "loss": 0.6569, "mean_token_accuracy": 0.7924543976783752, "num_tokens": 490497149.0, "step": 47260 }, { "entropy": 0.6367420941591263, "epoch": 0.37816, "grad_norm": 3.6044907569885254, "learning_rate": 3.110484193677471e-05, "loss": 0.6371, "mean_token_accuracy": 0.8180912375450134, "num_tokens": 490574195.0, "step": 47270 }, { "entropy": 0.6941019117832183, "epoch": 0.37824, "grad_norm": 2.0984976291656494, "learning_rate": 3.1100840336134456e-05, "loss": 0.6807, "mean_token_accuracy": 0.8034428417682647, "num_tokens": 490667732.0, "step": 47280 }, { "entropy": 0.6568644404411316, "epoch": 0.37832, "grad_norm": 1.9542829990386963, "learning_rate": 3.10968387354942e-05, "loss": 0.6583, "mean_token_accuracy": 0.7958950519561767, "num_tokens": 490811558.0, "step": 47290 }, { "entropy": 0.6966010570526123, "epoch": 0.3784, "grad_norm": 4.5647454261779785, "learning_rate": 3.109283713485394e-05, "loss": 0.6759, "mean_token_accuracy": 0.8181643903255462, "num_tokens": 490855532.0, "step": 47300 }, { "entropy": 0.6390933454036712, "epoch": 0.37848, "grad_norm": 2.4084041118621826, "learning_rate": 3.108883553421369e-05, "loss": 0.6408, "mean_token_accuracy": 0.7986077189445495, "num_tokens": 491019372.0, "step": 47310 }, { "entropy": 0.6493392884731293, "epoch": 0.37856, "grad_norm": 2.3016197681427, "learning_rate": 3.108483393357343e-05, "loss": 0.6425, "mean_token_accuracy": 0.8103923380374909, "num_tokens": 491129780.0, "step": 47320 }, { "entropy": 0.6841242849826813, "epoch": 0.37864, "grad_norm": 1.8924670219421387, "learning_rate": 3.1080832332933174e-05, "loss": 0.6758, "mean_token_accuracy": 0.8046363353729248, "num_tokens": 491225510.0, "step": 47330 }, { "entropy": 0.7807510077953339, "epoch": 0.37872, "grad_norm": 2.471247911453247, "learning_rate": 3.107683073229292e-05, "loss": 0.7806, "mean_token_accuracy": 0.7697577536106109, "num_tokens": 491366985.0, "step": 47340 }, { "entropy": 0.6774905860424042, "epoch": 0.3788, "grad_norm": 4.9473724365234375, "learning_rate": 3.107282913165266e-05, "loss": 0.6538, "mean_token_accuracy": 0.8245517432689666, "num_tokens": 491405217.0, "step": 47350 }, { "entropy": 0.6412428975105285, "epoch": 0.37888, "grad_norm": 1.9856549501419067, "learning_rate": 3.1068827531012405e-05, "loss": 0.6382, "mean_token_accuracy": 0.7958658993244171, "num_tokens": 491569057.0, "step": 47360 }, { "entropy": 0.6931989848613739, "epoch": 0.37896, "grad_norm": 2.8941426277160645, "learning_rate": 3.106482593037215e-05, "loss": 0.6989, "mean_token_accuracy": 0.8007992267608642, "num_tokens": 491654739.0, "step": 47370 }, { "entropy": 0.7192629337310791, "epoch": 0.37904, "grad_norm": 2.0861549377441406, "learning_rate": 3.10608243297319e-05, "loss": 0.7263, "mean_token_accuracy": 0.7961122751235962, "num_tokens": 491746070.0, "step": 47380 }, { "entropy": 0.6578749418258667, "epoch": 0.37912, "grad_norm": 2.6067328453063965, "learning_rate": 3.1056822729091637e-05, "loss": 0.6504, "mean_token_accuracy": 0.8062120378017426, "num_tokens": 491874958.0, "step": 47390 }, { "entropy": 0.6224983930587769, "epoch": 0.3792, "grad_norm": 4.004885196685791, "learning_rate": 3.105282112845138e-05, "loss": 0.6175, "mean_token_accuracy": 0.8318457663059234, "num_tokens": 491910860.0, "step": 47400 }, { "entropy": 0.6496350944042206, "epoch": 0.37928, "grad_norm": 1.6341928243637085, "learning_rate": 3.1048819527811124e-05, "loss": 0.6499, "mean_token_accuracy": 0.7941560804843902, "num_tokens": 492074700.0, "step": 47410 }, { "entropy": 0.6450999557971955, "epoch": 0.37936, "grad_norm": 4.526064872741699, "learning_rate": 3.1044817927170874e-05, "loss": 0.6409, "mean_token_accuracy": 0.8163668632507324, "num_tokens": 492167195.0, "step": 47420 }, { "entropy": 0.7127932012081146, "epoch": 0.37944, "grad_norm": 2.165472984313965, "learning_rate": 3.104081632653061e-05, "loss": 0.7196, "mean_token_accuracy": 0.7949965596199036, "num_tokens": 492260274.0, "step": 47430 }, { "entropy": 0.6736871063709259, "epoch": 0.37952, "grad_norm": 2.708587884902954, "learning_rate": 3.1036814725890355e-05, "loss": 0.6673, "mean_token_accuracy": 0.7921796798706054, "num_tokens": 492406832.0, "step": 47440 }, { "entropy": 0.6682861506938934, "epoch": 0.3796, "grad_norm": 6.044281482696533, "learning_rate": 3.1032813125250106e-05, "loss": 0.6614, "mean_token_accuracy": 0.821695762872696, "num_tokens": 492444596.0, "step": 47450 }, { "entropy": 0.6032351315021515, "epoch": 0.37968, "grad_norm": 1.5142052173614502, "learning_rate": 3.102881152460985e-05, "loss": 0.5948, "mean_token_accuracy": 0.8065583765506744, "num_tokens": 492608436.0, "step": 47460 }, { "entropy": 0.7117119669914246, "epoch": 0.37976, "grad_norm": 3.074207067489624, "learning_rate": 3.1024809923969586e-05, "loss": 0.7126, "mean_token_accuracy": 0.8005372226238251, "num_tokens": 492698873.0, "step": 47470 }, { "entropy": 0.6863802522420883, "epoch": 0.37984, "grad_norm": 1.7327218055725098, "learning_rate": 3.102080832332933e-05, "loss": 0.6994, "mean_token_accuracy": 0.800909161567688, "num_tokens": 492792225.0, "step": 47480 }, { "entropy": 0.6780973970890045, "epoch": 0.37992, "grad_norm": 2.101238965988159, "learning_rate": 3.101680672268908e-05, "loss": 0.6656, "mean_token_accuracy": 0.7947889387607574, "num_tokens": 492921733.0, "step": 47490 }, { "entropy": 0.6938665866851806, "epoch": 0.38, "grad_norm": 5.347146511077881, "learning_rate": 3.1012805122048824e-05, "loss": 0.6856, "mean_token_accuracy": 0.8181631386280059, "num_tokens": 492959246.0, "step": 47500 }, { "entropy": 0.6423829615116119, "epoch": 0.38008, "grad_norm": 1.9629343748092651, "learning_rate": 3.100880352140856e-05, "loss": 0.6375, "mean_token_accuracy": 0.7977100729942321, "num_tokens": 493123086.0, "step": 47510 }, { "entropy": 0.7039921969175339, "epoch": 0.38016, "grad_norm": 2.7047550678253174, "learning_rate": 3.100480192076831e-05, "loss": 0.6988, "mean_token_accuracy": 0.7980874657630921, "num_tokens": 493221313.0, "step": 47520 }, { "entropy": 0.6843528985977173, "epoch": 0.38024, "grad_norm": 1.6408367156982422, "learning_rate": 3.1000800320128055e-05, "loss": 0.6812, "mean_token_accuracy": 0.8041348874568939, "num_tokens": 493316926.0, "step": 47530 }, { "entropy": 0.702528440952301, "epoch": 0.38032, "grad_norm": 3.3319780826568604, "learning_rate": 3.09967987194878e-05, "loss": 0.6924, "mean_token_accuracy": 0.7884878993034363, "num_tokens": 493454903.0, "step": 47540 }, { "entropy": 0.689072635769844, "epoch": 0.3804, "grad_norm": 4.862944602966309, "learning_rate": 3.0992797118847536e-05, "loss": 0.6897, "mean_token_accuracy": 0.8131682336330414, "num_tokens": 493497228.0, "step": 47550 }, { "entropy": 0.6439039230346679, "epoch": 0.38048, "grad_norm": 1.378849744796753, "learning_rate": 3.0988795518207286e-05, "loss": 0.6435, "mean_token_accuracy": 0.7939451813697815, "num_tokens": 493661051.0, "step": 47560 }, { "entropy": 0.6109539657831192, "epoch": 0.38056, "grad_norm": 4.1173996925354, "learning_rate": 3.098479391756703e-05, "loss": 0.6013, "mean_token_accuracy": 0.8216832637786865, "num_tokens": 493739304.0, "step": 47570 }, { "entropy": 0.6706855535507202, "epoch": 0.38064, "grad_norm": 1.5539387464523315, "learning_rate": 3.0980792316926774e-05, "loss": 0.6675, "mean_token_accuracy": 0.8059275448322296, "num_tokens": 493831765.0, "step": 47580 }, { "entropy": 0.7064958989620209, "epoch": 0.38072, "grad_norm": 2.9794652462005615, "learning_rate": 3.097679071628651e-05, "loss": 0.6957, "mean_token_accuracy": 0.7890029609203338, "num_tokens": 493976408.0, "step": 47590 }, { "entropy": 0.7094851493835449, "epoch": 0.3808, "grad_norm": 5.462306976318359, "learning_rate": 3.097278911564626e-05, "loss": 0.7201, "mean_token_accuracy": 0.8046626031398774, "num_tokens": 494014958.0, "step": 47600 }, { "entropy": 0.6524434864521027, "epoch": 0.38088, "grad_norm": 1.7676807641983032, "learning_rate": 3.0968787515006005e-05, "loss": 0.6473, "mean_token_accuracy": 0.7942110478878022, "num_tokens": 494178798.0, "step": 47610 }, { "entropy": 0.6918013691902161, "epoch": 0.38096, "grad_norm": 4.068403720855713, "learning_rate": 3.096478591436575e-05, "loss": 0.6733, "mean_token_accuracy": 0.8089974582195282, "num_tokens": 494257731.0, "step": 47620 }, { "entropy": 0.6995983123779297, "epoch": 0.38104, "grad_norm": 2.7413346767425537, "learning_rate": 3.096078431372549e-05, "loss": 0.7097, "mean_token_accuracy": 0.7970587074756622, "num_tokens": 494352020.0, "step": 47630 }, { "entropy": 0.7165526032447815, "epoch": 0.38112, "grad_norm": 4.0982842445373535, "learning_rate": 3.0956782713085236e-05, "loss": 0.7157, "mean_token_accuracy": 0.7850749671459198, "num_tokens": 494485987.0, "step": 47640 }, { "entropy": 0.732045766711235, "epoch": 0.3812, "grad_norm": 4.637330055236816, "learning_rate": 3.095278111244498e-05, "loss": 0.7237, "mean_token_accuracy": 0.8071612536907196, "num_tokens": 494520937.0, "step": 47650 }, { "entropy": 0.7171895027160644, "epoch": 0.38128, "grad_norm": 1.8214032649993896, "learning_rate": 3.0948779511804724e-05, "loss": 0.7085, "mean_token_accuracy": 0.7809659779071808, "num_tokens": 494683615.0, "step": 47660 }, { "entropy": 0.6774268805980682, "epoch": 0.38136, "grad_norm": 3.1534271240234375, "learning_rate": 3.094477791116447e-05, "loss": 0.6769, "mean_token_accuracy": 0.8052613735198975, "num_tokens": 494762629.0, "step": 47670 }, { "entropy": 0.7698775589466095, "epoch": 0.38144, "grad_norm": 2.062807321548462, "learning_rate": 3.094077631052421e-05, "loss": 0.7693, "mean_token_accuracy": 0.7872285604476928, "num_tokens": 494855458.0, "step": 47680 }, { "entropy": 0.6212342977523804, "epoch": 0.38152, "grad_norm": 3.139636516571045, "learning_rate": 3.0936774709883955e-05, "loss": 0.6213, "mean_token_accuracy": 0.8079854249954224, "num_tokens": 494999552.0, "step": 47690 }, { "entropy": 0.6557882130146027, "epoch": 0.3816, "grad_norm": 4.820474147796631, "learning_rate": 3.09327731092437e-05, "loss": 0.6401, "mean_token_accuracy": 0.8242540717124939, "num_tokens": 495042161.0, "step": 47700 }, { "entropy": 0.703055202960968, "epoch": 0.38168, "grad_norm": 1.551080346107483, "learning_rate": 3.092877150860344e-05, "loss": 0.6982, "mean_token_accuracy": 0.7840035855770111, "num_tokens": 495204320.0, "step": 47710 }, { "entropy": 0.650077474117279, "epoch": 0.38176, "grad_norm": 2.887556552886963, "learning_rate": 3.0924769907963186e-05, "loss": 0.6537, "mean_token_accuracy": 0.8102268159389496, "num_tokens": 495283476.0, "step": 47720 }, { "entropy": 0.6378835260868072, "epoch": 0.38184, "grad_norm": 1.2483644485473633, "learning_rate": 3.092076830732293e-05, "loss": 0.6289, "mean_token_accuracy": 0.8163421094417572, "num_tokens": 495378228.0, "step": 47730 }, { "entropy": 0.7128715425729751, "epoch": 0.38192, "grad_norm": 2.83795166015625, "learning_rate": 3.091676670668267e-05, "loss": 0.7073, "mean_token_accuracy": 0.7895039021968842, "num_tokens": 495510861.0, "step": 47740 }, { "entropy": 0.6829171478748322, "epoch": 0.382, "grad_norm": 5.237654209136963, "learning_rate": 3.091276510604242e-05, "loss": 0.6907, "mean_token_accuracy": 0.8180537164211273, "num_tokens": 495548391.0, "step": 47750 }, { "entropy": 0.7029347240924835, "epoch": 0.38208, "grad_norm": 2.43966007232666, "learning_rate": 3.090876350540216e-05, "loss": 0.6927, "mean_token_accuracy": 0.7864694654941559, "num_tokens": 495711849.0, "step": 47760 }, { "entropy": 0.6312734186649323, "epoch": 0.38216, "grad_norm": 3.0667331218719482, "learning_rate": 3.090476190476191e-05, "loss": 0.6303, "mean_token_accuracy": 0.8181399285793305, "num_tokens": 495794463.0, "step": 47770 }, { "entropy": 0.6176040530204773, "epoch": 0.38224, "grad_norm": 1.3437877893447876, "learning_rate": 3.090076030412165e-05, "loss": 0.6094, "mean_token_accuracy": 0.8190242290496826, "num_tokens": 495887844.0, "step": 47780 }, { "entropy": 0.7360144138336182, "epoch": 0.38232, "grad_norm": 2.615665912628174, "learning_rate": 3.089675870348139e-05, "loss": 0.7324, "mean_token_accuracy": 0.7815007150173188, "num_tokens": 496024289.0, "step": 47790 }, { "entropy": 0.6516565680503845, "epoch": 0.3824, "grad_norm": 5.332898139953613, "learning_rate": 3.0892757102841136e-05, "loss": 0.6592, "mean_token_accuracy": 0.8159472227096558, "num_tokens": 496066737.0, "step": 47800 }, { "entropy": 0.6707772195339203, "epoch": 0.38248, "grad_norm": 1.4100356101989746, "learning_rate": 3.0888755502200886e-05, "loss": 0.666, "mean_token_accuracy": 0.7889838814735413, "num_tokens": 496230577.0, "step": 47810 }, { "entropy": 0.7404169052839279, "epoch": 0.38256, "grad_norm": 3.4007740020751953, "learning_rate": 3.088475390156062e-05, "loss": 0.7349, "mean_token_accuracy": 0.7911709725856781, "num_tokens": 496316487.0, "step": 47820 }, { "entropy": 0.6761103928089142, "epoch": 0.38264, "grad_norm": 1.6368077993392944, "learning_rate": 3.088075230092037e-05, "loss": 0.6711, "mean_token_accuracy": 0.8092295408248902, "num_tokens": 496409961.0, "step": 47830 }, { "entropy": 0.6747102618217469, "epoch": 0.38272, "grad_norm": 2.9662442207336426, "learning_rate": 3.087675070028012e-05, "loss": 0.666, "mean_token_accuracy": 0.7932985901832581, "num_tokens": 496547935.0, "step": 47840 }, { "entropy": 0.7200356602668763, "epoch": 0.3828, "grad_norm": 5.830690383911133, "learning_rate": 3.087274909963986e-05, "loss": 0.7222, "mean_token_accuracy": 0.8057998955249787, "num_tokens": 496590663.0, "step": 47850 }, { "entropy": 0.6584632813930511, "epoch": 0.38288, "grad_norm": 2.8776912689208984, "learning_rate": 3.08687474989996e-05, "loss": 0.6591, "mean_token_accuracy": 0.7934243738651275, "num_tokens": 496753861.0, "step": 47860 }, { "entropy": 0.7152874678373337, "epoch": 0.38296, "grad_norm": 3.734970808029175, "learning_rate": 3.086474589835934e-05, "loss": 0.7056, "mean_token_accuracy": 0.8047423422336578, "num_tokens": 496823947.0, "step": 47870 }, { "entropy": 0.7290513515472412, "epoch": 0.38304, "grad_norm": 1.8663384914398193, "learning_rate": 3.086074429771909e-05, "loss": 0.7295, "mean_token_accuracy": 0.7933045744895935, "num_tokens": 496915695.0, "step": 47880 }, { "entropy": 0.6743270218372345, "epoch": 0.38312, "grad_norm": 2.585796356201172, "learning_rate": 3.0856742697078836e-05, "loss": 0.6672, "mean_token_accuracy": 0.7959040760993957, "num_tokens": 497041065.0, "step": 47890 }, { "entropy": 0.6788066804409028, "epoch": 0.3832, "grad_norm": 4.123997211456299, "learning_rate": 3.085274109643857e-05, "loss": 0.6823, "mean_token_accuracy": 0.8178009867668152, "num_tokens": 497072079.0, "step": 47900 }, { "entropy": 0.6233389139175415, "epoch": 0.38328, "grad_norm": 1.3769493103027344, "learning_rate": 3.084873949579832e-05, "loss": 0.6184, "mean_token_accuracy": 0.8012152016162872, "num_tokens": 497235919.0, "step": 47910 }, { "entropy": 0.6564417153596878, "epoch": 0.38336, "grad_norm": 2.974011182785034, "learning_rate": 3.084473789515807e-05, "loss": 0.6482, "mean_token_accuracy": 0.8108912408351898, "num_tokens": 497332469.0, "step": 47920 }, { "entropy": 0.7019716739654541, "epoch": 0.38344, "grad_norm": 2.017162799835205, "learning_rate": 3.084073629451781e-05, "loss": 0.6922, "mean_token_accuracy": 0.7952846884727478, "num_tokens": 497426518.0, "step": 47930 }, { "entropy": 0.6797643005847931, "epoch": 0.38352, "grad_norm": 2.1489672660827637, "learning_rate": 3.083673469387755e-05, "loss": 0.6748, "mean_token_accuracy": 0.7939391493797302, "num_tokens": 497565284.0, "step": 47940 }, { "entropy": 0.5921574264764786, "epoch": 0.3836, "grad_norm": 4.470216274261475, "learning_rate": 3.08327330932373e-05, "loss": 0.5855, "mean_token_accuracy": 0.8364182710647583, "num_tokens": 497608336.0, "step": 47950 }, { "entropy": 0.695678997039795, "epoch": 0.38368, "grad_norm": 2.350497007369995, "learning_rate": 3.082873149259704e-05, "loss": 0.6987, "mean_token_accuracy": 0.7821934580802917, "num_tokens": 497772176.0, "step": 47960 }, { "entropy": 0.6512634843587876, "epoch": 0.38376, "grad_norm": 3.4631688594818115, "learning_rate": 3.0824729891956785e-05, "loss": 0.6434, "mean_token_accuracy": 0.8132407069206238, "num_tokens": 497853948.0, "step": 47970 }, { "entropy": 0.7155575722455978, "epoch": 0.38384, "grad_norm": 2.2062971591949463, "learning_rate": 3.082072829131653e-05, "loss": 0.7111, "mean_token_accuracy": 0.7976363062858581, "num_tokens": 497947606.0, "step": 47980 }, { "entropy": 0.6715629696846008, "epoch": 0.38392, "grad_norm": 3.3448116779327393, "learning_rate": 3.081672669067627e-05, "loss": 0.6591, "mean_token_accuracy": 0.7948856711387634, "num_tokens": 498081638.0, "step": 47990 }, { "entropy": 0.6919441074132919, "epoch": 0.384, "grad_norm": 4.317596912384033, "learning_rate": 3.081272509003602e-05, "loss": 0.6868, "mean_token_accuracy": 0.8145568311214447, "num_tokens": 498118214.0, "step": 48000 }, { "entropy": 0.6327556014060974, "epoch": 0.38408, "grad_norm": 1.8066585063934326, "learning_rate": 3.080872348939576e-05, "loss": 0.6348, "mean_token_accuracy": 0.7988578617572785, "num_tokens": 498280905.0, "step": 48010 }, { "entropy": 0.6542555630207062, "epoch": 0.38416, "grad_norm": 3.143749952316284, "learning_rate": 3.0804721888755504e-05, "loss": 0.6529, "mean_token_accuracy": 0.8165346920490265, "num_tokens": 498348755.0, "step": 48020 }, { "entropy": 0.7089971840381623, "epoch": 0.38424, "grad_norm": 1.9127529859542847, "learning_rate": 3.080072028811525e-05, "loss": 0.7037, "mean_token_accuracy": 0.8024952709674835, "num_tokens": 498442438.0, "step": 48030 }, { "entropy": 0.7075353562831879, "epoch": 0.38432, "grad_norm": 2.0121891498565674, "learning_rate": 3.079671868747499e-05, "loss": 0.709, "mean_token_accuracy": 0.7850686192512513, "num_tokens": 498582487.0, "step": 48040 }, { "entropy": 0.7688754916191101, "epoch": 0.3844, "grad_norm": 4.85783576965332, "learning_rate": 3.0792717086834735e-05, "loss": 0.7648, "mean_token_accuracy": 0.7974575281143188, "num_tokens": 498620423.0, "step": 48050 }, { "entropy": 0.6390243232250213, "epoch": 0.38448, "grad_norm": 1.6146525144577026, "learning_rate": 3.078871548619448e-05, "loss": 0.6438, "mean_token_accuracy": 0.7942810773849487, "num_tokens": 498783146.0, "step": 48060 }, { "entropy": 0.7262115925550461, "epoch": 0.38456, "grad_norm": 4.824716091156006, "learning_rate": 3.078471388555422e-05, "loss": 0.7019, "mean_token_accuracy": 0.8025564908981323, "num_tokens": 498856647.0, "step": 48070 }, { "entropy": 0.6980925858020782, "epoch": 0.38464, "grad_norm": 2.0552637577056885, "learning_rate": 3.0780712284913966e-05, "loss": 0.6993, "mean_token_accuracy": 0.7984511613845825, "num_tokens": 498948842.0, "step": 48080 }, { "entropy": 0.6865625321865082, "epoch": 0.38472, "grad_norm": 3.635403633117676, "learning_rate": 3.077671068427371e-05, "loss": 0.6802, "mean_token_accuracy": 0.8017281770706177, "num_tokens": 499081075.0, "step": 48090 }, { "entropy": 0.6636535316705704, "epoch": 0.3848, "grad_norm": 3.79750657081604, "learning_rate": 3.0772709083633454e-05, "loss": 0.6387, "mean_token_accuracy": 0.8272818207740784, "num_tokens": 499117562.0, "step": 48100 }, { "entropy": 0.6578599154949188, "epoch": 0.38488, "grad_norm": 2.436189889907837, "learning_rate": 3.07687074829932e-05, "loss": 0.6593, "mean_token_accuracy": 0.7902906775474549, "num_tokens": 499281402.0, "step": 48110 }, { "entropy": 0.7051636040210724, "epoch": 0.38496, "grad_norm": 3.42498517036438, "learning_rate": 3.076470588235294e-05, "loss": 0.7045, "mean_token_accuracy": 0.7976541697978974, "num_tokens": 499378701.0, "step": 48120 }, { "entropy": 0.6969667315483093, "epoch": 0.38504, "grad_norm": 1.9519245624542236, "learning_rate": 3.0760704281712685e-05, "loss": 0.6894, "mean_token_accuracy": 0.8034564137458802, "num_tokens": 499473970.0, "step": 48130 }, { "entropy": 0.7217585444450378, "epoch": 0.38512, "grad_norm": 2.6680338382720947, "learning_rate": 3.075670268107243e-05, "loss": 0.7209, "mean_token_accuracy": 0.7822882175445557, "num_tokens": 499620757.0, "step": 48140 }, { "entropy": 0.7052214503288269, "epoch": 0.3852, "grad_norm": 4.952423572540283, "learning_rate": 3.075270108043217e-05, "loss": 0.7064, "mean_token_accuracy": 0.8103481411933899, "num_tokens": 499660140.0, "step": 48150 }, { "entropy": 0.6022129833698273, "epoch": 0.38528, "grad_norm": 1.427315354347229, "learning_rate": 3.074869947979192e-05, "loss": 0.6035, "mean_token_accuracy": 0.8040669322013855, "num_tokens": 499823980.0, "step": 48160 }, { "entropy": 0.7247429490089417, "epoch": 0.38536, "grad_norm": 3.302182197570801, "learning_rate": 3.074469787915166e-05, "loss": 0.7146, "mean_token_accuracy": 0.7940555334091186, "num_tokens": 499902348.0, "step": 48170 }, { "entropy": 0.720578408241272, "epoch": 0.38544, "grad_norm": 1.436213731765747, "learning_rate": 3.0740696278511403e-05, "loss": 0.7324, "mean_token_accuracy": 0.7939706981182099, "num_tokens": 499993363.0, "step": 48180 }, { "entropy": 0.6804767489433289, "epoch": 0.38552, "grad_norm": 2.3920156955718994, "learning_rate": 3.073669467787115e-05, "loss": 0.6736, "mean_token_accuracy": 0.791608989238739, "num_tokens": 500135377.0, "step": 48190 }, { "entropy": 0.6744466155767441, "epoch": 0.3856, "grad_norm": 4.9957804679870605, "learning_rate": 3.07326930772309e-05, "loss": 0.6685, "mean_token_accuracy": 0.821891862154007, "num_tokens": 500174294.0, "step": 48200 }, { "entropy": 0.7165554165840149, "epoch": 0.38568, "grad_norm": 1.6668870449066162, "learning_rate": 3.0728691476590635e-05, "loss": 0.7161, "mean_token_accuracy": 0.7802772223949432, "num_tokens": 500337121.0, "step": 48210 }, { "entropy": 0.6114641666412354, "epoch": 0.38576, "grad_norm": 4.663923740386963, "learning_rate": 3.072468987595038e-05, "loss": 0.6115, "mean_token_accuracy": 0.822533106803894, "num_tokens": 500406263.0, "step": 48220 }, { "entropy": 0.7299150884151459, "epoch": 0.38584, "grad_norm": 2.1439340114593506, "learning_rate": 3.072068827531013e-05, "loss": 0.7312, "mean_token_accuracy": 0.7959636688232422, "num_tokens": 500498159.0, "step": 48230 }, { "entropy": 0.7180911481380463, "epoch": 0.38592, "grad_norm": 4.2758355140686035, "learning_rate": 3.071668667466987e-05, "loss": 0.7027, "mean_token_accuracy": 0.7904087662696838, "num_tokens": 500631226.0, "step": 48240 }, { "entropy": 0.7041403204202652, "epoch": 0.386, "grad_norm": 6.279050350189209, "learning_rate": 3.071268507402961e-05, "loss": 0.7041, "mean_token_accuracy": 0.8081238090991973, "num_tokens": 500668077.0, "step": 48250 }, { "entropy": 0.6934479951858521, "epoch": 0.38608, "grad_norm": 2.423851251602173, "learning_rate": 3.070868347338935e-05, "loss": 0.6915, "mean_token_accuracy": 0.7846116304397583, "num_tokens": 500831917.0, "step": 48260 }, { "entropy": 0.6309905409812927, "epoch": 0.38616, "grad_norm": 3.320765495300293, "learning_rate": 3.0704681872749104e-05, "loss": 0.6271, "mean_token_accuracy": 0.8153949439525604, "num_tokens": 500918700.0, "step": 48270 }, { "entropy": 0.6364252626895904, "epoch": 0.38624, "grad_norm": 1.3435062170028687, "learning_rate": 3.070068027210885e-05, "loss": 0.6415, "mean_token_accuracy": 0.8139662086963654, "num_tokens": 501012262.0, "step": 48280 }, { "entropy": 0.712127959728241, "epoch": 0.38632, "grad_norm": 2.2842862606048584, "learning_rate": 3.0696678671468584e-05, "loss": 0.6954, "mean_token_accuracy": 0.7870476305484772, "num_tokens": 501158745.0, "step": 48290 }, { "entropy": 0.6543712437152862, "epoch": 0.3864, "grad_norm": 5.30435037612915, "learning_rate": 3.0692677070828335e-05, "loss": 0.644, "mean_token_accuracy": 0.825432550907135, "num_tokens": 501203242.0, "step": 48300 }, { "entropy": 0.6521146237850189, "epoch": 0.38648, "grad_norm": 2.079911470413208, "learning_rate": 3.068867547018808e-05, "loss": 0.6634, "mean_token_accuracy": 0.7907730937004089, "num_tokens": 501367082.0, "step": 48310 }, { "entropy": 0.6913768887519837, "epoch": 0.38656, "grad_norm": 3.1190054416656494, "learning_rate": 3.068467386954782e-05, "loss": 0.6717, "mean_token_accuracy": 0.8044924139976501, "num_tokens": 501456391.0, "step": 48320 }, { "entropy": 0.7225266456604004, "epoch": 0.38664, "grad_norm": 2.215775489807129, "learning_rate": 3.068067226890756e-05, "loss": 0.7216, "mean_token_accuracy": 0.7945435345172882, "num_tokens": 501551573.0, "step": 48330 }, { "entropy": 0.6335220903158187, "epoch": 0.38672, "grad_norm": 1.9081710577011108, "learning_rate": 3.067667066826731e-05, "loss": 0.6365, "mean_token_accuracy": 0.8037149548530579, "num_tokens": 501694566.0, "step": 48340 }, { "entropy": 0.6479276955127716, "epoch": 0.3868, "grad_norm": 5.367987632751465, "learning_rate": 3.067266906762705e-05, "loss": 0.645, "mean_token_accuracy": 0.8223012745380401, "num_tokens": 501734414.0, "step": 48350 }, { "entropy": 0.6411489307880401, "epoch": 0.38688, "grad_norm": 1.7035455703735352, "learning_rate": 3.06686674669868e-05, "loss": 0.6398, "mean_token_accuracy": 0.8003437757492066, "num_tokens": 501895598.0, "step": 48360 }, { "entropy": 0.698896872997284, "epoch": 0.38696, "grad_norm": 3.2234413623809814, "learning_rate": 3.066466586634654e-05, "loss": 0.6818, "mean_token_accuracy": 0.8101977109909058, "num_tokens": 501971971.0, "step": 48370 }, { "entropy": 0.6294907867908478, "epoch": 0.38704, "grad_norm": 3.0727500915527344, "learning_rate": 3.0660664265706285e-05, "loss": 0.6278, "mean_token_accuracy": 0.8145390570163726, "num_tokens": 502065614.0, "step": 48380 }, { "entropy": 0.6614555954933167, "epoch": 0.38712, "grad_norm": 2.129901647567749, "learning_rate": 3.065666266506603e-05, "loss": 0.656, "mean_token_accuracy": 0.7992174386978149, "num_tokens": 502203172.0, "step": 48390 }, { "entropy": 0.6809174537658691, "epoch": 0.3872, "grad_norm": 5.910553455352783, "learning_rate": 3.065266106442577e-05, "loss": 0.7038, "mean_token_accuracy": 0.8152912735939026, "num_tokens": 502242425.0, "step": 48400 }, { "entropy": 0.587997430562973, "epoch": 0.38728, "grad_norm": 1.7444790601730347, "learning_rate": 3.0648659463785516e-05, "loss": 0.5827, "mean_token_accuracy": 0.809403920173645, "num_tokens": 502406162.0, "step": 48410 }, { "entropy": 0.6538594126701355, "epoch": 0.38736, "grad_norm": 3.2323272228240967, "learning_rate": 3.064465786314526e-05, "loss": 0.6453, "mean_token_accuracy": 0.8190969347953796, "num_tokens": 502477841.0, "step": 48420 }, { "entropy": 0.6797468602657318, "epoch": 0.38744, "grad_norm": 2.0683646202087402, "learning_rate": 3.0640656262505e-05, "loss": 0.6811, "mean_token_accuracy": 0.8019954681396484, "num_tokens": 502570889.0, "step": 48430 }, { "entropy": 0.7038350105285645, "epoch": 0.38752, "grad_norm": 2.190307855606079, "learning_rate": 3.063665466186475e-05, "loss": 0.6937, "mean_token_accuracy": 0.7902517259120941, "num_tokens": 502713523.0, "step": 48440 }, { "entropy": 0.6425594002008438, "epoch": 0.3876, "grad_norm": 4.746911525726318, "learning_rate": 3.063265306122449e-05, "loss": 0.6382, "mean_token_accuracy": 0.8264827013015748, "num_tokens": 502751972.0, "step": 48450 }, { "entropy": 0.6589545488357544, "epoch": 0.38768, "grad_norm": 1.7327876091003418, "learning_rate": 3.0628651460584234e-05, "loss": 0.6593, "mean_token_accuracy": 0.7918356239795685, "num_tokens": 502915812.0, "step": 48460 }, { "entropy": 0.6277533918619156, "epoch": 0.38776, "grad_norm": 3.320112466812134, "learning_rate": 3.062464985994398e-05, "loss": 0.6209, "mean_token_accuracy": 0.8139225840568542, "num_tokens": 503011215.0, "step": 48470 }, { "entropy": 0.7235270917415619, "epoch": 0.38784, "grad_norm": 1.4893848896026611, "learning_rate": 3.062064825930372e-05, "loss": 0.7299, "mean_token_accuracy": 0.7928475618362427, "num_tokens": 503106719.0, "step": 48480 }, { "entropy": 0.6460437506437302, "epoch": 0.38792, "grad_norm": 2.5194554328918457, "learning_rate": 3.0616646658663465e-05, "loss": 0.6437, "mean_token_accuracy": 0.8025272130966187, "num_tokens": 503243696.0, "step": 48490 }, { "entropy": 0.7120697349309921, "epoch": 0.388, "grad_norm": 3.9916858673095703, "learning_rate": 3.061264505802321e-05, "loss": 0.6992, "mean_token_accuracy": 0.8152953624725342, "num_tokens": 503282960.0, "step": 48500 }, { "entropy": 0.6233035355806351, "epoch": 0.38808, "grad_norm": 2.5811386108398438, "learning_rate": 3.060864345738296e-05, "loss": 0.6238, "mean_token_accuracy": 0.8034897327423096, "num_tokens": 503446542.0, "step": 48510 }, { "entropy": 0.6735898673534393, "epoch": 0.38816, "grad_norm": 3.7771127223968506, "learning_rate": 3.0604641856742697e-05, "loss": 0.6538, "mean_token_accuracy": 0.8107807278633118, "num_tokens": 503527463.0, "step": 48520 }, { "entropy": 0.6820917665958405, "epoch": 0.38824, "grad_norm": 2.033005952835083, "learning_rate": 3.060064025610244e-05, "loss": 0.6769, "mean_token_accuracy": 0.8048844993114471, "num_tokens": 503621160.0, "step": 48530 }, { "entropy": 0.6969133734703064, "epoch": 0.38832, "grad_norm": 2.295590877532959, "learning_rate": 3.0596638655462184e-05, "loss": 0.7, "mean_token_accuracy": 0.7909593403339386, "num_tokens": 503741935.0, "step": 48540 }, { "entropy": 0.6893895149230957, "epoch": 0.3884, "grad_norm": 5.3172478675842285, "learning_rate": 3.0592637054821934e-05, "loss": 0.6968, "mean_token_accuracy": 0.8213807940483093, "num_tokens": 503773686.0, "step": 48550 }, { "entropy": 0.6825809359550477, "epoch": 0.38848, "grad_norm": 1.7162134647369385, "learning_rate": 3.058863545418167e-05, "loss": 0.6735, "mean_token_accuracy": 0.7928798258304596, "num_tokens": 503937526.0, "step": 48560 }, { "entropy": 0.7111503958702088, "epoch": 0.38856, "grad_norm": 3.0871880054473877, "learning_rate": 3.0584633853541415e-05, "loss": 0.7004, "mean_token_accuracy": 0.8018642485141754, "num_tokens": 504026329.0, "step": 48570 }, { "entropy": 0.6682964384555816, "epoch": 0.38864, "grad_norm": 1.6602338552474976, "learning_rate": 3.0580632252901166e-05, "loss": 0.6672, "mean_token_accuracy": 0.8069787442684173, "num_tokens": 504120089.0, "step": 48580 }, { "entropy": 0.67965127825737, "epoch": 0.38872, "grad_norm": 2.5639007091522217, "learning_rate": 3.057663065226091e-05, "loss": 0.6787, "mean_token_accuracy": 0.7985149443149566, "num_tokens": 504247702.0, "step": 48590 }, { "entropy": 0.6866971909999847, "epoch": 0.3888, "grad_norm": 5.016125679016113, "learning_rate": 3.0572629051620646e-05, "loss": 0.6803, "mean_token_accuracy": 0.8169836699962616, "num_tokens": 504284554.0, "step": 48600 }, { "entropy": 0.6514616668224334, "epoch": 0.38888, "grad_norm": 1.7279775142669678, "learning_rate": 3.056862745098039e-05, "loss": 0.6444, "mean_token_accuracy": 0.798949682712555, "num_tokens": 504448394.0, "step": 48610 }, { "entropy": 0.6734361678361893, "epoch": 0.38896, "grad_norm": 2.462067127227783, "learning_rate": 3.056462585034014e-05, "loss": 0.6736, "mean_token_accuracy": 0.8060293972492218, "num_tokens": 504551951.0, "step": 48620 }, { "entropy": 0.6981667697429657, "epoch": 0.38904, "grad_norm": 2.1663291454315186, "learning_rate": 3.0560624249699884e-05, "loss": 0.6783, "mean_token_accuracy": 0.7983262419700623, "num_tokens": 504648895.0, "step": 48630 }, { "entropy": 0.6548638582229614, "epoch": 0.38912, "grad_norm": 3.411405563354492, "learning_rate": 3.055662264905962e-05, "loss": 0.6492, "mean_token_accuracy": 0.7976437866687774, "num_tokens": 504784841.0, "step": 48640 }, { "entropy": 0.6914079815149308, "epoch": 0.3892, "grad_norm": 4.240436553955078, "learning_rate": 3.0552621048419365e-05, "loss": 0.7025, "mean_token_accuracy": 0.8113301515579223, "num_tokens": 504821151.0, "step": 48650 }, { "entropy": 0.6432196021080017, "epoch": 0.38928, "grad_norm": 1.9965956211090088, "learning_rate": 3.0548619447779115e-05, "loss": 0.6381, "mean_token_accuracy": 0.7947423160076141, "num_tokens": 504984991.0, "step": 48660 }, { "entropy": 0.6399341642856597, "epoch": 0.38936, "grad_norm": 3.031916379928589, "learning_rate": 3.054461784713886e-05, "loss": 0.6347, "mean_token_accuracy": 0.8198915004730225, "num_tokens": 505059492.0, "step": 48670 }, { "entropy": 0.6713644802570343, "epoch": 0.38944, "grad_norm": 1.5341894626617432, "learning_rate": 3.0540616246498596e-05, "loss": 0.683, "mean_token_accuracy": 0.79993936419487, "num_tokens": 505154193.0, "step": 48680 }, { "entropy": 0.7117133677005768, "epoch": 0.38952, "grad_norm": 3.2797954082489014, "learning_rate": 3.0536614645858346e-05, "loss": 0.6866, "mean_token_accuracy": 0.7921667277812958, "num_tokens": 505290852.0, "step": 48690 }, { "entropy": 0.6367153912782669, "epoch": 0.3896, "grad_norm": 4.177088260650635, "learning_rate": 3.053261304521809e-05, "loss": 0.6382, "mean_token_accuracy": 0.8291225492954254, "num_tokens": 505328765.0, "step": 48700 }, { "entropy": 0.6528141319751739, "epoch": 0.38968, "grad_norm": 2.1007730960845947, "learning_rate": 3.0528611444577834e-05, "loss": 0.6552, "mean_token_accuracy": 0.7943983495235443, "num_tokens": 505491810.0, "step": 48710 }, { "entropy": 0.6352919667959214, "epoch": 0.38976, "grad_norm": 3.198702573776245, "learning_rate": 3.052460984393757e-05, "loss": 0.6261, "mean_token_accuracy": 0.8216610074043273, "num_tokens": 505564044.0, "step": 48720 }, { "entropy": 0.672950166463852, "epoch": 0.38984, "grad_norm": 1.893319010734558, "learning_rate": 3.052060824329732e-05, "loss": 0.6706, "mean_token_accuracy": 0.8109577894210815, "num_tokens": 505656498.0, "step": 48730 }, { "entropy": 0.7158263444900512, "epoch": 0.38992, "grad_norm": 1.9835786819458008, "learning_rate": 3.0516606642657065e-05, "loss": 0.7149, "mean_token_accuracy": 0.779504907131195, "num_tokens": 505803883.0, "step": 48740 }, { "entropy": 0.7286270797252655, "epoch": 0.39, "grad_norm": 6.306650161743164, "learning_rate": 3.0512605042016805e-05, "loss": 0.7233, "mean_token_accuracy": 0.8126713037490845, "num_tokens": 505844283.0, "step": 48750 }, { "entropy": 0.6757247686386109, "epoch": 0.39008, "grad_norm": 1.8555816411972046, "learning_rate": 3.0508603441376556e-05, "loss": 0.6758, "mean_token_accuracy": 0.7889411330223084, "num_tokens": 506008123.0, "step": 48760 }, { "entropy": 0.6942565500736236, "epoch": 0.39016, "grad_norm": 2.965942621231079, "learning_rate": 3.0504601840736296e-05, "loss": 0.6884, "mean_token_accuracy": 0.8010708570480347, "num_tokens": 506094640.0, "step": 48770 }, { "entropy": 0.7064875841140748, "epoch": 0.39024, "grad_norm": 2.061819076538086, "learning_rate": 3.050060024009604e-05, "loss": 0.7043, "mean_token_accuracy": 0.7991675674915314, "num_tokens": 506188844.0, "step": 48780 }, { "entropy": 0.7335965573787689, "epoch": 0.39032, "grad_norm": 3.738696336746216, "learning_rate": 3.049659863945578e-05, "loss": 0.7217, "mean_token_accuracy": 0.7846761465072631, "num_tokens": 506322718.0, "step": 48790 }, { "entropy": 0.622301971912384, "epoch": 0.3904, "grad_norm": 4.081179141998291, "learning_rate": 3.049259703881553e-05, "loss": 0.6169, "mean_token_accuracy": 0.8320433259010315, "num_tokens": 506359812.0, "step": 48800 }, { "entropy": 0.6572468221187592, "epoch": 0.39048, "grad_norm": 2.0685300827026367, "learning_rate": 3.048859543817527e-05, "loss": 0.6603, "mean_token_accuracy": 0.7906631648540496, "num_tokens": 506523652.0, "step": 48810 }, { "entropy": 0.6981642425060273, "epoch": 0.39056, "grad_norm": 3.4727139472961426, "learning_rate": 3.0484593837535015e-05, "loss": 0.6921, "mean_token_accuracy": 0.8015840411186218, "num_tokens": 506610823.0, "step": 48820 }, { "entropy": 0.6395432323217392, "epoch": 0.39064, "grad_norm": 1.8500471115112305, "learning_rate": 3.0480592236894762e-05, "loss": 0.6368, "mean_token_accuracy": 0.8124598622322082, "num_tokens": 506704769.0, "step": 48830 }, { "entropy": 0.6903632760047913, "epoch": 0.39072, "grad_norm": 2.3278777599334717, "learning_rate": 3.0476590636254506e-05, "loss": 0.688, "mean_token_accuracy": 0.793310672044754, "num_tokens": 506832607.0, "step": 48840 }, { "entropy": 0.7077936232089996, "epoch": 0.3908, "grad_norm": 4.7121758460998535, "learning_rate": 3.0472589035614246e-05, "loss": 0.7082, "mean_token_accuracy": 0.8163620054721832, "num_tokens": 506870269.0, "step": 48850 }, { "entropy": 0.677904748916626, "epoch": 0.39088, "grad_norm": 1.8280160427093506, "learning_rate": 3.046858743497399e-05, "loss": 0.6733, "mean_token_accuracy": 0.7927925705909729, "num_tokens": 507029611.0, "step": 48860 }, { "entropy": 0.6926416337490082, "epoch": 0.39096, "grad_norm": 4.848836898803711, "learning_rate": 3.0464585834333737e-05, "loss": 0.6934, "mean_token_accuracy": 0.8077634036540985, "num_tokens": 507091812.0, "step": 48870 }, { "entropy": 0.6837510526180267, "epoch": 0.39104, "grad_norm": 1.4846330881118774, "learning_rate": 3.046058423369348e-05, "loss": 0.6981, "mean_token_accuracy": 0.8032338738441467, "num_tokens": 507183248.0, "step": 48880 }, { "entropy": 0.7618474543094635, "epoch": 0.39112, "grad_norm": 4.2743635177612305, "learning_rate": 3.045658263305322e-05, "loss": 0.7391, "mean_token_accuracy": 0.7839210569858551, "num_tokens": 507315583.0, "step": 48890 }, { "entropy": 0.6222914636135102, "epoch": 0.3912, "grad_norm": 5.055088996887207, "learning_rate": 3.0452581032412968e-05, "loss": 0.6222, "mean_token_accuracy": 0.8290484786033631, "num_tokens": 507352488.0, "step": 48900 }, { "entropy": 0.6406458020210266, "epoch": 0.39128, "grad_norm": 2.678246021270752, "learning_rate": 3.044857943177271e-05, "loss": 0.6412, "mean_token_accuracy": 0.7955666840076446, "num_tokens": 507516328.0, "step": 48910 }, { "entropy": 0.670680433511734, "epoch": 0.39136, "grad_norm": 2.9398653507232666, "learning_rate": 3.0444577831132455e-05, "loss": 0.6649, "mean_token_accuracy": 0.813696700334549, "num_tokens": 507600441.0, "step": 48920 }, { "entropy": 0.6732663631439209, "epoch": 0.39144, "grad_norm": 1.989784598350525, "learning_rate": 3.0440576230492196e-05, "loss": 0.6607, "mean_token_accuracy": 0.8061537981033325, "num_tokens": 507694616.0, "step": 48930 }, { "entropy": 0.6546707153320312, "epoch": 0.39152, "grad_norm": 2.5610170364379883, "learning_rate": 3.0436574629851943e-05, "loss": 0.6484, "mean_token_accuracy": 0.7981297969818115, "num_tokens": 507841381.0, "step": 48940 }, { "entropy": 0.7154937446117401, "epoch": 0.3916, "grad_norm": 5.005180835723877, "learning_rate": 3.0432573029211686e-05, "loss": 0.7091, "mean_token_accuracy": 0.8092426240444184, "num_tokens": 507880157.0, "step": 48950 }, { "entropy": 0.663327944278717, "epoch": 0.39168, "grad_norm": 2.890275478363037, "learning_rate": 3.042857142857143e-05, "loss": 0.6587, "mean_token_accuracy": 0.7895661294460297, "num_tokens": 508041758.0, "step": 48960 }, { "entropy": 0.6879584968090058, "epoch": 0.39176, "grad_norm": 2.9265146255493164, "learning_rate": 3.0424569827931177e-05, "loss": 0.6936, "mean_token_accuracy": 0.8041995346546174, "num_tokens": 508121708.0, "step": 48970 }, { "entropy": 0.6835443139076233, "epoch": 0.39184, "grad_norm": 2.8153553009033203, "learning_rate": 3.0420568227290917e-05, "loss": 0.688, "mean_token_accuracy": 0.8022885859012604, "num_tokens": 508215772.0, "step": 48980 }, { "entropy": 0.7434072017669677, "epoch": 0.39192, "grad_norm": 2.1753504276275635, "learning_rate": 3.041656662665066e-05, "loss": 0.7382, "mean_token_accuracy": 0.771957916021347, "num_tokens": 508362286.0, "step": 48990 }, { "entropy": 0.645671808719635, "epoch": 0.392, "grad_norm": 4.679886817932129, "learning_rate": 3.0412565026010405e-05, "loss": 0.6385, "mean_token_accuracy": 0.8249021649360657, "num_tokens": 508404746.0, "step": 49000 }, { "entropy": 0.6638190746307373, "epoch": 0.39208, "grad_norm": 1.8680148124694824, "learning_rate": 3.0408563425370152e-05, "loss": 0.6649, "mean_token_accuracy": 0.7905517637729644, "num_tokens": 508568472.0, "step": 49010 }, { "entropy": 0.6556458801031113, "epoch": 0.39216, "grad_norm": 2.653085708618164, "learning_rate": 3.0404561824729892e-05, "loss": 0.6515, "mean_token_accuracy": 0.8136021375656128, "num_tokens": 508653317.0, "step": 49020 }, { "entropy": 0.7084966480731965, "epoch": 0.39224, "grad_norm": 1.4755539894104004, "learning_rate": 3.0400560224089636e-05, "loss": 0.7209, "mean_token_accuracy": 0.798294472694397, "num_tokens": 508747553.0, "step": 49030 }, { "entropy": 0.63611119389534, "epoch": 0.39232, "grad_norm": 3.4295175075531006, "learning_rate": 3.0396558623449383e-05, "loss": 0.6252, "mean_token_accuracy": 0.8054362118244172, "num_tokens": 508881222.0, "step": 49040 }, { "entropy": 0.6363982737064362, "epoch": 0.3924, "grad_norm": 5.170084476470947, "learning_rate": 3.0392557022809127e-05, "loss": 0.6507, "mean_token_accuracy": 0.8240203320980072, "num_tokens": 508921416.0, "step": 49050 }, { "entropy": 0.6702770411968231, "epoch": 0.39248, "grad_norm": 1.7863281965255737, "learning_rate": 3.0388555422168867e-05, "loss": 0.6736, "mean_token_accuracy": 0.7873351335525512, "num_tokens": 509085256.0, "step": 49060 }, { "entropy": 0.6307376861572266, "epoch": 0.39256, "grad_norm": 3.565380573272705, "learning_rate": 3.038455382152861e-05, "loss": 0.6134, "mean_token_accuracy": 0.8188802003860474, "num_tokens": 509173780.0, "step": 49070 }, { "entropy": 0.6467722475528717, "epoch": 0.39264, "grad_norm": 1.5637034177780151, "learning_rate": 3.0380552220888358e-05, "loss": 0.6458, "mean_token_accuracy": 0.8129550755023957, "num_tokens": 509266163.0, "step": 49080 }, { "entropy": 0.6594864189624786, "epoch": 0.39272, "grad_norm": 3.828199625015259, "learning_rate": 3.0376550620248102e-05, "loss": 0.6529, "mean_token_accuracy": 0.8015903830528259, "num_tokens": 509399274.0, "step": 49090 }, { "entropy": 0.6865474641323089, "epoch": 0.3928, "grad_norm": 3.9445533752441406, "learning_rate": 3.0372549019607842e-05, "loss": 0.6738, "mean_token_accuracy": 0.8223903357982636, "num_tokens": 509436118.0, "step": 49100 }, { "entropy": 0.6083915770053864, "epoch": 0.39288, "grad_norm": 2.599829912185669, "learning_rate": 3.0368547418967593e-05, "loss": 0.612, "mean_token_accuracy": 0.8036272704601288, "num_tokens": 509599958.0, "step": 49110 }, { "entropy": 0.694167646765709, "epoch": 0.39296, "grad_norm": 3.0006983280181885, "learning_rate": 3.0364545818327333e-05, "loss": 0.6874, "mean_token_accuracy": 0.804090929031372, "num_tokens": 509685772.0, "step": 49120 }, { "entropy": 0.7042025804519654, "epoch": 0.39304, "grad_norm": 1.710440993309021, "learning_rate": 3.0360544217687077e-05, "loss": 0.7005, "mean_token_accuracy": 0.7992679595947265, "num_tokens": 509778389.0, "step": 49130 }, { "entropy": 0.7299627184867858, "epoch": 0.39312, "grad_norm": 3.1314315795898438, "learning_rate": 3.0356542617046817e-05, "loss": 0.7221, "mean_token_accuracy": 0.7826919853687286, "num_tokens": 509919967.0, "step": 49140 }, { "entropy": 0.6178540170192719, "epoch": 0.3932, "grad_norm": 5.434418201446533, "learning_rate": 3.0352541016406567e-05, "loss": 0.6247, "mean_token_accuracy": 0.8253859758377076, "num_tokens": 509963736.0, "step": 49150 }, { "entropy": 0.6193260014057159, "epoch": 0.39328, "grad_norm": 1.3901337385177612, "learning_rate": 3.0348539415766308e-05, "loss": 0.6185, "mean_token_accuracy": 0.7996152997016907, "num_tokens": 510127576.0, "step": 49160 }, { "entropy": 0.7239851593971253, "epoch": 0.39336, "grad_norm": 4.360141277313232, "learning_rate": 3.034453781512605e-05, "loss": 0.7165, "mean_token_accuracy": 0.7933282732963562, "num_tokens": 510221489.0, "step": 49170 }, { "entropy": 0.7351317822933197, "epoch": 0.39344, "grad_norm": 2.1278295516967773, "learning_rate": 3.03405362144858e-05, "loss": 0.7333, "mean_token_accuracy": 0.7919486224651336, "num_tokens": 510315055.0, "step": 49180 }, { "entropy": 0.6585614442825317, "epoch": 0.39352, "grad_norm": 2.327054738998413, "learning_rate": 3.0336534613845542e-05, "loss": 0.6538, "mean_token_accuracy": 0.8008577167987824, "num_tokens": 510442310.0, "step": 49190 }, { "entropy": 0.6071186423301697, "epoch": 0.3936, "grad_norm": 4.697583198547363, "learning_rate": 3.0332533013205283e-05, "loss": 0.5877, "mean_token_accuracy": 0.8373219728469848, "num_tokens": 510478627.0, "step": 49200 }, { "entropy": 0.6787690281867981, "epoch": 0.39368, "grad_norm": 1.5331292152404785, "learning_rate": 3.0328531412565026e-05, "loss": 0.6821, "mean_token_accuracy": 0.788574755191803, "num_tokens": 510642467.0, "step": 49210 }, { "entropy": 0.7872970938682556, "epoch": 0.39376, "grad_norm": 3.154099464416504, "learning_rate": 3.0324529811924773e-05, "loss": 0.777, "mean_token_accuracy": 0.7817018270492554, "num_tokens": 510733764.0, "step": 49220 }, { "entropy": 0.6797012567520142, "epoch": 0.39384, "grad_norm": 1.95185124874115, "learning_rate": 3.0320528211284517e-05, "loss": 0.6755, "mean_token_accuracy": 0.8085356175899505, "num_tokens": 510829222.0, "step": 49230 }, { "entropy": 0.686888599395752, "epoch": 0.39392, "grad_norm": 2.23079252243042, "learning_rate": 3.0316526610644257e-05, "loss": 0.6913, "mean_token_accuracy": 0.78815056681633, "num_tokens": 510963330.0, "step": 49240 }, { "entropy": 0.5713643878698349, "epoch": 0.394, "grad_norm": 5.236164569854736, "learning_rate": 3.0312525010004e-05, "loss": 0.5801, "mean_token_accuracy": 0.8425271809101105, "num_tokens": 510997919.0, "step": 49250 }, { "entropy": 0.7105627298355103, "epoch": 0.39408, "grad_norm": 2.268068790435791, "learning_rate": 3.0308523409363748e-05, "loss": 0.7046, "mean_token_accuracy": 0.7835771024227143, "num_tokens": 511157665.0, "step": 49260 }, { "entropy": 0.6814302146434784, "epoch": 0.39416, "grad_norm": 3.6493918895721436, "learning_rate": 3.0304521808723492e-05, "loss": 0.6681, "mean_token_accuracy": 0.8120042145252228, "num_tokens": 511223795.0, "step": 49270 }, { "entropy": 0.6351679623126983, "epoch": 0.39424, "grad_norm": 1.7039594650268555, "learning_rate": 3.0300520208083232e-05, "loss": 0.6385, "mean_token_accuracy": 0.811228770017624, "num_tokens": 511316954.0, "step": 49280 }, { "entropy": 0.7140373051166534, "epoch": 0.39432, "grad_norm": 2.3185997009277344, "learning_rate": 3.029651860744298e-05, "loss": 0.7199, "mean_token_accuracy": 0.7823196947574615, "num_tokens": 511460732.0, "step": 49290 }, { "entropy": 0.7243732690811158, "epoch": 0.3944, "grad_norm": 5.00027322769165, "learning_rate": 3.0292517006802723e-05, "loss": 0.7071, "mean_token_accuracy": 0.8126551926136016, "num_tokens": 511502173.0, "step": 49300 }, { "entropy": 0.6894388496875763, "epoch": 0.39448, "grad_norm": 1.6288305521011353, "learning_rate": 3.0288515406162467e-05, "loss": 0.687, "mean_token_accuracy": 0.7875045895576477, "num_tokens": 511664679.0, "step": 49310 }, { "entropy": 0.673168420791626, "epoch": 0.39456, "grad_norm": 3.935500144958496, "learning_rate": 3.0284513805522207e-05, "loss": 0.6576, "mean_token_accuracy": 0.8121300399303436, "num_tokens": 511744725.0, "step": 49320 }, { "entropy": 0.7170416295528412, "epoch": 0.39464, "grad_norm": 1.9944148063659668, "learning_rate": 3.0280512204881954e-05, "loss": 0.7243, "mean_token_accuracy": 0.7932028293609619, "num_tokens": 511837744.0, "step": 49330 }, { "entropy": 0.6547003805637359, "epoch": 0.39472, "grad_norm": 2.1405858993530273, "learning_rate": 3.0276510604241698e-05, "loss": 0.6536, "mean_token_accuracy": 0.7991746246814728, "num_tokens": 511981486.0, "step": 49340 }, { "entropy": 0.6758216559886933, "epoch": 0.3948, "grad_norm": 4.4754228591918945, "learning_rate": 3.027250900360144e-05, "loss": 0.6703, "mean_token_accuracy": 0.815138828754425, "num_tokens": 512024199.0, "step": 49350 }, { "entropy": 0.6373512327671051, "epoch": 0.39488, "grad_norm": 2.073171615600586, "learning_rate": 3.026850740296119e-05, "loss": 0.638, "mean_token_accuracy": 0.7962441563606262, "num_tokens": 512187993.0, "step": 49360 }, { "entropy": 0.6704169988632203, "epoch": 0.39496, "grad_norm": 3.070732831954956, "learning_rate": 3.026450580232093e-05, "loss": 0.6624, "mean_token_accuracy": 0.8087790906429291, "num_tokens": 512274736.0, "step": 49370 }, { "entropy": 0.7089188277721405, "epoch": 0.39504, "grad_norm": 1.8970240354537964, "learning_rate": 3.0260504201680673e-05, "loss": 0.7083, "mean_token_accuracy": 0.7936861336231231, "num_tokens": 512369859.0, "step": 49380 }, { "entropy": 0.6887352883815765, "epoch": 0.39512, "grad_norm": 2.820201873779297, "learning_rate": 3.0256502601040417e-05, "loss": 0.685, "mean_token_accuracy": 0.7912180185317993, "num_tokens": 512505285.0, "step": 49390 }, { "entropy": 0.6367606222629547, "epoch": 0.3952, "grad_norm": 5.127089500427246, "learning_rate": 3.0252501000400164e-05, "loss": 0.6395, "mean_token_accuracy": 0.8261517643928528, "num_tokens": 512542400.0, "step": 49400 }, { "entropy": 0.640942120552063, "epoch": 0.39528, "grad_norm": 3.026026964187622, "learning_rate": 3.0248499399759904e-05, "loss": 0.6412, "mean_token_accuracy": 0.7980331361293793, "num_tokens": 512705525.0, "step": 49410 }, { "entropy": 0.6760110139846802, "epoch": 0.39536, "grad_norm": 5.705617904663086, "learning_rate": 3.0244497799119648e-05, "loss": 0.6768, "mean_token_accuracy": 0.805307924747467, "num_tokens": 512788532.0, "step": 49420 }, { "entropy": 0.6189146399497986, "epoch": 0.39544, "grad_norm": 1.545206069946289, "learning_rate": 3.0240496198479395e-05, "loss": 0.6096, "mean_token_accuracy": 0.8219277262687683, "num_tokens": 512881913.0, "step": 49430 }, { "entropy": 0.6778177976608276, "epoch": 0.39552, "grad_norm": 3.709939479827881, "learning_rate": 3.023649459783914e-05, "loss": 0.6783, "mean_token_accuracy": 0.7963400602340698, "num_tokens": 513020357.0, "step": 49440 }, { "entropy": 0.7189423233270645, "epoch": 0.3956, "grad_norm": 5.332411289215088, "learning_rate": 3.023249299719888e-05, "loss": 0.7109, "mean_token_accuracy": 0.8108816623687745, "num_tokens": 513051913.0, "step": 49450 }, { "entropy": 0.6492113590240478, "epoch": 0.39568, "grad_norm": 1.7569689750671387, "learning_rate": 3.0228491396558623e-05, "loss": 0.6451, "mean_token_accuracy": 0.7931220829486847, "num_tokens": 513214277.0, "step": 49460 }, { "entropy": 0.7438623785972596, "epoch": 0.39576, "grad_norm": 4.056646823883057, "learning_rate": 3.022448979591837e-05, "loss": 0.7367, "mean_token_accuracy": 0.7953855812549591, "num_tokens": 513281534.0, "step": 49470 }, { "entropy": 0.6853878974914551, "epoch": 0.39584, "grad_norm": 1.4523563385009766, "learning_rate": 3.0220488195278113e-05, "loss": 0.6807, "mean_token_accuracy": 0.8042033910751343, "num_tokens": 513373655.0, "step": 49480 }, { "entropy": 0.6428495019674301, "epoch": 0.39592, "grad_norm": 1.895399570465088, "learning_rate": 3.0216486594637854e-05, "loss": 0.647, "mean_token_accuracy": 0.7982647001743317, "num_tokens": 513506636.0, "step": 49490 }, { "entropy": 0.6780396223068237, "epoch": 0.396, "grad_norm": 4.602011203765869, "learning_rate": 3.0212484993997604e-05, "loss": 0.677, "mean_token_accuracy": 0.8194813787937164, "num_tokens": 513541588.0, "step": 49500 }, { "entropy": 0.616208815574646, "epoch": 0.39608, "grad_norm": 1.7943531274795532, "learning_rate": 3.0208483393357344e-05, "loss": 0.606, "mean_token_accuracy": 0.8052637994289398, "num_tokens": 513705428.0, "step": 49510 }, { "entropy": 0.7435601055622101, "epoch": 0.39616, "grad_norm": 3.9647791385650635, "learning_rate": 3.0204481792717088e-05, "loss": 0.7334, "mean_token_accuracy": 0.7946052610874176, "num_tokens": 513795199.0, "step": 49520 }, { "entropy": 0.6214833378791809, "epoch": 0.39624, "grad_norm": 2.1320738792419434, "learning_rate": 3.020048019207683e-05, "loss": 0.6289, "mean_token_accuracy": 0.816115003824234, "num_tokens": 513888270.0, "step": 49530 }, { "entropy": 0.6769629418849945, "epoch": 0.39632, "grad_norm": 2.542178153991699, "learning_rate": 3.019647859143658e-05, "loss": 0.6694, "mean_token_accuracy": 0.794828599691391, "num_tokens": 514025094.0, "step": 49540 }, { "entropy": 0.6184698224067688, "epoch": 0.3964, "grad_norm": 4.461172103881836, "learning_rate": 3.019247699079632e-05, "loss": 0.6227, "mean_token_accuracy": 0.8287138283252716, "num_tokens": 514063839.0, "step": 49550 }, { "entropy": 0.6603024721145629, "epoch": 0.39648, "grad_norm": 2.2655396461486816, "learning_rate": 3.0188475390156063e-05, "loss": 0.6568, "mean_token_accuracy": 0.791548615694046, "num_tokens": 514227679.0, "step": 49560 }, { "entropy": 0.7343362957239151, "epoch": 0.39656, "grad_norm": 3.514312744140625, "learning_rate": 3.018447378951581e-05, "loss": 0.7289, "mean_token_accuracy": 0.7999137163162231, "num_tokens": 514304005.0, "step": 49570 }, { "entropy": 0.6831176280975342, "epoch": 0.39664, "grad_norm": 2.0755951404571533, "learning_rate": 3.0180472188875554e-05, "loss": 0.69, "mean_token_accuracy": 0.8068555414676666, "num_tokens": 514396496.0, "step": 49580 }, { "entropy": 0.7472252070903778, "epoch": 0.39672, "grad_norm": 2.809180498123169, "learning_rate": 3.0176470588235294e-05, "loss": 0.7395, "mean_token_accuracy": 0.7810597360134125, "num_tokens": 514543065.0, "step": 49590 }, { "entropy": 0.6421416312456131, "epoch": 0.3968, "grad_norm": 5.030439853668213, "learning_rate": 3.0172468987595038e-05, "loss": 0.6433, "mean_token_accuracy": 0.8243786990642548, "num_tokens": 514583408.0, "step": 49600 }, { "entropy": 0.6405441224575043, "epoch": 0.39688, "grad_norm": 2.4231326580047607, "learning_rate": 3.0168467386954785e-05, "loss": 0.636, "mean_token_accuracy": 0.8003908216953277, "num_tokens": 514747248.0, "step": 49610 }, { "entropy": 0.6099374115467071, "epoch": 0.39696, "grad_norm": 3.3333377838134766, "learning_rate": 3.016446578631453e-05, "loss": 0.6117, "mean_token_accuracy": 0.8169696092605591, "num_tokens": 514853093.0, "step": 49620 }, { "entropy": 0.6920305132865906, "epoch": 0.39704, "grad_norm": 1.7916295528411865, "learning_rate": 3.016046418567427e-05, "loss": 0.6896, "mean_token_accuracy": 0.803664380311966, "num_tokens": 514950009.0, "step": 49630 }, { "entropy": 0.6774322092533112, "epoch": 0.39712, "grad_norm": 2.5025289058685303, "learning_rate": 3.0156462585034016e-05, "loss": 0.6711, "mean_token_accuracy": 0.7939774513244628, "num_tokens": 515090551.0, "step": 49640 }, { "entropy": 0.6143556416034699, "epoch": 0.3972, "grad_norm": 4.140944480895996, "learning_rate": 3.015246098439376e-05, "loss": 0.6112, "mean_token_accuracy": 0.8261198401451111, "num_tokens": 515132138.0, "step": 49650 }, { "entropy": 0.679127985239029, "epoch": 0.39728, "grad_norm": 1.56034517288208, "learning_rate": 3.0148459383753504e-05, "loss": 0.6748, "mean_token_accuracy": 0.7919088900089264, "num_tokens": 515295978.0, "step": 49660 }, { "entropy": 0.7190076351165772, "epoch": 0.39736, "grad_norm": 3.3380250930786133, "learning_rate": 3.0144457783113244e-05, "loss": 0.7062, "mean_token_accuracy": 0.7960243105888367, "num_tokens": 515389725.0, "step": 49670 }, { "entropy": 0.6373743057250977, "epoch": 0.39744, "grad_norm": 2.4327709674835205, "learning_rate": 3.014045618247299e-05, "loss": 0.6348, "mean_token_accuracy": 0.8171901226043701, "num_tokens": 515482305.0, "step": 49680 }, { "entropy": 0.6535311639308929, "epoch": 0.39752, "grad_norm": 2.1493782997131348, "learning_rate": 3.0136454581832735e-05, "loss": 0.6475, "mean_token_accuracy": 0.8029964804649353, "num_tokens": 515618564.0, "step": 49690 }, { "entropy": 0.6927102565765381, "epoch": 0.3976, "grad_norm": 4.9516401290893555, "learning_rate": 3.013245298119248e-05, "loss": 0.6922, "mean_token_accuracy": 0.8167658746242523, "num_tokens": 515653913.0, "step": 49700 }, { "entropy": 0.6825534164905548, "epoch": 0.39768, "grad_norm": 2.1781272888183594, "learning_rate": 3.0128451380552226e-05, "loss": 0.686, "mean_token_accuracy": 0.7888109862804413, "num_tokens": 515817361.0, "step": 49710 }, { "entropy": 0.6929363995790482, "epoch": 0.39776, "grad_norm": 3.0939226150512695, "learning_rate": 3.0124449779911966e-05, "loss": 0.6788, "mean_token_accuracy": 0.8025253355503082, "num_tokens": 515899064.0, "step": 49720 }, { "entropy": 0.7230187773704528, "epoch": 0.39784, "grad_norm": 1.576042652130127, "learning_rate": 3.012044817927171e-05, "loss": 0.7177, "mean_token_accuracy": 0.7948563814163208, "num_tokens": 515992664.0, "step": 49730 }, { "entropy": 0.7084539473056793, "epoch": 0.39792, "grad_norm": 3.3064355850219727, "learning_rate": 3.0116446578631453e-05, "loss": 0.6979, "mean_token_accuracy": 0.7865897119045258, "num_tokens": 516138682.0, "step": 49740 }, { "entropy": 0.684683209657669, "epoch": 0.398, "grad_norm": 4.9392266273498535, "learning_rate": 3.01124449779912e-05, "loss": 0.6763, "mean_token_accuracy": 0.8136269211769104, "num_tokens": 516183578.0, "step": 49750 }, { "entropy": 0.626330703496933, "epoch": 0.39808, "grad_norm": 1.5883899927139282, "learning_rate": 3.010844337735094e-05, "loss": 0.6304, "mean_token_accuracy": 0.8009159743785859, "num_tokens": 516347418.0, "step": 49760 }, { "entropy": 0.6687073945999146, "epoch": 0.39816, "grad_norm": 3.0746448040008545, "learning_rate": 3.0104441776710684e-05, "loss": 0.6684, "mean_token_accuracy": 0.8044385850429535, "num_tokens": 516445129.0, "step": 49770 }, { "entropy": 0.6714947283267975, "epoch": 0.39824, "grad_norm": 2.1512610912323, "learning_rate": 3.0100440176070428e-05, "loss": 0.6601, "mean_token_accuracy": 0.8083550095558166, "num_tokens": 516540536.0, "step": 49780 }, { "entropy": 0.5858157336711883, "epoch": 0.39832, "grad_norm": 2.5719006061553955, "learning_rate": 3.0096438575430175e-05, "loss": 0.5786, "mean_token_accuracy": 0.8173336505889892, "num_tokens": 516677694.0, "step": 49790 }, { "entropy": 0.7027340114116669, "epoch": 0.3984, "grad_norm": 7.573315620422363, "learning_rate": 3.0092436974789916e-05, "loss": 0.7005, "mean_token_accuracy": 0.8160129845142364, "num_tokens": 516711475.0, "step": 49800 }, { "entropy": 0.6693604767322541, "epoch": 0.39848, "grad_norm": 1.4207916259765625, "learning_rate": 3.008843537414966e-05, "loss": 0.6638, "mean_token_accuracy": 0.7922569692134858, "num_tokens": 516875315.0, "step": 49810 }, { "entropy": 0.6565242081880569, "epoch": 0.39856, "grad_norm": 3.1290409564971924, "learning_rate": 3.0084433773509406e-05, "loss": 0.6527, "mean_token_accuracy": 0.8111157298088074, "num_tokens": 516971329.0, "step": 49820 }, { "entropy": 0.7385498940944671, "epoch": 0.39864, "grad_norm": 2.5427191257476807, "learning_rate": 3.008043217286915e-05, "loss": 0.7408, "mean_token_accuracy": 0.7916995167732239, "num_tokens": 517066972.0, "step": 49830 }, { "entropy": 0.7114845395088196, "epoch": 0.39872, "grad_norm": 3.344881534576416, "learning_rate": 3.007643057222889e-05, "loss": 0.7126, "mean_token_accuracy": 0.7878223180770874, "num_tokens": 517205544.0, "step": 49840 }, { "entropy": 0.7068558663129807, "epoch": 0.3988, "grad_norm": 3.668231725692749, "learning_rate": 3.0072428971588634e-05, "loss": 0.698, "mean_token_accuracy": 0.8113301277160645, "num_tokens": 517247240.0, "step": 49850 }, { "entropy": 0.6647745370864868, "epoch": 0.39888, "grad_norm": 1.5446232557296753, "learning_rate": 3.006842737094838e-05, "loss": 0.6627, "mean_token_accuracy": 0.7909807801246643, "num_tokens": 517410396.0, "step": 49860 }, { "entropy": 0.6415772497653961, "epoch": 0.39896, "grad_norm": 3.051448345184326, "learning_rate": 3.0064425770308125e-05, "loss": 0.6221, "mean_token_accuracy": 0.8213976085186004, "num_tokens": 517484515.0, "step": 49870 }, { "entropy": 0.6856166899204255, "epoch": 0.39904, "grad_norm": 2.6904475688934326, "learning_rate": 3.0060424169667865e-05, "loss": 0.6942, "mean_token_accuracy": 0.796797001361847, "num_tokens": 517577682.0, "step": 49880 }, { "entropy": 0.735369673371315, "epoch": 0.39912, "grad_norm": 1.8613076210021973, "learning_rate": 3.0056422569027616e-05, "loss": 0.7303, "mean_token_accuracy": 0.7827201426029206, "num_tokens": 517710198.0, "step": 49890 }, { "entropy": 0.6882302314043045, "epoch": 0.3992, "grad_norm": 4.976855278015137, "learning_rate": 3.0052420968387356e-05, "loss": 0.6784, "mean_token_accuracy": 0.8158285617828369, "num_tokens": 517746956.0, "step": 49900 }, { "entropy": 0.6181832671165466, "epoch": 0.39928, "grad_norm": 3.193019390106201, "learning_rate": 3.00484193677471e-05, "loss": 0.6216, "mean_token_accuracy": 0.8014594554901123, "num_tokens": 517910796.0, "step": 49910 }, { "entropy": 0.6544101387262344, "epoch": 0.39936, "grad_norm": 3.251941442489624, "learning_rate": 3.004441776710684e-05, "loss": 0.6458, "mean_token_accuracy": 0.8114955425262451, "num_tokens": 518005355.0, "step": 49920 }, { "entropy": 0.7171759068965912, "epoch": 0.39944, "grad_norm": 2.770240068435669, "learning_rate": 3.004041616646659e-05, "loss": 0.7084, "mean_token_accuracy": 0.796361094713211, "num_tokens": 518100815.0, "step": 49930 }, { "entropy": 0.6712304592132569, "epoch": 0.39952, "grad_norm": 3.002027988433838, "learning_rate": 3.003641456582633e-05, "loss": 0.6706, "mean_token_accuracy": 0.7919191002845765, "num_tokens": 518242475.0, "step": 49940 }, { "entropy": 0.6349035114049911, "epoch": 0.3996, "grad_norm": 4.277710437774658, "learning_rate": 3.0032412965186075e-05, "loss": 0.624, "mean_token_accuracy": 0.8328610301017761, "num_tokens": 518280515.0, "step": 49950 }, { "entropy": 0.6860797941684723, "epoch": 0.39968, "grad_norm": 1.9553481340408325, "learning_rate": 3.0028411364545822e-05, "loss": 0.693, "mean_token_accuracy": 0.7843094646930695, "num_tokens": 518440735.0, "step": 49960 }, { "entropy": 0.6339080661535264, "epoch": 0.39976, "grad_norm": 2.7413651943206787, "learning_rate": 3.0024409763905565e-05, "loss": 0.6194, "mean_token_accuracy": 0.8202547371387482, "num_tokens": 518510697.0, "step": 49970 }, { "entropy": 0.7218900382518768, "epoch": 0.39984, "grad_norm": 1.966145396232605, "learning_rate": 3.0020408163265306e-05, "loss": 0.7161, "mean_token_accuracy": 0.7926383376121521, "num_tokens": 518605023.0, "step": 49980 }, { "entropy": 0.6951115220785141, "epoch": 0.39992, "grad_norm": 2.0819272994995117, "learning_rate": 3.001640656262505e-05, "loss": 0.6924, "mean_token_accuracy": 0.793353408575058, "num_tokens": 518732422.0, "step": 49990 }, { "entropy": 0.684211328625679, "epoch": 0.4, "grad_norm": 4.078556060791016, "learning_rate": 3.0012404961984797e-05, "loss": 0.6886, "mean_token_accuracy": 0.8171404838562012, "num_tokens": 518768543.0, "step": 50000 }, { "entropy": 0.6404822945594788, "epoch": 0.40008, "grad_norm": 2.490393877029419, "learning_rate": 3.000840336134454e-05, "loss": 0.6353, "mean_token_accuracy": 0.7971910178661347, "num_tokens": 518932383.0, "step": 50010 }, { "entropy": 0.7091218948364257, "epoch": 0.40016, "grad_norm": 2.8360068798065186, "learning_rate": 3.000440176070428e-05, "loss": 0.6991, "mean_token_accuracy": 0.8023778557777405, "num_tokens": 519031822.0, "step": 50020 }, { "entropy": 0.6663925170898437, "epoch": 0.40024, "grad_norm": 2.5256922245025635, "learning_rate": 3.0000400160064028e-05, "loss": 0.6674, "mean_token_accuracy": 0.8068336725234986, "num_tokens": 519127936.0, "step": 50030 }, { "entropy": 0.7092965126037598, "epoch": 0.40032, "grad_norm": 2.440389633178711, "learning_rate": 2.999639855942377e-05, "loss": 0.7027, "mean_token_accuracy": 0.787737512588501, "num_tokens": 519261441.0, "step": 50040 }, { "entropy": 0.6505443334579468, "epoch": 0.4004, "grad_norm": 4.261160850524902, "learning_rate": 2.9992396958783515e-05, "loss": 0.6413, "mean_token_accuracy": 0.8286980986595154, "num_tokens": 519299698.0, "step": 50050 }, { "entropy": 0.6849947988986969, "epoch": 0.40048, "grad_norm": 1.4134825468063354, "learning_rate": 2.9988395358143255e-05, "loss": 0.6841, "mean_token_accuracy": 0.7869504213333129, "num_tokens": 519463538.0, "step": 50060 }, { "entropy": 0.6811006605625153, "epoch": 0.40056, "grad_norm": 4.529680252075195, "learning_rate": 2.9984393757503003e-05, "loss": 0.673, "mean_token_accuracy": 0.8038317143917084, "num_tokens": 519565263.0, "step": 50070 }, { "entropy": 0.6610076487064361, "epoch": 0.40064, "grad_norm": 2.400975465774536, "learning_rate": 2.9980392156862746e-05, "loss": 0.6476, "mean_token_accuracy": 0.8137752115726471, "num_tokens": 519660870.0, "step": 50080 }, { "entropy": 0.7444034278392792, "epoch": 0.40072, "grad_norm": 3.804774522781372, "learning_rate": 2.997639055622249e-05, "loss": 0.7439, "mean_token_accuracy": 0.7808337986469269, "num_tokens": 519791776.0, "step": 50090 }, { "entropy": 0.6138006657361984, "epoch": 0.4008, "grad_norm": 5.895019054412842, "learning_rate": 2.9972388955582237e-05, "loss": 0.6083, "mean_token_accuracy": 0.8290708661079407, "num_tokens": 519832583.0, "step": 50100 }, { "entropy": 0.6907003045082092, "epoch": 0.40088, "grad_norm": 1.6111669540405273, "learning_rate": 2.9968387354941977e-05, "loss": 0.6847, "mean_token_accuracy": 0.785051304101944, "num_tokens": 519996423.0, "step": 50110 }, { "entropy": 0.6502778589725494, "epoch": 0.40096, "grad_norm": 2.8445658683776855, "learning_rate": 2.996438575430172e-05, "loss": 0.6373, "mean_token_accuracy": 0.810995590686798, "num_tokens": 520096388.0, "step": 50120 }, { "entropy": 0.7026365220546722, "epoch": 0.40104, "grad_norm": 1.6491310596466064, "learning_rate": 2.9960384153661465e-05, "loss": 0.702, "mean_token_accuracy": 0.8011414885520936, "num_tokens": 520190574.0, "step": 50130 }, { "entropy": 0.6642014741897583, "epoch": 0.40112, "grad_norm": 2.399272918701172, "learning_rate": 2.9956382553021212e-05, "loss": 0.6654, "mean_token_accuracy": 0.7935007154941559, "num_tokens": 520337866.0, "step": 50140 }, { "entropy": 0.7138448566198349, "epoch": 0.4012, "grad_norm": 5.380245208740234, "learning_rate": 2.9952380952380952e-05, "loss": 0.7099, "mean_token_accuracy": 0.8111225962638855, "num_tokens": 520377755.0, "step": 50150 }, { "entropy": 0.6133090913295746, "epoch": 0.40128, "grad_norm": 1.4547231197357178, "learning_rate": 2.9948379351740696e-05, "loss": 0.6063, "mean_token_accuracy": 0.8069003462791443, "num_tokens": 520541595.0, "step": 50160 }, { "entropy": 0.6637088000774384, "epoch": 0.40136, "grad_norm": 2.598992109298706, "learning_rate": 2.9944377751100443e-05, "loss": 0.6604, "mean_token_accuracy": 0.8046857655048371, "num_tokens": 520633705.0, "step": 50170 }, { "entropy": 0.7017431378364563, "epoch": 0.40144, "grad_norm": 1.940639615058899, "learning_rate": 2.9940376150460187e-05, "loss": 0.7122, "mean_token_accuracy": 0.8014005184173584, "num_tokens": 520728890.0, "step": 50180 }, { "entropy": 0.6545563161373138, "epoch": 0.40152, "grad_norm": 2.312546491622925, "learning_rate": 2.9936374549819927e-05, "loss": 0.6519, "mean_token_accuracy": 0.7967836439609528, "num_tokens": 520874114.0, "step": 50190 }, { "entropy": 0.6860456109046936, "epoch": 0.4016, "grad_norm": 4.457313060760498, "learning_rate": 2.993237294917967e-05, "loss": 0.6872, "mean_token_accuracy": 0.8160540401935578, "num_tokens": 520911369.0, "step": 50200 }, { "entropy": 0.6524262070655823, "epoch": 0.40168, "grad_norm": 1.7495555877685547, "learning_rate": 2.9928371348539418e-05, "loss": 0.654, "mean_token_accuracy": 0.795563805103302, "num_tokens": 521073461.0, "step": 50210 }, { "entropy": 0.6812061607837677, "epoch": 0.40176, "grad_norm": 3.786257266998291, "learning_rate": 2.992436974789916e-05, "loss": 0.6683, "mean_token_accuracy": 0.8105785310268402, "num_tokens": 521153265.0, "step": 50220 }, { "entropy": 0.7224964022636413, "epoch": 0.40184, "grad_norm": 2.3143117427825928, "learning_rate": 2.9920368147258902e-05, "loss": 0.7244, "mean_token_accuracy": 0.796177339553833, "num_tokens": 521248151.0, "step": 50230 }, { "entropy": 0.6534191846847535, "epoch": 0.40192, "grad_norm": 3.0824134349823, "learning_rate": 2.9916366546618653e-05, "loss": 0.6421, "mean_token_accuracy": 0.8033808290958404, "num_tokens": 521375037.0, "step": 50240 }, { "entropy": 0.6185627907514573, "epoch": 0.402, "grad_norm": 4.29425048828125, "learning_rate": 2.9912364945978393e-05, "loss": 0.6192, "mean_token_accuracy": 0.8318287014961243, "num_tokens": 521410187.0, "step": 50250 }, { "entropy": 0.7258134841918945, "epoch": 0.40208, "grad_norm": 1.7170350551605225, "learning_rate": 2.9908363345338137e-05, "loss": 0.7243, "mean_token_accuracy": 0.7729252338409424, "num_tokens": 521573473.0, "step": 50260 }, { "entropy": 0.6482838332653046, "epoch": 0.40216, "grad_norm": 3.1086816787719727, "learning_rate": 2.9904361744697877e-05, "loss": 0.6352, "mean_token_accuracy": 0.8167142748832703, "num_tokens": 521644954.0, "step": 50270 }, { "entropy": 0.6319164991378784, "epoch": 0.40224, "grad_norm": 2.2841649055480957, "learning_rate": 2.9900360144057627e-05, "loss": 0.625, "mean_token_accuracy": 0.8155062377452851, "num_tokens": 521738812.0, "step": 50280 }, { "entropy": 0.7159552276134491, "epoch": 0.40232, "grad_norm": 1.9573845863342285, "learning_rate": 2.9896358543417368e-05, "loss": 0.7226, "mean_token_accuracy": 0.7843569636344909, "num_tokens": 521873483.0, "step": 50290 }, { "entropy": 0.7151774227619171, "epoch": 0.4024, "grad_norm": 3.8558998107910156, "learning_rate": 2.989235694277711e-05, "loss": 0.7024, "mean_token_accuracy": 0.8137183725833893, "num_tokens": 521909789.0, "step": 50300 }, { "entropy": 0.6639424085617065, "epoch": 0.40248, "grad_norm": 2.5071685314178467, "learning_rate": 2.9888355342136852e-05, "loss": 0.6564, "mean_token_accuracy": 0.7911516904830933, "num_tokens": 522073629.0, "step": 50310 }, { "entropy": 0.7334841161966323, "epoch": 0.40256, "grad_norm": 2.8741185665130615, "learning_rate": 2.9884353741496602e-05, "loss": 0.7265, "mean_token_accuracy": 0.7946219682693482, "num_tokens": 522162132.0, "step": 50320 }, { "entropy": 0.7131624460220337, "epoch": 0.40264, "grad_norm": 1.6631990671157837, "learning_rate": 2.9880352140856343e-05, "loss": 0.7065, "mean_token_accuracy": 0.8039771854877472, "num_tokens": 522255130.0, "step": 50330 }, { "entropy": 0.6814754426479339, "epoch": 0.40272, "grad_norm": 1.7322598695755005, "learning_rate": 2.9876350540216086e-05, "loss": 0.6768, "mean_token_accuracy": 0.7938241183757782, "num_tokens": 522400175.0, "step": 50340 }, { "entropy": 0.5861280351877213, "epoch": 0.4028, "grad_norm": 4.884880542755127, "learning_rate": 2.9872348939575833e-05, "loss": 0.5756, "mean_token_accuracy": 0.8393024265766144, "num_tokens": 522439287.0, "step": 50350 }, { "entropy": 0.6355552792549133, "epoch": 0.40288, "grad_norm": 2.353724956512451, "learning_rate": 2.9868347338935577e-05, "loss": 0.6337, "mean_token_accuracy": 0.7986158847808837, "num_tokens": 522602934.0, "step": 50360 }, { "entropy": 0.616444593667984, "epoch": 0.40296, "grad_norm": 3.312511444091797, "learning_rate": 2.9864345738295317e-05, "loss": 0.6153, "mean_token_accuracy": 0.8199515819549561, "num_tokens": 522677458.0, "step": 50370 }, { "entropy": 0.7167836368083954, "epoch": 0.40304, "grad_norm": 1.525252342224121, "learning_rate": 2.986034413765506e-05, "loss": 0.7151, "mean_token_accuracy": 0.7993304550647735, "num_tokens": 522770053.0, "step": 50380 }, { "entropy": 0.6373322606086731, "epoch": 0.40312, "grad_norm": 1.9631495475769043, "learning_rate": 2.9856342537014808e-05, "loss": 0.6331, "mean_token_accuracy": 0.7995127439498901, "num_tokens": 522915374.0, "step": 50390 }, { "entropy": 0.6965822368860245, "epoch": 0.4032, "grad_norm": 5.278838634490967, "learning_rate": 2.9852340936374552e-05, "loss": 0.6919, "mean_token_accuracy": 0.8177274882793426, "num_tokens": 522955972.0, "step": 50400 }, { "entropy": 0.6628970086574555, "epoch": 0.40328, "grad_norm": 1.4659525156021118, "learning_rate": 2.9848339335734292e-05, "loss": 0.6676, "mean_token_accuracy": 0.7909515738487244, "num_tokens": 523119293.0, "step": 50410 }, { "entropy": 0.6676518112421036, "epoch": 0.40336, "grad_norm": 4.51335334777832, "learning_rate": 2.9844337735094043e-05, "loss": 0.6536, "mean_token_accuracy": 0.8101001501083374, "num_tokens": 523198067.0, "step": 50420 }, { "entropy": 0.6479476511478424, "epoch": 0.40344, "grad_norm": 2.103853702545166, "learning_rate": 2.9840336134453783e-05, "loss": 0.6451, "mean_token_accuracy": 0.8131450235843658, "num_tokens": 523289809.0, "step": 50430 }, { "entropy": 0.6876732766628265, "epoch": 0.40352, "grad_norm": 2.1446378231048584, "learning_rate": 2.9836334533813527e-05, "loss": 0.6872, "mean_token_accuracy": 0.7947833955287933, "num_tokens": 523420420.0, "step": 50440 }, { "entropy": 0.7293965190649032, "epoch": 0.4036, "grad_norm": 5.665153980255127, "learning_rate": 2.9832332933173267e-05, "loss": 0.7441, "mean_token_accuracy": 0.8101040899753571, "num_tokens": 523453935.0, "step": 50450 }, { "entropy": 0.6970599949359894, "epoch": 0.40368, "grad_norm": 1.5313056707382202, "learning_rate": 2.9828331332533018e-05, "loss": 0.6909, "mean_token_accuracy": 0.7844345510005951, "num_tokens": 523617775.0, "step": 50460 }, { "entropy": 0.7019632816314697, "epoch": 0.40376, "grad_norm": 3.627279758453369, "learning_rate": 2.9824329731892758e-05, "loss": 0.7019, "mean_token_accuracy": 0.7977622807025909, "num_tokens": 523696087.0, "step": 50470 }, { "entropy": 0.7259955406188965, "epoch": 0.40384, "grad_norm": 1.3937686681747437, "learning_rate": 2.98203281312525e-05, "loss": 0.7374, "mean_token_accuracy": 0.7890217721462249, "num_tokens": 523789332.0, "step": 50480 }, { "entropy": 0.6577545583248139, "epoch": 0.40392, "grad_norm": 3.0997395515441895, "learning_rate": 2.981632653061225e-05, "loss": 0.6477, "mean_token_accuracy": 0.7992175340652465, "num_tokens": 523925061.0, "step": 50490 }, { "entropy": 0.6312888264656067, "epoch": 0.404, "grad_norm": 4.358757495880127, "learning_rate": 2.9812324929971992e-05, "loss": 0.61, "mean_token_accuracy": 0.831916868686676, "num_tokens": 523960872.0, "step": 50500 }, { "entropy": 0.6441643357276916, "epoch": 0.40408, "grad_norm": 2.1538383960723877, "learning_rate": 2.9808323329331733e-05, "loss": 0.6425, "mean_token_accuracy": 0.8000239431858063, "num_tokens": 524124494.0, "step": 50510 }, { "entropy": 0.6722118437290192, "epoch": 0.40416, "grad_norm": 3.0761771202087402, "learning_rate": 2.9804321728691476e-05, "loss": 0.678, "mean_token_accuracy": 0.8088852047920227, "num_tokens": 524200166.0, "step": 50520 }, { "entropy": 0.700489616394043, "epoch": 0.40424, "grad_norm": 2.872244119644165, "learning_rate": 2.9800320128051224e-05, "loss": 0.6934, "mean_token_accuracy": 0.8051131963729858, "num_tokens": 524292286.0, "step": 50530 }, { "entropy": 0.7169010698795318, "epoch": 0.40432, "grad_norm": 2.6917905807495117, "learning_rate": 2.9796318527410967e-05, "loss": 0.7106, "mean_token_accuracy": 0.7851924180984498, "num_tokens": 524436990.0, "step": 50540 }, { "entropy": 0.7385374665260315, "epoch": 0.4044, "grad_norm": 4.309754848480225, "learning_rate": 2.9792316926770708e-05, "loss": 0.7253, "mean_token_accuracy": 0.8044875681400299, "num_tokens": 524479876.0, "step": 50550 }, { "entropy": 0.6026701211929322, "epoch": 0.40448, "grad_norm": 2.361996650695801, "learning_rate": 2.9788315326130455e-05, "loss": 0.6044, "mean_token_accuracy": 0.8037005424499511, "num_tokens": 524643716.0, "step": 50560 }, { "entropy": 0.6605087101459504, "epoch": 0.40456, "grad_norm": 3.069337844848633, "learning_rate": 2.97843137254902e-05, "loss": 0.6474, "mean_token_accuracy": 0.8150285124778748, "num_tokens": 524736023.0, "step": 50570 }, { "entropy": 0.6863620281219482, "epoch": 0.40464, "grad_norm": 2.195253610610962, "learning_rate": 2.9780312124849942e-05, "loss": 0.6978, "mean_token_accuracy": 0.8005589425563813, "num_tokens": 524830635.0, "step": 50580 }, { "entropy": 0.7437217354774475, "epoch": 0.40472, "grad_norm": 1.9356262683868408, "learning_rate": 2.9776310524209682e-05, "loss": 0.7395, "mean_token_accuracy": 0.7803162097930908, "num_tokens": 524970606.0, "step": 50590 }, { "entropy": 0.7096214890480042, "epoch": 0.4048, "grad_norm": 4.680418491363525, "learning_rate": 2.977230892356943e-05, "loss": 0.7089, "mean_token_accuracy": 0.8091149926185608, "num_tokens": 525011909.0, "step": 50600 }, { "entropy": 0.633923077583313, "epoch": 0.40488, "grad_norm": 1.6082948446273804, "learning_rate": 2.9768307322929173e-05, "loss": 0.6362, "mean_token_accuracy": 0.7970790565013885, "num_tokens": 525175699.0, "step": 50610 }, { "entropy": 0.733033013343811, "epoch": 0.40496, "grad_norm": 3.2796616554260254, "learning_rate": 2.9764305722288917e-05, "loss": 0.7411, "mean_token_accuracy": 0.7942307531833649, "num_tokens": 525256777.0, "step": 50620 }, { "entropy": 0.7469334185123444, "epoch": 0.40504, "grad_norm": 1.4256733655929565, "learning_rate": 2.9760304121648664e-05, "loss": 0.7278, "mean_token_accuracy": 0.7901299238204956, "num_tokens": 525349837.0, "step": 50630 }, { "entropy": 0.6691813558340073, "epoch": 0.40512, "grad_norm": 2.9289560317993164, "learning_rate": 2.9756302521008404e-05, "loss": 0.6718, "mean_token_accuracy": 0.7927367627620697, "num_tokens": 525485861.0, "step": 50640 }, { "entropy": 0.6655180811882019, "epoch": 0.4052, "grad_norm": 5.487708568572998, "learning_rate": 2.9752300920368148e-05, "loss": 0.6583, "mean_token_accuracy": 0.8181239068508148, "num_tokens": 525525039.0, "step": 50650 }, { "entropy": 0.6287687599658967, "epoch": 0.40528, "grad_norm": 1.8917014598846436, "learning_rate": 2.9748299319727892e-05, "loss": 0.6328, "mean_token_accuracy": 0.7976551115512848, "num_tokens": 525688879.0, "step": 50660 }, { "entropy": 0.6620897084474564, "epoch": 0.40536, "grad_norm": 3.678250312805176, "learning_rate": 2.974429771908764e-05, "loss": 0.647, "mean_token_accuracy": 0.8131633520126342, "num_tokens": 525771641.0, "step": 50670 }, { "entropy": 0.6846060931682587, "epoch": 0.40544, "grad_norm": 1.6542203426361084, "learning_rate": 2.974029611844738e-05, "loss": 0.6857, "mean_token_accuracy": 0.8032978236675262, "num_tokens": 525864938.0, "step": 50680 }, { "entropy": 0.6555369824171067, "epoch": 0.40552, "grad_norm": 2.3449459075927734, "learning_rate": 2.9736294517807123e-05, "loss": 0.6537, "mean_token_accuracy": 0.801622998714447, "num_tokens": 525997069.0, "step": 50690 }, { "entropy": 0.774574089050293, "epoch": 0.4056, "grad_norm": 4.663685321807861, "learning_rate": 2.973229291716687e-05, "loss": 0.7631, "mean_token_accuracy": 0.8009090483188629, "num_tokens": 526033499.0, "step": 50700 }, { "entropy": 0.6606496512889862, "epoch": 0.40568, "grad_norm": 1.8288935422897339, "learning_rate": 2.9728291316526614e-05, "loss": 0.6525, "mean_token_accuracy": 0.7949438273906708, "num_tokens": 526197339.0, "step": 50710 }, { "entropy": 0.6425746440887451, "epoch": 0.40576, "grad_norm": 3.1348602771759033, "learning_rate": 2.9724289715886354e-05, "loss": 0.6429, "mean_token_accuracy": 0.8112997889518738, "num_tokens": 526286512.0, "step": 50720 }, { "entropy": 0.7002111971378326, "epoch": 0.40584, "grad_norm": 2.333141803741455, "learning_rate": 2.9720288115246098e-05, "loss": 0.7056, "mean_token_accuracy": 0.7990638375282287, "num_tokens": 526380902.0, "step": 50730 }, { "entropy": 0.6837886571884155, "epoch": 0.40592, "grad_norm": 2.48979115486145, "learning_rate": 2.9716286514605845e-05, "loss": 0.6849, "mean_token_accuracy": 0.7889256715774536, "num_tokens": 526514269.0, "step": 50740 }, { "entropy": 0.724712786078453, "epoch": 0.406, "grad_norm": 4.279899597167969, "learning_rate": 2.971228491396559e-05, "loss": 0.706, "mean_token_accuracy": 0.8099919080734252, "num_tokens": 526553546.0, "step": 50750 }, { "entropy": 0.6571646869182587, "epoch": 0.40608, "grad_norm": 2.298583507537842, "learning_rate": 2.970828331332533e-05, "loss": 0.6509, "mean_token_accuracy": 0.7929286777973175, "num_tokens": 526717386.0, "step": 50760 }, { "entropy": 0.6958920329809188, "epoch": 0.40616, "grad_norm": 3.334075450897217, "learning_rate": 2.970428171268508e-05, "loss": 0.6859, "mean_token_accuracy": 0.8024401307106018, "num_tokens": 526805470.0, "step": 50770 }, { "entropy": 0.6658696234226227, "epoch": 0.40624, "grad_norm": 2.703673839569092, "learning_rate": 2.970028011204482e-05, "loss": 0.6644, "mean_token_accuracy": 0.8060468852519989, "num_tokens": 526899428.0, "step": 50780 }, { "entropy": 0.716825008392334, "epoch": 0.40632, "grad_norm": 2.732933759689331, "learning_rate": 2.9696278511404564e-05, "loss": 0.7142, "mean_token_accuracy": 0.7828901588916779, "num_tokens": 527030641.0, "step": 50790 }, { "entropy": 0.6954958498477936, "epoch": 0.4064, "grad_norm": 4.952491283416748, "learning_rate": 2.9692276910764304e-05, "loss": 0.6838, "mean_token_accuracy": 0.8145350217819214, "num_tokens": 527066480.0, "step": 50800 }, { "entropy": 0.6149311810731888, "epoch": 0.40648, "grad_norm": 1.588791012763977, "learning_rate": 2.9688275310124054e-05, "loss": 0.6103, "mean_token_accuracy": 0.8018075168132782, "num_tokens": 527230320.0, "step": 50810 }, { "entropy": 0.5702236592769623, "epoch": 0.40656, "grad_norm": 3.802398443222046, "learning_rate": 2.9684273709483795e-05, "loss": 0.571, "mean_token_accuracy": 0.8301055133342743, "num_tokens": 527323258.0, "step": 50820 }, { "entropy": 0.6690941631793976, "epoch": 0.40664, "grad_norm": 2.023360013961792, "learning_rate": 2.968027210884354e-05, "loss": 0.6639, "mean_token_accuracy": 0.8075933873653411, "num_tokens": 527417973.0, "step": 50830 }, { "entropy": 0.6572821080684662, "epoch": 0.40672, "grad_norm": 3.048624277114868, "learning_rate": 2.9676270508203285e-05, "loss": 0.6553, "mean_token_accuracy": 0.7929469525814057, "num_tokens": 527570227.0, "step": 50840 }, { "entropy": 0.6176987886428833, "epoch": 0.4068, "grad_norm": 5.4547505378723145, "learning_rate": 2.967226890756303e-05, "loss": 0.6276, "mean_token_accuracy": 0.8257993161678314, "num_tokens": 527615756.0, "step": 50850 }, { "entropy": 0.6391693949699402, "epoch": 0.40688, "grad_norm": 1.6107397079467773, "learning_rate": 2.966826730692277e-05, "loss": 0.6322, "mean_token_accuracy": 0.797789454460144, "num_tokens": 527779596.0, "step": 50860 }, { "entropy": 0.7476710319519043, "epoch": 0.40696, "grad_norm": 4.1680803298950195, "learning_rate": 2.9664265706282513e-05, "loss": 0.7502, "mean_token_accuracy": 0.787436705827713, "num_tokens": 527869244.0, "step": 50870 }, { "entropy": 0.7496363937854766, "epoch": 0.40704, "grad_norm": 2.339698314666748, "learning_rate": 2.966026410564226e-05, "loss": 0.7524, "mean_token_accuracy": 0.7855402231216431, "num_tokens": 527963717.0, "step": 50880 }, { "entropy": 0.7576177954673767, "epoch": 0.40712, "grad_norm": 2.760620355606079, "learning_rate": 2.9656262505002004e-05, "loss": 0.7507, "mean_token_accuracy": 0.7755637109279633, "num_tokens": 528107560.0, "step": 50890 }, { "entropy": 0.7401514142751694, "epoch": 0.4072, "grad_norm": 5.996659755706787, "learning_rate": 2.9652260904361744e-05, "loss": 0.7317, "mean_token_accuracy": 0.8065586268901825, "num_tokens": 528150148.0, "step": 50900 }, { "entropy": 0.6662038445472718, "epoch": 0.40728, "grad_norm": 1.4430612325668335, "learning_rate": 2.9648259303721488e-05, "loss": 0.6717, "mean_token_accuracy": 0.7869504153728485, "num_tokens": 528313988.0, "step": 50910 }, { "entropy": 0.7584986388683319, "epoch": 0.40736, "grad_norm": 3.1838719844818115, "learning_rate": 2.9644257703081235e-05, "loss": 0.7475, "mean_token_accuracy": 0.7897109389305115, "num_tokens": 528409392.0, "step": 50920 }, { "entropy": 0.6849200367927551, "epoch": 0.40744, "grad_norm": 1.803957223892212, "learning_rate": 2.964025610244098e-05, "loss": 0.6816, "mean_token_accuracy": 0.8025969445705414, "num_tokens": 528504422.0, "step": 50930 }, { "entropy": 0.7002504348754883, "epoch": 0.40752, "grad_norm": 3.238354444503784, "learning_rate": 2.963625450180072e-05, "loss": 0.6997, "mean_token_accuracy": 0.7852774024009704, "num_tokens": 528636874.0, "step": 50940 }, { "entropy": 0.6813717901706695, "epoch": 0.4076, "grad_norm": 5.042660713195801, "learning_rate": 2.9632252901160466e-05, "loss": 0.6671, "mean_token_accuracy": 0.823151272535324, "num_tokens": 528674344.0, "step": 50950 }, { "entropy": 0.6729088604450226, "epoch": 0.40768, "grad_norm": 2.3721113204956055, "learning_rate": 2.962825130052021e-05, "loss": 0.6693, "mean_token_accuracy": 0.7908558964729309, "num_tokens": 528836470.0, "step": 50960 }, { "entropy": 0.6922871828079223, "epoch": 0.40776, "grad_norm": 3.132615327835083, "learning_rate": 2.9624249699879954e-05, "loss": 0.6841, "mean_token_accuracy": 0.8036684334278107, "num_tokens": 528910481.0, "step": 50970 }, { "entropy": 0.7026836037635803, "epoch": 0.40784, "grad_norm": 1.404301404953003, "learning_rate": 2.9620248099239694e-05, "loss": 0.7156, "mean_token_accuracy": 0.7942646503448486, "num_tokens": 529002904.0, "step": 50980 }, { "entropy": 0.652877128124237, "epoch": 0.40792, "grad_norm": 2.2497310638427734, "learning_rate": 2.961624649859944e-05, "loss": 0.6542, "mean_token_accuracy": 0.7960518062114715, "num_tokens": 529143394.0, "step": 50990 }, { "entropy": 0.6768942832946777, "epoch": 0.408, "grad_norm": 3.879371404647827, "learning_rate": 2.9612244897959185e-05, "loss": 0.6588, "mean_token_accuracy": 0.8197352588176727, "num_tokens": 529179563.0, "step": 51000 }, { "entropy": 0.6841889321804047, "epoch": 0.40808, "grad_norm": 1.5022796392440796, "learning_rate": 2.960824329731893e-05, "loss": 0.6868, "mean_token_accuracy": 0.7879396736621856, "num_tokens": 529343403.0, "step": 51010 }, { "entropy": 0.7023737460374833, "epoch": 0.40816, "grad_norm": 2.956246852874756, "learning_rate": 2.9604241696678676e-05, "loss": 0.6943, "mean_token_accuracy": 0.7964373350143432, "num_tokens": 529441527.0, "step": 51020 }, { "entropy": 0.6600752294063568, "epoch": 0.40824, "grad_norm": 2.0375442504882812, "learning_rate": 2.9600240096038416e-05, "loss": 0.6595, "mean_token_accuracy": 0.806390130519867, "num_tokens": 529537956.0, "step": 51030 }, { "entropy": 0.6320308446884155, "epoch": 0.40832, "grad_norm": 2.1229195594787598, "learning_rate": 2.959623849539816e-05, "loss": 0.6229, "mean_token_accuracy": 0.8037603735923767, "num_tokens": 529682196.0, "step": 51040 }, { "entropy": 0.6607522010803223, "epoch": 0.4084, "grad_norm": 5.212480068206787, "learning_rate": 2.9592236894757903e-05, "loss": 0.6675, "mean_token_accuracy": 0.8194278359413147, "num_tokens": 529719533.0, "step": 51050 }, { "entropy": 0.6115743339061737, "epoch": 0.40848, "grad_norm": 1.5658308267593384, "learning_rate": 2.958823529411765e-05, "loss": 0.6158, "mean_token_accuracy": 0.8026624321937561, "num_tokens": 529883373.0, "step": 51060 }, { "entropy": 0.660503375530243, "epoch": 0.40856, "grad_norm": 3.199188232421875, "learning_rate": 2.958423369347739e-05, "loss": 0.6468, "mean_token_accuracy": 0.8144006729125977, "num_tokens": 529976375.0, "step": 51070 }, { "entropy": 0.7295238077640533, "epoch": 0.40864, "grad_norm": 1.2519466876983643, "learning_rate": 2.9580232092837135e-05, "loss": 0.7263, "mean_token_accuracy": 0.7901140570640564, "num_tokens": 530071833.0, "step": 51080 }, { "entropy": 0.7147917926311493, "epoch": 0.40872, "grad_norm": 2.531419277191162, "learning_rate": 2.9576230492196882e-05, "loss": 0.7032, "mean_token_accuracy": 0.7886357128620147, "num_tokens": 530204258.0, "step": 51090 }, { "entropy": 0.756617671251297, "epoch": 0.4088, "grad_norm": 5.888422012329102, "learning_rate": 2.9572228891556625e-05, "loss": 0.7693, "mean_token_accuracy": 0.8050333678722381, "num_tokens": 530236772.0, "step": 51100 }, { "entropy": 0.6968115210533142, "epoch": 0.40888, "grad_norm": 1.5040340423583984, "learning_rate": 2.9568227290916366e-05, "loss": 0.6933, "mean_token_accuracy": 0.7848253607749939, "num_tokens": 530400612.0, "step": 51110 }, { "entropy": 0.7018423467874527, "epoch": 0.40896, "grad_norm": 3.3538448810577393, "learning_rate": 2.956422569027611e-05, "loss": 0.6918, "mean_token_accuracy": 0.8020709216594696, "num_tokens": 530499953.0, "step": 51120 }, { "entropy": 0.6103925287723542, "epoch": 0.40904, "grad_norm": 1.6362866163253784, "learning_rate": 2.9560224089635857e-05, "loss": 0.6191, "mean_token_accuracy": 0.8129365265369415, "num_tokens": 530596036.0, "step": 51130 }, { "entropy": 0.7374842017889023, "epoch": 0.40912, "grad_norm": 2.9021785259246826, "learning_rate": 2.95562224889956e-05, "loss": 0.7289, "mean_token_accuracy": 0.7837775528430939, "num_tokens": 530731051.0, "step": 51140 }, { "entropy": 0.7296054482460022, "epoch": 0.4092, "grad_norm": 4.530934810638428, "learning_rate": 2.955222088835534e-05, "loss": 0.709, "mean_token_accuracy": 0.8144909501075744, "num_tokens": 530770677.0, "step": 51150 }, { "entropy": 0.6457547247409821, "epoch": 0.40928, "grad_norm": 1.79297935962677, "learning_rate": 2.954821928771509e-05, "loss": 0.6528, "mean_token_accuracy": 0.7930508077144622, "num_tokens": 530934517.0, "step": 51160 }, { "entropy": 0.656926891207695, "epoch": 0.40936, "grad_norm": 3.339303970336914, "learning_rate": 2.954421768707483e-05, "loss": 0.6363, "mean_token_accuracy": 0.8119154572486877, "num_tokens": 531023014.0, "step": 51170 }, { "entropy": 0.6463059306144714, "epoch": 0.40944, "grad_norm": 1.8462895154953003, "learning_rate": 2.9540216086434575e-05, "loss": 0.6436, "mean_token_accuracy": 0.809916615486145, "num_tokens": 531118627.0, "step": 51180 }, { "entropy": 0.6645931303501129, "epoch": 0.40952, "grad_norm": 2.4207687377929688, "learning_rate": 2.9536214485794315e-05, "loss": 0.6587, "mean_token_accuracy": 0.7988218367099762, "num_tokens": 531250512.0, "step": 51190 }, { "entropy": 0.7561896741390228, "epoch": 0.4096, "grad_norm": 4.207883358001709, "learning_rate": 2.9532212885154066e-05, "loss": 0.74, "mean_token_accuracy": 0.8054018557071686, "num_tokens": 531284664.0, "step": 51200 }, { "entropy": 0.7129358232021332, "epoch": 0.40968, "grad_norm": 2.333796262741089, "learning_rate": 2.9528211284513806e-05, "loss": 0.7173, "mean_token_accuracy": 0.7789026021957397, "num_tokens": 531448290.0, "step": 51210 }, { "entropy": 0.6616519629955292, "epoch": 0.40976, "grad_norm": 4.385740756988525, "learning_rate": 2.952420968387355e-05, "loss": 0.6527, "mean_token_accuracy": 0.816333281993866, "num_tokens": 531524285.0, "step": 51220 }, { "entropy": 0.7270779728889465, "epoch": 0.40984, "grad_norm": 1.6194006204605103, "learning_rate": 2.9520208083233297e-05, "loss": 0.726, "mean_token_accuracy": 0.7974663496017456, "num_tokens": 531618955.0, "step": 51230 }, { "entropy": 0.6754335105419159, "epoch": 0.40992, "grad_norm": 2.996929407119751, "learning_rate": 2.951620648259304e-05, "loss": 0.6703, "mean_token_accuracy": 0.7982273817062377, "num_tokens": 531743122.0, "step": 51240 }, { "entropy": 0.6849055767059327, "epoch": 0.41, "grad_norm": 4.178227424621582, "learning_rate": 2.951220488195278e-05, "loss": 0.666, "mean_token_accuracy": 0.8234157919883728, "num_tokens": 531777611.0, "step": 51250 }, { "entropy": 0.6518210947513581, "epoch": 0.41008, "grad_norm": 1.2282251119613647, "learning_rate": 2.9508203281312525e-05, "loss": 0.6533, "mean_token_accuracy": 0.792879831790924, "num_tokens": 531941451.0, "step": 51260 }, { "entropy": 0.6708609819412231, "epoch": 0.41016, "grad_norm": 2.7634830474853516, "learning_rate": 2.9504201680672272e-05, "loss": 0.6603, "mean_token_accuracy": 0.8075559616088868, "num_tokens": 532040172.0, "step": 51270 }, { "entropy": 0.6726635575294495, "epoch": 0.41024, "grad_norm": 2.1822164058685303, "learning_rate": 2.9500200080032016e-05, "loss": 0.6933, "mean_token_accuracy": 0.8029662549495697, "num_tokens": 532134545.0, "step": 51280 }, { "entropy": 0.7059501707553864, "epoch": 0.41032, "grad_norm": 1.9329354763031006, "learning_rate": 2.9496198479391756e-05, "loss": 0.6907, "mean_token_accuracy": 0.7892370402812958, "num_tokens": 532277477.0, "step": 51290 }, { "entropy": 0.6968993842601776, "epoch": 0.4104, "grad_norm": 4.52732515335083, "learning_rate": 2.9492196878751503e-05, "loss": 0.6971, "mean_token_accuracy": 0.8109697163105011, "num_tokens": 532314100.0, "step": 51300 }, { "entropy": 0.6436306595802307, "epoch": 0.41048, "grad_norm": 1.8414133787155151, "learning_rate": 2.9488195278111247e-05, "loss": 0.6428, "mean_token_accuracy": 0.79507817029953, "num_tokens": 532477940.0, "step": 51310 }, { "entropy": 0.6840819865465164, "epoch": 0.41056, "grad_norm": 3.2795064449310303, "learning_rate": 2.948419367747099e-05, "loss": 0.6686, "mean_token_accuracy": 0.8077594339847565, "num_tokens": 532568086.0, "step": 51320 }, { "entropy": 0.6941798090934753, "epoch": 0.41064, "grad_norm": 1.724082112312317, "learning_rate": 2.948019207683073e-05, "loss": 0.6907, "mean_token_accuracy": 0.8013415992259979, "num_tokens": 532662986.0, "step": 51330 }, { "entropy": 0.6963998317718506, "epoch": 0.41072, "grad_norm": 3.8577895164489746, "learning_rate": 2.9476190476190478e-05, "loss": 0.6895, "mean_token_accuracy": 0.7967088103294373, "num_tokens": 532784412.0, "step": 51340 }, { "entropy": 0.6292377144098282, "epoch": 0.4108, "grad_norm": 4.232131004333496, "learning_rate": 2.947218887555022e-05, "loss": 0.6262, "mean_token_accuracy": 0.8270057737827301, "num_tokens": 532818605.0, "step": 51350 }, { "entropy": 0.6653087079524994, "epoch": 0.41088, "grad_norm": 1.8614085912704468, "learning_rate": 2.9468187274909965e-05, "loss": 0.6698, "mean_token_accuracy": 0.7857046902179718, "num_tokens": 532982445.0, "step": 51360 }, { "entropy": 0.7081003189086914, "epoch": 0.41096, "grad_norm": 5.00791072845459, "learning_rate": 2.9464185674269712e-05, "loss": 0.6956, "mean_token_accuracy": 0.8007605314254761, "num_tokens": 533064379.0, "step": 51370 }, { "entropy": 0.7052516400814056, "epoch": 0.41104, "grad_norm": 2.2168993949890137, "learning_rate": 2.9460184073629453e-05, "loss": 0.6987, "mean_token_accuracy": 0.7976643741130829, "num_tokens": 533158768.0, "step": 51380 }, { "entropy": 0.7173013627529145, "epoch": 0.41112, "grad_norm": 3.079326629638672, "learning_rate": 2.9456182472989196e-05, "loss": 0.7207, "mean_token_accuracy": 0.7863037407398223, "num_tokens": 533295229.0, "step": 51390 }, { "entropy": 0.7138747036457062, "epoch": 0.4112, "grad_norm": 4.770480155944824, "learning_rate": 2.945218087234894e-05, "loss": 0.7148, "mean_token_accuracy": 0.8090245008468628, "num_tokens": 533328499.0, "step": 51400 }, { "entropy": 0.6871739625930786, "epoch": 0.41128, "grad_norm": 1.5186630487442017, "learning_rate": 2.9448179271708687e-05, "loss": 0.6806, "mean_token_accuracy": 0.7850390851497651, "num_tokens": 533492339.0, "step": 51410 }, { "entropy": 0.5775824129581452, "epoch": 0.41136, "grad_norm": 2.822561502456665, "learning_rate": 2.9444177671068428e-05, "loss": 0.5774, "mean_token_accuracy": 0.8247947096824646, "num_tokens": 533588768.0, "step": 51420 }, { "entropy": 0.6796074032783508, "epoch": 0.41144, "grad_norm": 1.3961381912231445, "learning_rate": 2.944017607042817e-05, "loss": 0.679, "mean_token_accuracy": 0.7999856829643249, "num_tokens": 533681526.0, "step": 51430 }, { "entropy": 0.6520513445138931, "epoch": 0.41152, "grad_norm": 2.1286063194274902, "learning_rate": 2.9436174469787915e-05, "loss": 0.6472, "mean_token_accuracy": 0.8043249547481537, "num_tokens": 533802244.0, "step": 51440 }, { "entropy": 0.7001585185527801, "epoch": 0.4116, "grad_norm": 5.411666393280029, "learning_rate": 2.9432172869147662e-05, "loss": 0.6933, "mean_token_accuracy": 0.8152265131473542, "num_tokens": 533839363.0, "step": 51450 }, { "entropy": 0.651751559972763, "epoch": 0.41168, "grad_norm": 2.6791088581085205, "learning_rate": 2.9428171268507402e-05, "loss": 0.6446, "mean_token_accuracy": 0.7943026304244996, "num_tokens": 534003203.0, "step": 51460 }, { "entropy": 0.6874550104141235, "epoch": 0.41176, "grad_norm": 2.849853277206421, "learning_rate": 2.9424169667867146e-05, "loss": 0.6709, "mean_token_accuracy": 0.811663293838501, "num_tokens": 534086154.0, "step": 51470 }, { "entropy": 0.6553621888160706, "epoch": 0.41184, "grad_norm": 2.893944501876831, "learning_rate": 2.9420168067226893e-05, "loss": 0.6626, "mean_token_accuracy": 0.8069335699081421, "num_tokens": 534179082.0, "step": 51480 }, { "entropy": 0.7167642951011658, "epoch": 0.41192, "grad_norm": 2.419512987136841, "learning_rate": 2.9416166466586637e-05, "loss": 0.7169, "mean_token_accuracy": 0.7844461560249328, "num_tokens": 534326302.0, "step": 51490 }, { "entropy": 0.6684401154518127, "epoch": 0.412, "grad_norm": 4.75552225112915, "learning_rate": 2.9412164865946377e-05, "loss": 0.6433, "mean_token_accuracy": 0.8198746681213379, "num_tokens": 534373340.0, "step": 51500 }, { "entropy": 0.6604857325553894, "epoch": 0.41208, "grad_norm": 2.447918653488159, "learning_rate": 2.940816326530612e-05, "loss": 0.6602, "mean_token_accuracy": 0.7949982225894928, "num_tokens": 534536660.0, "step": 51510 }, { "entropy": 0.6812653541564941, "epoch": 0.41216, "grad_norm": 2.846470355987549, "learning_rate": 2.9404161664665868e-05, "loss": 0.6765, "mean_token_accuracy": 0.8083024203777314, "num_tokens": 534613445.0, "step": 51520 }, { "entropy": 0.6882525682449341, "epoch": 0.41224, "grad_norm": 1.8688855171203613, "learning_rate": 2.9400160064025612e-05, "loss": 0.6848, "mean_token_accuracy": 0.7987753748893738, "num_tokens": 534708496.0, "step": 51530 }, { "entropy": 0.6370799303054809, "epoch": 0.41232, "grad_norm": 3.462246894836426, "learning_rate": 2.9396158463385352e-05, "loss": 0.6312, "mean_token_accuracy": 0.8071986198425293, "num_tokens": 534836568.0, "step": 51540 }, { "entropy": 0.7026017487049103, "epoch": 0.4124, "grad_norm": 3.8910248279571533, "learning_rate": 2.9392156862745103e-05, "loss": 0.6895, "mean_token_accuracy": 0.8099341571331025, "num_tokens": 534877922.0, "step": 51550 }, { "entropy": 0.6347281813621521, "epoch": 0.41248, "grad_norm": 1.626420259475708, "learning_rate": 2.9388155262104843e-05, "loss": 0.6368, "mean_token_accuracy": 0.7978627264499665, "num_tokens": 535041762.0, "step": 51560 }, { "entropy": 0.65045924782753, "epoch": 0.41256, "grad_norm": 3.0042474269866943, "learning_rate": 2.9384153661464587e-05, "loss": 0.6464, "mean_token_accuracy": 0.8155676484107971, "num_tokens": 535121329.0, "step": 51570 }, { "entropy": 0.7413877308368683, "epoch": 0.41264, "grad_norm": 1.8574711084365845, "learning_rate": 2.9380152060824327e-05, "loss": 0.7346, "mean_token_accuracy": 0.7886195302009582, "num_tokens": 535215350.0, "step": 51580 }, { "entropy": 0.6635432779788971, "epoch": 0.41272, "grad_norm": 2.404557466506958, "learning_rate": 2.9376150460184078e-05, "loss": 0.6616, "mean_token_accuracy": 0.7991208612918854, "num_tokens": 535341932.0, "step": 51590 }, { "entropy": 0.6941531479358674, "epoch": 0.4128, "grad_norm": 4.450069427490234, "learning_rate": 2.9372148859543818e-05, "loss": 0.6782, "mean_token_accuracy": 0.8123260557651519, "num_tokens": 535375633.0, "step": 51600 }, { "entropy": 0.6401824414730072, "epoch": 0.41288, "grad_norm": 1.9821633100509644, "learning_rate": 2.936814725890356e-05, "loss": 0.6439, "mean_token_accuracy": 0.7941072404384613, "num_tokens": 535539473.0, "step": 51610 }, { "entropy": 0.6299016296863555, "epoch": 0.41296, "grad_norm": 3.4406235218048096, "learning_rate": 2.936414565826331e-05, "loss": 0.6247, "mean_token_accuracy": 0.821971207857132, "num_tokens": 535617813.0, "step": 51620 }, { "entropy": 0.7278630614280701, "epoch": 0.41304, "grad_norm": 1.6938607692718506, "learning_rate": 2.9360144057623052e-05, "loss": 0.7327, "mean_token_accuracy": 0.7969301044940948, "num_tokens": 535711405.0, "step": 51630 }, { "entropy": 0.6835149049758911, "epoch": 0.41312, "grad_norm": 2.9965291023254395, "learning_rate": 2.9356142456982793e-05, "loss": 0.6824, "mean_token_accuracy": 0.7998447895050049, "num_tokens": 535834233.0, "step": 51640 }, { "entropy": 0.7436796218156815, "epoch": 0.4132, "grad_norm": 4.3398051261901855, "learning_rate": 2.9352140856342536e-05, "loss": 0.7338, "mean_token_accuracy": 0.8113023817539216, "num_tokens": 535866029.0, "step": 51650 }, { "entropy": 0.6598318874835968, "epoch": 0.41328, "grad_norm": 2.245907783508301, "learning_rate": 2.9348139255702284e-05, "loss": 0.6582, "mean_token_accuracy": 0.7889228165149689, "num_tokens": 536029869.0, "step": 51660 }, { "entropy": 0.6760265827178955, "epoch": 0.41336, "grad_norm": 5.135639190673828, "learning_rate": 2.9344137655062027e-05, "loss": 0.671, "mean_token_accuracy": 0.8112832367420196, "num_tokens": 536106988.0, "step": 51670 }, { "entropy": 0.729202789068222, "epoch": 0.41344, "grad_norm": 1.652721643447876, "learning_rate": 2.9340136054421768e-05, "loss": 0.7386, "mean_token_accuracy": 0.7924241423606873, "num_tokens": 536198329.0, "step": 51680 }, { "entropy": 0.7146779894828796, "epoch": 0.41352, "grad_norm": 3.202854871749878, "learning_rate": 2.9336134453781515e-05, "loss": 0.7116, "mean_token_accuracy": 0.7840606212615967, "num_tokens": 536337526.0, "step": 51690 }, { "entropy": 0.6986596345901489, "epoch": 0.4136, "grad_norm": 4.714614391326904, "learning_rate": 2.933213285314126e-05, "loss": 0.7041, "mean_token_accuracy": 0.8143972396850586, "num_tokens": 536373922.0, "step": 51700 }, { "entropy": 0.6818543374538422, "epoch": 0.41368, "grad_norm": 1.4469813108444214, "learning_rate": 2.9328131252501002e-05, "loss": 0.6815, "mean_token_accuracy": 0.7848620057106018, "num_tokens": 536537762.0, "step": 51710 }, { "entropy": 0.654667204618454, "epoch": 0.41376, "grad_norm": 2.847823143005371, "learning_rate": 2.9324129651860742e-05, "loss": 0.6506, "mean_token_accuracy": 0.8112498879432678, "num_tokens": 536622823.0, "step": 51720 }, { "entropy": 0.6972311496734619, "epoch": 0.41384, "grad_norm": 3.7739791870117188, "learning_rate": 2.932012805122049e-05, "loss": 0.6957, "mean_token_accuracy": 0.7972743570804596, "num_tokens": 536717828.0, "step": 51730 }, { "entropy": 0.7255133271217347, "epoch": 0.41392, "grad_norm": 1.8876910209655762, "learning_rate": 2.9316126450580233e-05, "loss": 0.7205, "mean_token_accuracy": 0.7856502115726471, "num_tokens": 536859345.0, "step": 51740 }, { "entropy": 0.5958248257637024, "epoch": 0.414, "grad_norm": 4.729808807373047, "learning_rate": 2.9312124849939977e-05, "loss": 0.583, "mean_token_accuracy": 0.8407339632511139, "num_tokens": 536900170.0, "step": 51750 }, { "entropy": 0.7075584471225739, "epoch": 0.41408, "grad_norm": 1.7028239965438843, "learning_rate": 2.9308123249299724e-05, "loss": 0.709, "mean_token_accuracy": 0.7809458374977112, "num_tokens": 537063122.0, "step": 51760 }, { "entropy": 0.6393716692924499, "epoch": 0.41416, "grad_norm": 3.6629927158355713, "learning_rate": 2.9304121648659464e-05, "loss": 0.635, "mean_token_accuracy": 0.8128202795982361, "num_tokens": 537144032.0, "step": 51770 }, { "entropy": 0.7185204744338989, "epoch": 0.41424, "grad_norm": 1.81069815158844, "learning_rate": 2.9300120048019208e-05, "loss": 0.7113, "mean_token_accuracy": 0.8022958040237427, "num_tokens": 537235807.0, "step": 51780 }, { "entropy": 0.7249463021755218, "epoch": 0.41432, "grad_norm": 2.4118261337280273, "learning_rate": 2.9296118447378952e-05, "loss": 0.7255, "mean_token_accuracy": 0.7828229069709778, "num_tokens": 537375801.0, "step": 51790 }, { "entropy": 0.7211729347705841, "epoch": 0.4144, "grad_norm": 4.206562042236328, "learning_rate": 2.92921168467387e-05, "loss": 0.7129, "mean_token_accuracy": 0.8131644189357757, "num_tokens": 537415760.0, "step": 51800 }, { "entropy": 0.6553893923759461, "epoch": 0.41448, "grad_norm": 2.3863213062286377, "learning_rate": 2.928811524609844e-05, "loss": 0.6503, "mean_token_accuracy": 0.795569920539856, "num_tokens": 537577406.0, "step": 51810 }, { "entropy": 0.6091086357831955, "epoch": 0.41456, "grad_norm": 3.619676351547241, "learning_rate": 2.9284113645458183e-05, "loss": 0.6126, "mean_token_accuracy": 0.8221276521682739, "num_tokens": 537642292.0, "step": 51820 }, { "entropy": 0.7249935507774353, "epoch": 0.41464, "grad_norm": 1.992210030555725, "learning_rate": 2.928011204481793e-05, "loss": 0.7053, "mean_token_accuracy": 0.8000841438770294, "num_tokens": 537735067.0, "step": 51830 }, { "entropy": 0.5838660776615143, "epoch": 0.41472, "grad_norm": 2.330272912979126, "learning_rate": 2.9276110444177674e-05, "loss": 0.5727, "mean_token_accuracy": 0.8218566000461578, "num_tokens": 537864031.0, "step": 51840 }, { "entropy": 0.6204015135765075, "epoch": 0.4148, "grad_norm": 4.465489864349365, "learning_rate": 2.9272108843537414e-05, "loss": 0.6207, "mean_token_accuracy": 0.8327187776565552, "num_tokens": 537899387.0, "step": 51850 }, { "entropy": 0.6584060251712799, "epoch": 0.41488, "grad_norm": 1.6552631855010986, "learning_rate": 2.9268107242897158e-05, "loss": 0.6637, "mean_token_accuracy": 0.7903273165225982, "num_tokens": 538063227.0, "step": 51860 }, { "entropy": 0.6241390854120255, "epoch": 0.41496, "grad_norm": 3.277277708053589, "learning_rate": 2.9264105642256905e-05, "loss": 0.6145, "mean_token_accuracy": 0.8187557816505432, "num_tokens": 538135721.0, "step": 51870 }, { "entropy": 0.6698412477970124, "epoch": 0.41504, "grad_norm": 3.1483681201934814, "learning_rate": 2.926010404161665e-05, "loss": 0.6562, "mean_token_accuracy": 0.8122509896755219, "num_tokens": 538227751.0, "step": 51880 }, { "entropy": 0.6685207843780517, "epoch": 0.41512, "grad_norm": 2.0821807384490967, "learning_rate": 2.925610244097639e-05, "loss": 0.6644, "mean_token_accuracy": 0.793997460603714, "num_tokens": 538372136.0, "step": 51890 }, { "entropy": 0.6638856440782547, "epoch": 0.4152, "grad_norm": 5.460867404937744, "learning_rate": 2.925210084033614e-05, "loss": 0.6541, "mean_token_accuracy": 0.8251558423042298, "num_tokens": 538410719.0, "step": 51900 }, { "entropy": 0.6502697765827179, "epoch": 0.41528, "grad_norm": 1.6376267671585083, "learning_rate": 2.924809923969588e-05, "loss": 0.6559, "mean_token_accuracy": 0.7914528727531434, "num_tokens": 538574504.0, "step": 51910 }, { "entropy": 0.7190989434719086, "epoch": 0.41536, "grad_norm": 3.8994362354278564, "learning_rate": 2.9244097639055623e-05, "loss": 0.7163, "mean_token_accuracy": 0.7973930716514588, "num_tokens": 538654731.0, "step": 51920 }, { "entropy": 0.6765764713287353, "epoch": 0.41544, "grad_norm": 2.6609811782836914, "learning_rate": 2.9240096038415364e-05, "loss": 0.673, "mean_token_accuracy": 0.8044710040092469, "num_tokens": 538747801.0, "step": 51930 }, { "entropy": 0.6836043894290924, "epoch": 0.41552, "grad_norm": 2.590750217437744, "learning_rate": 2.9236094437775114e-05, "loss": 0.677, "mean_token_accuracy": 0.7928146183490753, "num_tokens": 538887894.0, "step": 51940 }, { "entropy": 0.6854047238826751, "epoch": 0.4156, "grad_norm": 4.4678449630737305, "learning_rate": 2.9232092837134855e-05, "loss": 0.6889, "mean_token_accuracy": 0.8117929577827454, "num_tokens": 538925738.0, "step": 51950 }, { "entropy": 0.6797748446464539, "epoch": 0.41568, "grad_norm": 1.4370189905166626, "learning_rate": 2.92280912364946e-05, "loss": 0.6785, "mean_token_accuracy": 0.7866267800331116, "num_tokens": 539089578.0, "step": 51960 }, { "entropy": 0.5942582547664642, "epoch": 0.41576, "grad_norm": 3.666468381881714, "learning_rate": 2.9224089635854345e-05, "loss": 0.581, "mean_token_accuracy": 0.8257328867912292, "num_tokens": 539184648.0, "step": 51970 }, { "entropy": 0.7122686326503753, "epoch": 0.41584, "grad_norm": 1.9676439762115479, "learning_rate": 2.922008803521409e-05, "loss": 0.7156, "mean_token_accuracy": 0.7929820656776428, "num_tokens": 539281417.0, "step": 51980 }, { "entropy": 0.6856464922428132, "epoch": 0.41592, "grad_norm": 2.544132947921753, "learning_rate": 2.921608643457383e-05, "loss": 0.6761, "mean_token_accuracy": 0.791552048921585, "num_tokens": 539428524.0, "step": 51990 }, { "entropy": 0.698443454504013, "epoch": 0.416, "grad_norm": 4.66214656829834, "learning_rate": 2.9212084833933573e-05, "loss": 0.6978, "mean_token_accuracy": 0.8103549182415009, "num_tokens": 539471282.0, "step": 52000 }, { "entropy": 0.6606318771839141, "epoch": 0.41608, "grad_norm": 2.000454902648926, "learning_rate": 2.920808323329332e-05, "loss": 0.6608, "mean_token_accuracy": 0.7938256859779358, "num_tokens": 539631737.0, "step": 52010 }, { "entropy": 0.6330953061580658, "epoch": 0.41616, "grad_norm": 3.585606336593628, "learning_rate": 2.9204081632653064e-05, "loss": 0.6209, "mean_token_accuracy": 0.8190378308296203, "num_tokens": 539695229.0, "step": 52020 }, { "entropy": 0.7125839650630951, "epoch": 0.41624, "grad_norm": 2.543571710586548, "learning_rate": 2.9200080032012804e-05, "loss": 0.7154, "mean_token_accuracy": 0.7980775952339172, "num_tokens": 539786773.0, "step": 52030 }, { "entropy": 0.6269274443387985, "epoch": 0.41632, "grad_norm": 2.4270966053009033, "learning_rate": 2.9196078431372548e-05, "loss": 0.6288, "mean_token_accuracy": 0.8020234942436218, "num_tokens": 539933269.0, "step": 52040 }, { "entropy": 0.7774703681468964, "epoch": 0.4164, "grad_norm": 4.6785054206848145, "learning_rate": 2.9192076830732295e-05, "loss": 0.7713, "mean_token_accuracy": 0.8001867175102234, "num_tokens": 539978301.0, "step": 52050 }, { "entropy": 0.6449629902839661, "epoch": 0.41648, "grad_norm": 2.0922229290008545, "learning_rate": 2.918807523009204e-05, "loss": 0.6431, "mean_token_accuracy": 0.7917073786258697, "num_tokens": 540142141.0, "step": 52060 }, { "entropy": 0.6861846745014191, "epoch": 0.41656, "grad_norm": 3.233856201171875, "learning_rate": 2.918407362945178e-05, "loss": 0.6768, "mean_token_accuracy": 0.8030146181583404, "num_tokens": 540233583.0, "step": 52070 }, { "entropy": 0.7369484066963196, "epoch": 0.41664, "grad_norm": 1.8711086511611938, "learning_rate": 2.918007202881153e-05, "loss": 0.7298, "mean_token_accuracy": 0.7896906435489655, "num_tokens": 540327883.0, "step": 52080 }, { "entropy": 0.7255127787590027, "epoch": 0.41672, "grad_norm": 2.8825361728668213, "learning_rate": 2.917607042817127e-05, "loss": 0.7304, "mean_token_accuracy": 0.7827469289302826, "num_tokens": 540459872.0, "step": 52090 }, { "entropy": 0.7275029331445694, "epoch": 0.4168, "grad_norm": 5.336118221282959, "learning_rate": 2.9172068827531014e-05, "loss": 0.7175, "mean_token_accuracy": 0.8085861146450043, "num_tokens": 540499863.0, "step": 52100 }, { "entropy": 0.6565698742866516, "epoch": 0.41688, "grad_norm": 1.5457744598388672, "learning_rate": 2.9168067226890754e-05, "loss": 0.6444, "mean_token_accuracy": 0.7950766026973725, "num_tokens": 540662622.0, "step": 52110 }, { "entropy": 0.6045698076486588, "epoch": 0.41696, "grad_norm": 3.4751381874084473, "learning_rate": 2.9164065626250505e-05, "loss": 0.6001, "mean_token_accuracy": 0.8261426031589508, "num_tokens": 540739787.0, "step": 52120 }, { "entropy": 0.6638117730617523, "epoch": 0.41704, "grad_norm": 1.7285473346710205, "learning_rate": 2.9160064025610245e-05, "loss": 0.6707, "mean_token_accuracy": 0.8084216117858887, "num_tokens": 540834506.0, "step": 52130 }, { "entropy": 0.6910745084285737, "epoch": 0.41712, "grad_norm": 2.7212917804718018, "learning_rate": 2.915606242496999e-05, "loss": 0.6906, "mean_token_accuracy": 0.7879131436347961, "num_tokens": 540982948.0, "step": 52140 }, { "entropy": 0.6214039623737335, "epoch": 0.4172, "grad_norm": 4.9845123291015625, "learning_rate": 2.9152060824329736e-05, "loss": 0.6039, "mean_token_accuracy": 0.8289270341396332, "num_tokens": 541027675.0, "step": 52150 }, { "entropy": 0.6920628368854522, "epoch": 0.41728, "grad_norm": 1.978832483291626, "learning_rate": 2.914805922368948e-05, "loss": 0.6912, "mean_token_accuracy": 0.7836773276329041, "num_tokens": 541191515.0, "step": 52160 }, { "entropy": 0.6497759401798249, "epoch": 0.41736, "grad_norm": 2.8612749576568604, "learning_rate": 2.914405762304922e-05, "loss": 0.6405, "mean_token_accuracy": 0.8186691999435425, "num_tokens": 541272027.0, "step": 52170 }, { "entropy": 0.6959039568901062, "epoch": 0.41744, "grad_norm": 2.7211036682128906, "learning_rate": 2.9140056022408963e-05, "loss": 0.6848, "mean_token_accuracy": 0.8029132306575775, "num_tokens": 541365318.0, "step": 52180 }, { "entropy": 0.7154450714588165, "epoch": 0.41752, "grad_norm": 3.5459232330322266, "learning_rate": 2.913605442176871e-05, "loss": 0.7121, "mean_token_accuracy": 0.7867560148239136, "num_tokens": 541504462.0, "step": 52190 }, { "entropy": 0.6147065758705139, "epoch": 0.4176, "grad_norm": 5.339626789093018, "learning_rate": 2.9132052821128454e-05, "loss": 0.6091, "mean_token_accuracy": 0.8286318898200988, "num_tokens": 541544190.0, "step": 52200 }, { "entropy": 0.6613409519195557, "epoch": 0.41768, "grad_norm": 2.2279791831970215, "learning_rate": 2.9128051220488195e-05, "loss": 0.6584, "mean_token_accuracy": 0.7941194474697113, "num_tokens": 541708030.0, "step": 52210 }, { "entropy": 0.6410815387964248, "epoch": 0.41776, "grad_norm": 4.353662014007568, "learning_rate": 2.912404961984794e-05, "loss": 0.628, "mean_token_accuracy": 0.8150706946849823, "num_tokens": 541807423.0, "step": 52220 }, { "entropy": 0.7718287467956543, "epoch": 0.41784, "grad_norm": 1.6013332605361938, "learning_rate": 2.9120048019207685e-05, "loss": 0.7757, "mean_token_accuracy": 0.7830961227416993, "num_tokens": 541902443.0, "step": 52230 }, { "entropy": 0.7108214855194092, "epoch": 0.41792, "grad_norm": 4.176759243011475, "learning_rate": 2.911604641856743e-05, "loss": 0.7132, "mean_token_accuracy": 0.7880415558815003, "num_tokens": 542031765.0, "step": 52240 }, { "entropy": 0.6831983983516693, "epoch": 0.418, "grad_norm": 4.38815975189209, "learning_rate": 2.911204481792717e-05, "loss": 0.677, "mean_token_accuracy": 0.8227638721466064, "num_tokens": 542065013.0, "step": 52250 }, { "entropy": 0.6186177611351014, "epoch": 0.41808, "grad_norm": 1.5180974006652832, "learning_rate": 2.9108043217286917e-05, "loss": 0.6156, "mean_token_accuracy": 0.8013617515563964, "num_tokens": 542228853.0, "step": 52260 }, { "entropy": 0.6689725071191788, "epoch": 0.41816, "grad_norm": 2.623894453048706, "learning_rate": 2.910404161664666e-05, "loss": 0.661, "mean_token_accuracy": 0.8090157151222229, "num_tokens": 542319103.0, "step": 52270 }, { "entropy": 0.7127198994159698, "epoch": 0.41824, "grad_norm": 1.7468205690383911, "learning_rate": 2.9100040016006404e-05, "loss": 0.7123, "mean_token_accuracy": 0.7941734373569489, "num_tokens": 542412498.0, "step": 52280 }, { "entropy": 0.6967289268970489, "epoch": 0.41832, "grad_norm": 2.193641424179077, "learning_rate": 2.909603841536615e-05, "loss": 0.685, "mean_token_accuracy": 0.7959650695323944, "num_tokens": 542543468.0, "step": 52290 }, { "entropy": 0.7102492928504944, "epoch": 0.4184, "grad_norm": 4.9689459800720215, "learning_rate": 2.909203681472589e-05, "loss": 0.7139, "mean_token_accuracy": 0.8115334928035736, "num_tokens": 542580760.0, "step": 52300 }, { "entropy": 0.6228028059005737, "epoch": 0.41848, "grad_norm": 1.5944056510925293, "learning_rate": 2.9088035214085635e-05, "loss": 0.6217, "mean_token_accuracy": 0.7984123229980469, "num_tokens": 542744600.0, "step": 52310 }, { "entropy": 0.6581234574317932, "epoch": 0.41856, "grad_norm": 3.2900755405426025, "learning_rate": 2.908403361344538e-05, "loss": 0.6494, "mean_token_accuracy": 0.8100004076957703, "num_tokens": 542835654.0, "step": 52320 }, { "entropy": 0.6757437705993652, "epoch": 0.41864, "grad_norm": 2.022045612335205, "learning_rate": 2.9080032012805126e-05, "loss": 0.6644, "mean_token_accuracy": 0.8066428065299988, "num_tokens": 542929453.0, "step": 52330 }, { "entropy": 0.726404196023941, "epoch": 0.41872, "grad_norm": 2.2313947677612305, "learning_rate": 2.9076030412164866e-05, "loss": 0.7216, "mean_token_accuracy": 0.7843997299671173, "num_tokens": 543072297.0, "step": 52340 }, { "entropy": 0.651555410027504, "epoch": 0.4188, "grad_norm": 5.096184253692627, "learning_rate": 2.907202881152461e-05, "loss": 0.6546, "mean_token_accuracy": 0.8217381119728089, "num_tokens": 543114844.0, "step": 52350 }, { "entropy": 0.6111860662698746, "epoch": 0.41888, "grad_norm": 1.4747800827026367, "learning_rate": 2.9068027210884357e-05, "loss": 0.6083, "mean_token_accuracy": 0.8068714261054992, "num_tokens": 543278107.0, "step": 52360 }, { "entropy": 0.6653506577014923, "epoch": 0.41896, "grad_norm": 2.900118827819824, "learning_rate": 2.90640256102441e-05, "loss": 0.6563, "mean_token_accuracy": 0.812064790725708, "num_tokens": 543359549.0, "step": 52370 }, { "entropy": 0.6978509485721588, "epoch": 0.41904, "grad_norm": 2.0942680835723877, "learning_rate": 2.906002400960384e-05, "loss": 0.6921, "mean_token_accuracy": 0.8003321766853333, "num_tokens": 543454539.0, "step": 52380 }, { "entropy": 0.6816999673843384, "epoch": 0.41912, "grad_norm": 2.6317102909088135, "learning_rate": 2.9056022408963585e-05, "loss": 0.6731, "mean_token_accuracy": 0.7952609300613404, "num_tokens": 543587980.0, "step": 52390 }, { "entropy": 0.7212762653827667, "epoch": 0.4192, "grad_norm": 5.503876209259033, "learning_rate": 2.9052020808323332e-05, "loss": 0.7389, "mean_token_accuracy": 0.8073168337345124, "num_tokens": 543622874.0, "step": 52400 }, { "entropy": 0.6498690962791442, "epoch": 0.41928, "grad_norm": 1.5079147815704346, "learning_rate": 2.9048019207683076e-05, "loss": 0.6422, "mean_token_accuracy": 0.7958292603492737, "num_tokens": 543786714.0, "step": 52410 }, { "entropy": 0.6338560938835144, "epoch": 0.41936, "grad_norm": 3.361389636993408, "learning_rate": 2.9044017607042816e-05, "loss": 0.6297, "mean_token_accuracy": 0.8183945477008819, "num_tokens": 543873758.0, "step": 52420 }, { "entropy": 0.688662201166153, "epoch": 0.41944, "grad_norm": 1.4475716352462769, "learning_rate": 2.9040016006402566e-05, "loss": 0.6741, "mean_token_accuracy": 0.8056186974048615, "num_tokens": 543967434.0, "step": 52430 }, { "entropy": 0.6815206706523895, "epoch": 0.41952, "grad_norm": 3.5642457008361816, "learning_rate": 2.9036014405762307e-05, "loss": 0.6827, "mean_token_accuracy": 0.7977997839450837, "num_tokens": 544083936.0, "step": 52440 }, { "entropy": 0.6001524686813354, "epoch": 0.4196, "grad_norm": 4.918628215789795, "learning_rate": 2.903201280512205e-05, "loss": 0.6008, "mean_token_accuracy": 0.8337616145610809, "num_tokens": 544117312.0, "step": 52450 }, { "entropy": 0.663663512468338, "epoch": 0.41968, "grad_norm": 1.4849978685379028, "learning_rate": 2.902801120448179e-05, "loss": 0.6629, "mean_token_accuracy": 0.7913898468017578, "num_tokens": 544281152.0, "step": 52460 }, { "entropy": 0.6610800504684449, "epoch": 0.41976, "grad_norm": 2.912935733795166, "learning_rate": 2.902400960384154e-05, "loss": 0.6554, "mean_token_accuracy": 0.8075158596038818, "num_tokens": 544376773.0, "step": 52470 }, { "entropy": 0.6453124225139618, "epoch": 0.41984, "grad_norm": 1.5772440433502197, "learning_rate": 2.902000800320128e-05, "loss": 0.6354, "mean_token_accuracy": 0.8128203928470612, "num_tokens": 544471302.0, "step": 52480 }, { "entropy": 0.7223567724227905, "epoch": 0.41992, "grad_norm": 3.0671615600585938, "learning_rate": 2.9016006402561025e-05, "loss": 0.7167, "mean_token_accuracy": 0.7860154330730438, "num_tokens": 544596993.0, "step": 52490 }, { "entropy": 0.6609803140163422, "epoch": 0.42, "grad_norm": 5.101886749267578, "learning_rate": 2.9012004801920772e-05, "loss": 0.6736, "mean_token_accuracy": 0.8156159102916718, "num_tokens": 544631324.0, "step": 52500 }, { "entropy": 0.6279685974121094, "epoch": 0.42008, "grad_norm": 1.8035918474197388, "learning_rate": 2.9008003201280516e-05, "loss": 0.6208, "mean_token_accuracy": 0.8006961405277252, "num_tokens": 544795164.0, "step": 52510 }, { "entropy": 0.6363225042819977, "epoch": 0.42016, "grad_norm": 4.06240177154541, "learning_rate": 2.9004001600640256e-05, "loss": 0.6358, "mean_token_accuracy": 0.8173358321189881, "num_tokens": 544876716.0, "step": 52520 }, { "entropy": 0.7152969658374786, "epoch": 0.42024, "grad_norm": 1.6223440170288086, "learning_rate": 2.9e-05, "loss": 0.7218, "mean_token_accuracy": 0.7964786350727081, "num_tokens": 544970257.0, "step": 52530 }, { "entropy": 0.6632617890834809, "epoch": 0.42032, "grad_norm": 3.0545101165771484, "learning_rate": 2.8995998399359747e-05, "loss": 0.6615, "mean_token_accuracy": 0.7983690202236176, "num_tokens": 545105799.0, "step": 52540 }, { "entropy": 0.6195655524730682, "epoch": 0.4204, "grad_norm": 5.776262283325195, "learning_rate": 2.899199679871949e-05, "loss": 0.6092, "mean_token_accuracy": 0.8322685539722443, "num_tokens": 545145441.0, "step": 52550 }, { "entropy": 0.6619426250457764, "epoch": 0.42048, "grad_norm": 1.3910280466079712, "learning_rate": 2.898799519807923e-05, "loss": 0.6626, "mean_token_accuracy": 0.7906387448310852, "num_tokens": 545309281.0, "step": 52560 }, { "entropy": 0.7767057180404663, "epoch": 0.42056, "grad_norm": 3.415855646133423, "learning_rate": 2.8983993597438975e-05, "loss": 0.7742, "mean_token_accuracy": 0.782163143157959, "num_tokens": 545398453.0, "step": 52570 }, { "entropy": 0.7711441934108734, "epoch": 0.42064, "grad_norm": 2.8683698177337646, "learning_rate": 2.8979991996798722e-05, "loss": 0.7557, "mean_token_accuracy": 0.788657134771347, "num_tokens": 545492779.0, "step": 52580 }, { "entropy": 0.6581938654184342, "epoch": 0.42072, "grad_norm": 2.5055549144744873, "learning_rate": 2.8975990396158466e-05, "loss": 0.6575, "mean_token_accuracy": 0.7985273540019989, "num_tokens": 545631051.0, "step": 52590 }, { "entropy": 0.7002320170402527, "epoch": 0.4208, "grad_norm": 4.418627738952637, "learning_rate": 2.8971988795518206e-05, "loss": 0.7061, "mean_token_accuracy": 0.8099491357803345, "num_tokens": 545667786.0, "step": 52600 }, { "entropy": 0.6417409479618073, "epoch": 0.42088, "grad_norm": 3.0885865688323975, "learning_rate": 2.8967987194877953e-05, "loss": 0.6395, "mean_token_accuracy": 0.7963116824626922, "num_tokens": 545831626.0, "step": 52610 }, { "entropy": 0.6612623870372772, "epoch": 0.42096, "grad_norm": 3.9894161224365234, "learning_rate": 2.8963985594237697e-05, "loss": 0.6462, "mean_token_accuracy": 0.8130492925643921, "num_tokens": 545915147.0, "step": 52620 }, { "entropy": 0.702339380979538, "epoch": 0.42104, "grad_norm": 2.8094632625579834, "learning_rate": 2.895998399359744e-05, "loss": 0.7011, "mean_token_accuracy": 0.8021171391010284, "num_tokens": 546009106.0, "step": 52630 }, { "entropy": 0.7502898097038269, "epoch": 0.42112, "grad_norm": 2.8263039588928223, "learning_rate": 2.895598239295718e-05, "loss": 0.7554, "mean_token_accuracy": 0.7794299900531769, "num_tokens": 546141649.0, "step": 52640 }, { "entropy": 0.693969550728798, "epoch": 0.4212, "grad_norm": 6.400012493133545, "learning_rate": 2.8951980792316928e-05, "loss": 0.6853, "mean_token_accuracy": 0.8178661584854126, "num_tokens": 546176649.0, "step": 52650 }, { "entropy": 0.6198349237442017, "epoch": 0.42128, "grad_norm": 1.7976640462875366, "learning_rate": 2.8947979191676672e-05, "loss": 0.6157, "mean_token_accuracy": 0.8032563149929046, "num_tokens": 546339913.0, "step": 52660 }, { "entropy": 0.6204670071601868, "epoch": 0.42136, "grad_norm": 3.0942840576171875, "learning_rate": 2.8943977591036416e-05, "loss": 0.6164, "mean_token_accuracy": 0.8199349105358124, "num_tokens": 546418712.0, "step": 52670 }, { "entropy": 0.6496906876564026, "epoch": 0.42144, "grad_norm": 2.0385501384735107, "learning_rate": 2.8939975990396163e-05, "loss": 0.6439, "mean_token_accuracy": 0.8125462234020233, "num_tokens": 546512932.0, "step": 52680 }, { "entropy": 0.7004411578178406, "epoch": 0.42152, "grad_norm": 2.758690357208252, "learning_rate": 2.8935974389755903e-05, "loss": 0.7021, "mean_token_accuracy": 0.7879583477973938, "num_tokens": 546647290.0, "step": 52690 }, { "entropy": 0.6892888575792313, "epoch": 0.4216, "grad_norm": 4.094995498657227, "learning_rate": 2.8931972789115647e-05, "loss": 0.6734, "mean_token_accuracy": 0.8191736578941345, "num_tokens": 546684743.0, "step": 52700 }, { "entropy": 0.6397799670696258, "epoch": 0.42168, "grad_norm": 2.0344674587249756, "learning_rate": 2.892797118847539e-05, "loss": 0.633, "mean_token_accuracy": 0.8013128876686096, "num_tokens": 546848583.0, "step": 52710 }, { "entropy": 0.7449756443500519, "epoch": 0.42176, "grad_norm": 2.513418674468994, "learning_rate": 2.8923969587835138e-05, "loss": 0.7437, "mean_token_accuracy": 0.7846182703971862, "num_tokens": 546952986.0, "step": 52720 }, { "entropy": 0.7781567752361298, "epoch": 0.42184, "grad_norm": 1.609905481338501, "learning_rate": 2.8919967987194878e-05, "loss": 0.7863, "mean_token_accuracy": 0.7834672152996063, "num_tokens": 547049407.0, "step": 52730 }, { "entropy": 0.7460554003715515, "epoch": 0.42192, "grad_norm": 2.6111466884613037, "learning_rate": 2.891596638655462e-05, "loss": 0.7372, "mean_token_accuracy": 0.7871830642223359, "num_tokens": 547174173.0, "step": 52740 }, { "entropy": 0.7148799777030945, "epoch": 0.422, "grad_norm": 4.422404766082764, "learning_rate": 2.891196478591437e-05, "loss": 0.7078, "mean_token_accuracy": 0.8121344447135925, "num_tokens": 547205053.0, "step": 52750 }, { "entropy": 0.6454868912696838, "epoch": 0.42208, "grad_norm": 1.6617252826690674, "learning_rate": 2.8907963185274112e-05, "loss": 0.6483, "mean_token_accuracy": 0.7954323470592499, "num_tokens": 547368893.0, "step": 52760 }, { "entropy": 0.7651320278644562, "epoch": 0.42216, "grad_norm": 3.868079900741577, "learning_rate": 2.8903961584633853e-05, "loss": 0.7681, "mean_token_accuracy": 0.7930985510349273, "num_tokens": 547445661.0, "step": 52770 }, { "entropy": 0.7120117604732513, "epoch": 0.42224, "grad_norm": 2.364811897277832, "learning_rate": 2.8899959983993596e-05, "loss": 0.7023, "mean_token_accuracy": 0.7993740022182465, "num_tokens": 547540488.0, "step": 52780 }, { "entropy": 0.6898960649967194, "epoch": 0.42232, "grad_norm": 3.0984270572662354, "learning_rate": 2.8895958383353343e-05, "loss": 0.6811, "mean_token_accuracy": 0.7975174844264984, "num_tokens": 547669309.0, "step": 52790 }, { "entropy": 0.6939249366521836, "epoch": 0.4224, "grad_norm": 5.949067115783691, "learning_rate": 2.8891956782713087e-05, "loss": 0.6973, "mean_token_accuracy": 0.8135190844535828, "num_tokens": 547704368.0, "step": 52800 }, { "entropy": 0.660643070936203, "epoch": 0.42248, "grad_norm": 2.217099905014038, "learning_rate": 2.8887955182072828e-05, "loss": 0.654, "mean_token_accuracy": 0.7981343686580658, "num_tokens": 547868149.0, "step": 52810 }, { "entropy": 0.7010912418365478, "epoch": 0.42256, "grad_norm": 3.55771803855896, "learning_rate": 2.8883953581432578e-05, "loss": 0.6965, "mean_token_accuracy": 0.8042058229446412, "num_tokens": 547945489.0, "step": 52820 }, { "entropy": 0.6751731395721435, "epoch": 0.42264, "grad_norm": 1.5244596004486084, "learning_rate": 2.887995198079232e-05, "loss": 0.6738, "mean_token_accuracy": 0.8017697095870971, "num_tokens": 548040385.0, "step": 52830 }, { "entropy": 0.7158174633979797, "epoch": 0.42272, "grad_norm": 2.2944815158843994, "learning_rate": 2.8875950380152062e-05, "loss": 0.7146, "mean_token_accuracy": 0.7819898903369904, "num_tokens": 548184116.0, "step": 52840 }, { "entropy": 0.6379714697599411, "epoch": 0.4228, "grad_norm": 5.608603000640869, "learning_rate": 2.8871948779511802e-05, "loss": 0.6223, "mean_token_accuracy": 0.8285577535629273, "num_tokens": 548223900.0, "step": 52850 }, { "entropy": 0.6703892290592194, "epoch": 0.42288, "grad_norm": 2.0835623741149902, "learning_rate": 2.8867947178871553e-05, "loss": 0.6644, "mean_token_accuracy": 0.7914898157119751, "num_tokens": 548387498.0, "step": 52860 }, { "entropy": 0.6845916867256164, "epoch": 0.42296, "grad_norm": 2.938324451446533, "learning_rate": 2.8863945578231293e-05, "loss": 0.6845, "mean_token_accuracy": 0.8048830568790436, "num_tokens": 548474269.0, "step": 52870 }, { "entropy": 0.7597257375717164, "epoch": 0.42304, "grad_norm": 1.967324137687683, "learning_rate": 2.8859943977591037e-05, "loss": 0.761, "mean_token_accuracy": 0.7833857059478759, "num_tokens": 548568265.0, "step": 52880 }, { "entropy": 0.6629722952842713, "epoch": 0.42312, "grad_norm": 2.4133224487304688, "learning_rate": 2.8855942376950784e-05, "loss": 0.6518, "mean_token_accuracy": 0.8016808509826661, "num_tokens": 548701509.0, "step": 52890 }, { "entropy": 0.6979192495346069, "epoch": 0.4232, "grad_norm": 4.605630397796631, "learning_rate": 2.8851940776310528e-05, "loss": 0.6871, "mean_token_accuracy": 0.8176469027996063, "num_tokens": 548735180.0, "step": 52900 }, { "entropy": 0.6676866888999939, "epoch": 0.42328, "grad_norm": 2.2519545555114746, "learning_rate": 2.8847939175670268e-05, "loss": 0.6735, "mean_token_accuracy": 0.7909562766551972, "num_tokens": 548899020.0, "step": 52910 }, { "entropy": 0.6288916498422623, "epoch": 0.42336, "grad_norm": 3.4690494537353516, "learning_rate": 2.8843937575030012e-05, "loss": 0.6207, "mean_token_accuracy": 0.8217413187026977, "num_tokens": 548982857.0, "step": 52920 }, { "entropy": 0.7429714143276215, "epoch": 0.42344, "grad_norm": 1.5527597665786743, "learning_rate": 2.883993597438976e-05, "loss": 0.7454, "mean_token_accuracy": 0.7926325380802155, "num_tokens": 549075246.0, "step": 52930 }, { "entropy": 0.7065321624279022, "epoch": 0.42352, "grad_norm": 2.523735523223877, "learning_rate": 2.8835934373749503e-05, "loss": 0.7007, "mean_token_accuracy": 0.7916175544261932, "num_tokens": 549207523.0, "step": 52940 }, { "entropy": 0.6952352523803711, "epoch": 0.4236, "grad_norm": 5.54334831237793, "learning_rate": 2.8831932773109243e-05, "loss": 0.6842, "mean_token_accuracy": 0.8129722774028778, "num_tokens": 549244759.0, "step": 52950 }, { "entropy": 0.6472134113311767, "epoch": 0.42368, "grad_norm": 1.3894044160842896, "learning_rate": 2.882793117246899e-05, "loss": 0.6521, "mean_token_accuracy": 0.7936919867992401, "num_tokens": 549408599.0, "step": 52960 }, { "entropy": 0.6686941295862198, "epoch": 0.42376, "grad_norm": 3.596846342086792, "learning_rate": 2.8823929571828734e-05, "loss": 0.6623, "mean_token_accuracy": 0.8078998029232025, "num_tokens": 549497897.0, "step": 52970 }, { "entropy": 0.6753411710262298, "epoch": 0.42384, "grad_norm": 1.4833260774612427, "learning_rate": 2.8819927971188477e-05, "loss": 0.6717, "mean_token_accuracy": 0.8055395305156707, "num_tokens": 549591484.0, "step": 52980 }, { "entropy": 0.7458774268627166, "epoch": 0.42392, "grad_norm": 2.4141321182250977, "learning_rate": 2.8815926370548218e-05, "loss": 0.7404, "mean_token_accuracy": 0.7804592311382293, "num_tokens": 549722807.0, "step": 52990 }, { "entropy": 0.6455355167388916, "epoch": 0.424, "grad_norm": 4.272228240966797, "learning_rate": 2.8811924769907965e-05, "loss": 0.6423, "mean_token_accuracy": 0.8272695362567901, "num_tokens": 549762205.0, "step": 53000 }, { "entropy": 0.6820107460021972, "epoch": 0.42408, "grad_norm": 1.8875868320465088, "learning_rate": 2.880792316926771e-05, "loss": 0.6807, "mean_token_accuracy": 0.7869848251342774, "num_tokens": 549924698.0, "step": 53010 }, { "entropy": 0.6736159563064575, "epoch": 0.42416, "grad_norm": 4.636483669281006, "learning_rate": 2.8803921568627452e-05, "loss": 0.6546, "mean_token_accuracy": 0.8132243096828461, "num_tokens": 549990063.0, "step": 53020 }, { "entropy": 0.7033023297786712, "epoch": 0.42424, "grad_norm": 2.7002596855163574, "learning_rate": 2.87999199679872e-05, "loss": 0.7153, "mean_token_accuracy": 0.8040586471557617, "num_tokens": 550080666.0, "step": 53030 }, { "entropy": 0.6647043883800506, "epoch": 0.42432, "grad_norm": 2.445175886154175, "learning_rate": 2.879591836734694e-05, "loss": 0.6549, "mean_token_accuracy": 0.7981244504451752, "num_tokens": 550222502.0, "step": 53040 }, { "entropy": 0.6603065103292465, "epoch": 0.4244, "grad_norm": 5.009878158569336, "learning_rate": 2.8791916766706683e-05, "loss": 0.6668, "mean_token_accuracy": 0.8165116310119629, "num_tokens": 550261644.0, "step": 53050 }, { "entropy": 0.6572042286396027, "epoch": 0.42448, "grad_norm": 1.8705767393112183, "learning_rate": 2.8787915166066427e-05, "loss": 0.6632, "mean_token_accuracy": 0.7876904010772705, "num_tokens": 550424971.0, "step": 53060 }, { "entropy": 0.6666522085666656, "epoch": 0.42456, "grad_norm": 3.258152723312378, "learning_rate": 2.8783913565426174e-05, "loss": 0.6505, "mean_token_accuracy": 0.815691202878952, "num_tokens": 550497988.0, "step": 53070 }, { "entropy": 0.6731131374835968, "epoch": 0.42464, "grad_norm": 2.0217227935791016, "learning_rate": 2.8779911964785915e-05, "loss": 0.6709, "mean_token_accuracy": 0.8058754205703735, "num_tokens": 550591472.0, "step": 53080 }, { "entropy": 0.7098927557468414, "epoch": 0.42472, "grad_norm": 2.469550132751465, "learning_rate": 2.8775910364145658e-05, "loss": 0.709, "mean_token_accuracy": 0.7891478478908539, "num_tokens": 550730164.0, "step": 53090 }, { "entropy": 0.6857457816600799, "epoch": 0.4248, "grad_norm": 4.648335933685303, "learning_rate": 2.8771908763505402e-05, "loss": 0.6775, "mean_token_accuracy": 0.8145728766918182, "num_tokens": 550775073.0, "step": 53100 }, { "entropy": 0.6414927303791046, "epoch": 0.42488, "grad_norm": 1.9317283630371094, "learning_rate": 2.876790716286515e-05, "loss": 0.6404, "mean_token_accuracy": 0.8009524524211884, "num_tokens": 550938908.0, "step": 53110 }, { "entropy": 0.795388001203537, "epoch": 0.42496, "grad_norm": 4.2272491455078125, "learning_rate": 2.876390556222489e-05, "loss": 0.7902, "mean_token_accuracy": 0.7814200758934021, "num_tokens": 551020080.0, "step": 53120 }, { "entropy": 0.737438690662384, "epoch": 0.42504, "grad_norm": 1.8823684453964233, "learning_rate": 2.8759903961584633e-05, "loss": 0.7149, "mean_token_accuracy": 0.7881715714931488, "num_tokens": 551116188.0, "step": 53130 }, { "entropy": 0.672093254327774, "epoch": 0.42512, "grad_norm": 2.6900382041931152, "learning_rate": 2.875590236094438e-05, "loss": 0.6721, "mean_token_accuracy": 0.7941788017749787, "num_tokens": 551251975.0, "step": 53140 }, { "entropy": 0.6705451548099518, "epoch": 0.4252, "grad_norm": 5.057597637176514, "learning_rate": 2.8751900760304124e-05, "loss": 0.6792, "mean_token_accuracy": 0.8163677155971527, "num_tokens": 551287067.0, "step": 53150 }, { "entropy": 0.662723571062088, "epoch": 0.42528, "grad_norm": 2.015564203262329, "learning_rate": 2.8747899159663864e-05, "loss": 0.6615, "mean_token_accuracy": 0.793753057718277, "num_tokens": 551450907.0, "step": 53160 }, { "entropy": 0.6226648330688477, "epoch": 0.42536, "grad_norm": 2.986586332321167, "learning_rate": 2.8743897559023608e-05, "loss": 0.6087, "mean_token_accuracy": 0.8192445397377014, "num_tokens": 551539092.0, "step": 53170 }, { "entropy": 0.6613687455654145, "epoch": 0.42544, "grad_norm": 1.8425579071044922, "learning_rate": 2.8739895958383355e-05, "loss": 0.6623, "mean_token_accuracy": 0.8044004142284393, "num_tokens": 551634610.0, "step": 53180 }, { "entropy": 0.7104377627372742, "epoch": 0.42552, "grad_norm": 2.061981678009033, "learning_rate": 2.87358943577431e-05, "loss": 0.709, "mean_token_accuracy": 0.785182785987854, "num_tokens": 551775708.0, "step": 53190 }, { "entropy": 0.5881969004869461, "epoch": 0.4256, "grad_norm": 4.1873393058776855, "learning_rate": 2.873189275710284e-05, "loss": 0.5749, "mean_token_accuracy": 0.838344132900238, "num_tokens": 551815838.0, "step": 53200 }, { "entropy": 0.664241474866867, "epoch": 0.42568, "grad_norm": 1.4175128936767578, "learning_rate": 2.872789115646259e-05, "loss": 0.662, "mean_token_accuracy": 0.7932075738906861, "num_tokens": 551979651.0, "step": 53210 }, { "entropy": 0.7311245858669281, "epoch": 0.42576, "grad_norm": 2.396296501159668, "learning_rate": 2.872388955582233e-05, "loss": 0.7263, "mean_token_accuracy": 0.7978987216949462, "num_tokens": 552063689.0, "step": 53220 }, { "entropy": 0.7318383514881134, "epoch": 0.42584, "grad_norm": 1.551737666130066, "learning_rate": 2.8719887955182074e-05, "loss": 0.7482, "mean_token_accuracy": 0.7887659907341004, "num_tokens": 552157502.0, "step": 53230 }, { "entropy": 0.7274752736091614, "epoch": 0.42592, "grad_norm": 3.724339008331299, "learning_rate": 2.8715886354541814e-05, "loss": 0.7238, "mean_token_accuracy": 0.7815745174884796, "num_tokens": 552300915.0, "step": 53240 }, { "entropy": 0.7703506499528885, "epoch": 0.426, "grad_norm": 5.27943229675293, "learning_rate": 2.8711884753901564e-05, "loss": 0.7525, "mean_token_accuracy": 0.798606526851654, "num_tokens": 552341403.0, "step": 53250 }, { "entropy": 0.7322824835777283, "epoch": 0.42608, "grad_norm": 2.616551637649536, "learning_rate": 2.8707883153261305e-05, "loss": 0.73, "mean_token_accuracy": 0.7759098827838897, "num_tokens": 552505243.0, "step": 53260 }, { "entropy": 0.6231567859649658, "epoch": 0.42616, "grad_norm": 3.614980459213257, "learning_rate": 2.870388155262105e-05, "loss": 0.6115, "mean_token_accuracy": 0.8198675215244293, "num_tokens": 552592115.0, "step": 53270 }, { "entropy": 0.7037962973117828, "epoch": 0.42624, "grad_norm": 2.0693655014038086, "learning_rate": 2.8699879951980796e-05, "loss": 0.7174, "mean_token_accuracy": 0.7937294185161591, "num_tokens": 552687626.0, "step": 53280 }, { "entropy": 0.7232131749391556, "epoch": 0.42632, "grad_norm": 2.056196928024292, "learning_rate": 2.869587835134054e-05, "loss": 0.7112, "mean_token_accuracy": 0.789281690120697, "num_tokens": 552822466.0, "step": 53290 }, { "entropy": 0.6236946970224381, "epoch": 0.4264, "grad_norm": 4.588089466094971, "learning_rate": 2.869187675070028e-05, "loss": 0.6312, "mean_token_accuracy": 0.8314584732055664, "num_tokens": 552861498.0, "step": 53300 }, { "entropy": 0.6613345474004746, "epoch": 0.42648, "grad_norm": 1.8741931915283203, "learning_rate": 2.8687875150060023e-05, "loss": 0.6502, "mean_token_accuracy": 0.7928977608680725, "num_tokens": 553024748.0, "step": 53310 }, { "entropy": 0.7627342045307159, "epoch": 0.42656, "grad_norm": 3.2097349166870117, "learning_rate": 2.868387354941977e-05, "loss": 0.7443, "mean_token_accuracy": 0.7916028916835784, "num_tokens": 553102719.0, "step": 53320 }, { "entropy": 0.6924751162528991, "epoch": 0.42664, "grad_norm": 1.4409745931625366, "learning_rate": 2.8679871948779514e-05, "loss": 0.6945, "mean_token_accuracy": 0.7978807508945465, "num_tokens": 553195701.0, "step": 53330 }, { "entropy": 0.6512379467487335, "epoch": 0.42672, "grad_norm": 4.621510028839111, "learning_rate": 2.8675870348139255e-05, "loss": 0.6443, "mean_token_accuracy": 0.8035091876983642, "num_tokens": 553318025.0, "step": 53340 }, { "entropy": 0.7375192880630493, "epoch": 0.4268, "grad_norm": 5.644539833068848, "learning_rate": 2.8671868747499e-05, "loss": 0.7281, "mean_token_accuracy": 0.8066483259201049, "num_tokens": 553352248.0, "step": 53350 }, { "entropy": 0.6538298785686493, "epoch": 0.42688, "grad_norm": 1.9416894912719727, "learning_rate": 2.8667867146858745e-05, "loss": 0.6587, "mean_token_accuracy": 0.7953041136264801, "num_tokens": 553516088.0, "step": 53360 }, { "entropy": 0.6787088274955749, "epoch": 0.42696, "grad_norm": 3.2128589153289795, "learning_rate": 2.866386554621849e-05, "loss": 0.6615, "mean_token_accuracy": 0.804832273721695, "num_tokens": 553615491.0, "step": 53370 }, { "entropy": 0.6173387885093689, "epoch": 0.42704, "grad_norm": 1.7866263389587402, "learning_rate": 2.865986394557823e-05, "loss": 0.6157, "mean_token_accuracy": 0.8206899046897889, "num_tokens": 553711097.0, "step": 53380 }, { "entropy": 0.6669993042945862, "epoch": 0.42712, "grad_norm": 2.1256909370422363, "learning_rate": 2.8655862344937976e-05, "loss": 0.6635, "mean_token_accuracy": 0.795079094171524, "num_tokens": 553836069.0, "step": 53390 }, { "entropy": 0.6928116381168365, "epoch": 0.4272, "grad_norm": 4.1580047607421875, "learning_rate": 2.865186074429772e-05, "loss": 0.6816, "mean_token_accuracy": 0.821675568819046, "num_tokens": 553870697.0, "step": 53400 }, { "entropy": 0.6472295939922332, "epoch": 0.42728, "grad_norm": 1.8862390518188477, "learning_rate": 2.8647859143657464e-05, "loss": 0.6464, "mean_token_accuracy": 0.7948705434799195, "num_tokens": 554034537.0, "step": 53410 }, { "entropy": 0.5532104820013046, "epoch": 0.42736, "grad_norm": 2.9337193965911865, "learning_rate": 2.864385754301721e-05, "loss": 0.5445, "mean_token_accuracy": 0.8368477165699005, "num_tokens": 554120574.0, "step": 53420 }, { "entropy": 0.6883225262165069, "epoch": 0.42744, "grad_norm": 1.8504483699798584, "learning_rate": 2.863985594237695e-05, "loss": 0.6713, "mean_token_accuracy": 0.8050265431404113, "num_tokens": 554214402.0, "step": 53430 }, { "entropy": 0.6710894882678986, "epoch": 0.42752, "grad_norm": 2.129467487335205, "learning_rate": 2.8635854341736695e-05, "loss": 0.6794, "mean_token_accuracy": 0.7913025677204132, "num_tokens": 554362012.0, "step": 53440 }, { "entropy": 0.666138517856598, "epoch": 0.4276, "grad_norm": 4.881961345672607, "learning_rate": 2.863185274109644e-05, "loss": 0.6518, "mean_token_accuracy": 0.8229348540306092, "num_tokens": 554400455.0, "step": 53450 }, { "entropy": 0.6500069320201873, "epoch": 0.42768, "grad_norm": 1.688261866569519, "learning_rate": 2.8627851140456186e-05, "loss": 0.6463, "mean_token_accuracy": 0.7946140706539154, "num_tokens": 554564295.0, "step": 53460 }, { "entropy": 0.6255737781524658, "epoch": 0.42776, "grad_norm": 2.9671924114227295, "learning_rate": 2.8623849539815926e-05, "loss": 0.6196, "mean_token_accuracy": 0.8162643849849701, "num_tokens": 554658313.0, "step": 53470 }, { "entropy": 0.6546736896038056, "epoch": 0.42784, "grad_norm": 2.284956693649292, "learning_rate": 2.861984793917567e-05, "loss": 0.641, "mean_token_accuracy": 0.8100716650485993, "num_tokens": 554753308.0, "step": 53480 }, { "entropy": 0.6697166681289672, "epoch": 0.42792, "grad_norm": 2.109011173248291, "learning_rate": 2.8615846338535417e-05, "loss": 0.6781, "mean_token_accuracy": 0.7958092033863068, "num_tokens": 554892285.0, "step": 53490 }, { "entropy": 0.6853932678699494, "epoch": 0.428, "grad_norm": 4.519793510437012, "learning_rate": 2.861184473789516e-05, "loss": 0.6751, "mean_token_accuracy": 0.8212285935878754, "num_tokens": 554933226.0, "step": 53500 }, { "entropy": 0.6569585084915162, "epoch": 0.42808, "grad_norm": 2.791949987411499, "learning_rate": 2.86078431372549e-05, "loss": 0.6533, "mean_token_accuracy": 0.7947850525379181, "num_tokens": 555097066.0, "step": 53510 }, { "entropy": 0.7174835741519928, "epoch": 0.42816, "grad_norm": 3.4746291637420654, "learning_rate": 2.8603841536614645e-05, "loss": 0.7019, "mean_token_accuracy": 0.7982165217399597, "num_tokens": 555187926.0, "step": 53520 }, { "entropy": 0.7244193971157074, "epoch": 0.42824, "grad_norm": 1.4844813346862793, "learning_rate": 2.8599839935974392e-05, "loss": 0.7303, "mean_token_accuracy": 0.78880655169487, "num_tokens": 555281477.0, "step": 53530 }, { "entropy": 0.6674659252166748, "epoch": 0.42832, "grad_norm": 2.7447755336761475, "learning_rate": 2.8595838335334136e-05, "loss": 0.6568, "mean_token_accuracy": 0.8004979193210602, "num_tokens": 555424153.0, "step": 53540 }, { "entropy": 0.6483856201171875, "epoch": 0.4284, "grad_norm": 4.689212799072266, "learning_rate": 2.8591836734693876e-05, "loss": 0.6557, "mean_token_accuracy": 0.8192518055438995, "num_tokens": 555470220.0, "step": 53550 }, { "entropy": 0.6160064756870269, "epoch": 0.42848, "grad_norm": 2.4246609210968018, "learning_rate": 2.8587835134053626e-05, "loss": 0.6097, "mean_token_accuracy": 0.8066805064678192, "num_tokens": 555634060.0, "step": 53560 }, { "entropy": 0.75473872423172, "epoch": 0.42856, "grad_norm": 2.702657699584961, "learning_rate": 2.8583833533413367e-05, "loss": 0.7422, "mean_token_accuracy": 0.7929945290088654, "num_tokens": 555723008.0, "step": 53570 }, { "entropy": 0.6969464004039765, "epoch": 0.42864, "grad_norm": 1.7445975542068481, "learning_rate": 2.857983193277311e-05, "loss": 0.6995, "mean_token_accuracy": 0.7990360021591186, "num_tokens": 555818088.0, "step": 53580 }, { "entropy": 0.6981653809547425, "epoch": 0.42872, "grad_norm": 3.9060096740722656, "learning_rate": 2.857583033213285e-05, "loss": 0.6994, "mean_token_accuracy": 0.7925305128097534, "num_tokens": 555945093.0, "step": 53590 }, { "entropy": 0.6978337585926055, "epoch": 0.4288, "grad_norm": 5.064356803894043, "learning_rate": 2.85718287314926e-05, "loss": 0.6779, "mean_token_accuracy": 0.8143746137619019, "num_tokens": 555977983.0, "step": 53600 }, { "entropy": 0.65308758020401, "epoch": 0.42888, "grad_norm": 2.5903944969177246, "learning_rate": 2.856782713085234e-05, "loss": 0.6549, "mean_token_accuracy": 0.793234008550644, "num_tokens": 556141823.0, "step": 53610 }, { "entropy": 0.6701125502586365, "epoch": 0.42896, "grad_norm": 3.178912878036499, "learning_rate": 2.8563825530212085e-05, "loss": 0.6601, "mean_token_accuracy": 0.8073947608470917, "num_tokens": 556226128.0, "step": 53620 }, { "entropy": 0.7191742539405823, "epoch": 0.42904, "grad_norm": 1.760996699333191, "learning_rate": 2.8559823929571832e-05, "loss": 0.7071, "mean_token_accuracy": 0.7936659097671509, "num_tokens": 556321028.0, "step": 53630 }, { "entropy": 0.7024272382259369, "epoch": 0.42912, "grad_norm": 2.3529505729675293, "learning_rate": 2.8555822328931576e-05, "loss": 0.7069, "mean_token_accuracy": 0.7863725423812866, "num_tokens": 556454357.0, "step": 53640 }, { "entropy": 0.7131786406040191, "epoch": 0.4292, "grad_norm": 4.646273612976074, "learning_rate": 2.8551820728291316e-05, "loss": 0.7194, "mean_token_accuracy": 0.811264717578888, "num_tokens": 556491958.0, "step": 53650 }, { "entropy": 0.645288223028183, "epoch": 0.42928, "grad_norm": 2.59653377532959, "learning_rate": 2.854781912765106e-05, "loss": 0.6377, "mean_token_accuracy": 0.7960473597049713, "num_tokens": 556655000.0, "step": 53660 }, { "entropy": 0.7082450896501541, "epoch": 0.42936, "grad_norm": 3.4641590118408203, "learning_rate": 2.8543817527010807e-05, "loss": 0.7039, "mean_token_accuracy": 0.8055981814861297, "num_tokens": 556732089.0, "step": 53670 }, { "entropy": 0.7390750348567963, "epoch": 0.42944, "grad_norm": 2.7369086742401123, "learning_rate": 2.853981592637055e-05, "loss": 0.7427, "mean_token_accuracy": 0.7882510721683502, "num_tokens": 556826491.0, "step": 53680 }, { "entropy": 0.6406025290489197, "epoch": 0.42952, "grad_norm": 2.536424160003662, "learning_rate": 2.853581432573029e-05, "loss": 0.6259, "mean_token_accuracy": 0.8106443285942078, "num_tokens": 556948483.0, "step": 53690 }, { "entropy": 0.7125771701335907, "epoch": 0.4296, "grad_norm": 5.712788105010986, "learning_rate": 2.8531812725090035e-05, "loss": 0.7164, "mean_token_accuracy": 0.8140623152256012, "num_tokens": 556979446.0, "step": 53700 }, { "entropy": 0.6312325298786163, "epoch": 0.42968, "grad_norm": 1.5243337154388428, "learning_rate": 2.8527811124449782e-05, "loss": 0.6312, "mean_token_accuracy": 0.795945280790329, "num_tokens": 557143286.0, "step": 53710 }, { "entropy": 0.6659706890583038, "epoch": 0.42976, "grad_norm": 2.840195655822754, "learning_rate": 2.8523809523809526e-05, "loss": 0.663, "mean_token_accuracy": 0.8052142500877381, "num_tokens": 557240689.0, "step": 53720 }, { "entropy": 0.7669829368591309, "epoch": 0.42984, "grad_norm": 1.6515454053878784, "learning_rate": 2.8519807923169266e-05, "loss": 0.7525, "mean_token_accuracy": 0.7875296592712402, "num_tokens": 557335022.0, "step": 53730 }, { "entropy": 0.609719169139862, "epoch": 0.42992, "grad_norm": 2.4441335201263428, "learning_rate": 2.8515806322529017e-05, "loss": 0.6067, "mean_token_accuracy": 0.8100821137428283, "num_tokens": 557471843.0, "step": 53740 }, { "entropy": 0.7135925263166427, "epoch": 0.43, "grad_norm": 4.350500106811523, "learning_rate": 2.8511804721888757e-05, "loss": 0.707, "mean_token_accuracy": 0.8080021381378174, "num_tokens": 557507947.0, "step": 53750 }, { "entropy": 0.6346000969409943, "epoch": 0.43008, "grad_norm": 1.4151188135147095, "learning_rate": 2.85078031212485e-05, "loss": 0.6333, "mean_token_accuracy": 0.7987298429012298, "num_tokens": 557671787.0, "step": 53760 }, { "entropy": 0.6539320886135102, "epoch": 0.43016, "grad_norm": 4.98284912109375, "learning_rate": 2.850380152060824e-05, "loss": 0.6497, "mean_token_accuracy": 0.8137187778949737, "num_tokens": 557759077.0, "step": 53770 }, { "entropy": 0.7022192180156708, "epoch": 0.43024, "grad_norm": 2.1211564540863037, "learning_rate": 2.849979991996799e-05, "loss": 0.7122, "mean_token_accuracy": 0.7954435467720031, "num_tokens": 557853263.0, "step": 53780 }, { "entropy": 0.7147580027580261, "epoch": 0.43032, "grad_norm": 2.1214587688446045, "learning_rate": 2.8495798319327732e-05, "loss": 0.6938, "mean_token_accuracy": 0.7904074907302856, "num_tokens": 557992725.0, "step": 53790 }, { "entropy": 0.7096282809972763, "epoch": 0.4304, "grad_norm": 4.76278829574585, "learning_rate": 2.8491796718687476e-05, "loss": 0.706, "mean_token_accuracy": 0.8127110481262207, "num_tokens": 558026668.0, "step": 53800 }, { "entropy": 0.6287114560604096, "epoch": 0.43048, "grad_norm": 1.6036500930786133, "learning_rate": 2.8487795118047223e-05, "loss": 0.6354, "mean_token_accuracy": 0.7983940005302429, "num_tokens": 558190508.0, "step": 53810 }, { "entropy": 0.633879229426384, "epoch": 0.43056, "grad_norm": 2.8544259071350098, "learning_rate": 2.8483793517406966e-05, "loss": 0.6188, "mean_token_accuracy": 0.816365772485733, "num_tokens": 558291392.0, "step": 53820 }, { "entropy": 0.639908492565155, "epoch": 0.43064, "grad_norm": 1.4777811765670776, "learning_rate": 2.8479791916766707e-05, "loss": 0.6417, "mean_token_accuracy": 0.8136543393135071, "num_tokens": 558387913.0, "step": 53830 }, { "entropy": 0.7004949510097503, "epoch": 0.43072, "grad_norm": 3.1656107902526855, "learning_rate": 2.847579031612645e-05, "loss": 0.6879, "mean_token_accuracy": 0.7878770112991333, "num_tokens": 558532790.0, "step": 53840 }, { "entropy": 0.7606079936027527, "epoch": 0.4308, "grad_norm": 4.134767055511475, "learning_rate": 2.8471788715486197e-05, "loss": 0.7683, "mean_token_accuracy": 0.80091952085495, "num_tokens": 558574854.0, "step": 53850 }, { "entropy": 0.6772935032844544, "epoch": 0.43088, "grad_norm": 1.4700489044189453, "learning_rate": 2.846778711484594e-05, "loss": 0.6777, "mean_token_accuracy": 0.7867672264575958, "num_tokens": 558738694.0, "step": 53860 }, { "entropy": 0.7066800177097321, "epoch": 0.43096, "grad_norm": 3.1019093990325928, "learning_rate": 2.846378551420568e-05, "loss": 0.6991, "mean_token_accuracy": 0.7971839964389801, "num_tokens": 558831640.0, "step": 53870 }, { "entropy": 0.7147356927394867, "epoch": 0.43104, "grad_norm": 1.9740591049194336, "learning_rate": 2.845978391356543e-05, "loss": 0.7077, "mean_token_accuracy": 0.7998514890670776, "num_tokens": 558926387.0, "step": 53880 }, { "entropy": 0.6850273191928864, "epoch": 0.43112, "grad_norm": 2.6567225456237793, "learning_rate": 2.8455782312925172e-05, "loss": 0.6775, "mean_token_accuracy": 0.7915135383605957, "num_tokens": 559062025.0, "step": 53890 }, { "entropy": 0.6407431364059448, "epoch": 0.4312, "grad_norm": 4.534041404724121, "learning_rate": 2.8451780712284916e-05, "loss": 0.6415, "mean_token_accuracy": 0.8239154279232025, "num_tokens": 559099119.0, "step": 53900 }, { "entropy": 0.6486319184303284, "epoch": 0.43128, "grad_norm": 1.8570058345794678, "learning_rate": 2.8447779111644656e-05, "loss": 0.6488, "mean_token_accuracy": 0.7942818760871887, "num_tokens": 559256533.0, "step": 53910 }, { "entropy": 0.6862446784973144, "epoch": 0.43136, "grad_norm": 3.979266405105591, "learning_rate": 2.8443777511004403e-05, "loss": 0.6795, "mean_token_accuracy": 0.8064122200012207, "num_tokens": 559322210.0, "step": 53920 }, { "entropy": 0.6819309234619141, "epoch": 0.43144, "grad_norm": 1.6077598333358765, "learning_rate": 2.8439775910364147e-05, "loss": 0.6706, "mean_token_accuracy": 0.8090448141098022, "num_tokens": 559412924.0, "step": 53930 }, { "entropy": 0.6993845373392105, "epoch": 0.43152, "grad_norm": 3.5015645027160645, "learning_rate": 2.843577430972389e-05, "loss": 0.6959, "mean_token_accuracy": 0.7870662152767182, "num_tokens": 559553825.0, "step": 53940 }, { "entropy": 0.6746145099401474, "epoch": 0.4316, "grad_norm": 4.608726978302002, "learning_rate": 2.8431772709083638e-05, "loss": 0.6755, "mean_token_accuracy": 0.8133716762065888, "num_tokens": 559598534.0, "step": 53950 }, { "entropy": 0.6361712515354156, "epoch": 0.43168, "grad_norm": 1.527184009552002, "learning_rate": 2.842777110844338e-05, "loss": 0.6322, "mean_token_accuracy": 0.8002864360809326, "num_tokens": 559762103.0, "step": 53960 }, { "entropy": 0.7494345247745514, "epoch": 0.43176, "grad_norm": 4.046987056732178, "learning_rate": 2.8423769507803122e-05, "loss": 0.7381, "mean_token_accuracy": 0.7910093605518341, "num_tokens": 559847624.0, "step": 53970 }, { "entropy": 0.6836788296699524, "epoch": 0.43184, "grad_norm": 2.08978271484375, "learning_rate": 2.8419767907162866e-05, "loss": 0.674, "mean_token_accuracy": 0.8039055407047272, "num_tokens": 559942165.0, "step": 53980 }, { "entropy": 0.7051366031169891, "epoch": 0.43192, "grad_norm": 2.0574872493743896, "learning_rate": 2.8415766306522613e-05, "loss": 0.7052, "mean_token_accuracy": 0.7833358466625213, "num_tokens": 560088407.0, "step": 53990 }, { "entropy": 0.7363025784492493, "epoch": 0.432, "grad_norm": 4.440171718597412, "learning_rate": 2.8411764705882353e-05, "loss": 0.7236, "mean_token_accuracy": 0.8051105260848999, "num_tokens": 560129356.0, "step": 54000 }, { "entropy": 0.6446264207363128, "epoch": 0.43208, "grad_norm": 1.6075670719146729, "learning_rate": 2.8407763105242097e-05, "loss": 0.6414, "mean_token_accuracy": 0.7992916524410247, "num_tokens": 560293196.0, "step": 54010 }, { "entropy": 0.700880891084671, "epoch": 0.43216, "grad_norm": 3.995035171508789, "learning_rate": 2.8403761504601844e-05, "loss": 0.6915, "mean_token_accuracy": 0.8023451030254364, "num_tokens": 560386533.0, "step": 54020 }, { "entropy": 0.6753308713436127, "epoch": 0.43224, "grad_norm": 1.8888564109802246, "learning_rate": 2.8399759903961588e-05, "loss": 0.6754, "mean_token_accuracy": 0.8066798090934754, "num_tokens": 560481568.0, "step": 54030 }, { "entropy": 0.6631960690021514, "epoch": 0.43232, "grad_norm": 2.261530876159668, "learning_rate": 2.8395758303321328e-05, "loss": 0.6585, "mean_token_accuracy": 0.8006359994411468, "num_tokens": 560611821.0, "step": 54040 }, { "entropy": 0.7843233823776246, "epoch": 0.4324, "grad_norm": 5.192469120025635, "learning_rate": 2.8391756702681072e-05, "loss": 0.7915, "mean_token_accuracy": 0.7963722348213196, "num_tokens": 560644596.0, "step": 54050 }, { "entropy": 0.6939183235168457, "epoch": 0.43248, "grad_norm": 1.7328581809997559, "learning_rate": 2.838775510204082e-05, "loss": 0.6958, "mean_token_accuracy": 0.7834941446781158, "num_tokens": 560808436.0, "step": 54060 }, { "entropy": 0.6029542595148086, "epoch": 0.43256, "grad_norm": 3.924576997756958, "learning_rate": 2.8383753501400563e-05, "loss": 0.5916, "mean_token_accuracy": 0.8253691673278809, "num_tokens": 560894917.0, "step": 54070 }, { "entropy": 0.7272971451282502, "epoch": 0.43264, "grad_norm": 1.493873953819275, "learning_rate": 2.8379751900760303e-05, "loss": 0.7314, "mean_token_accuracy": 0.7921535491943359, "num_tokens": 560989943.0, "step": 54080 }, { "entropy": 0.6723476827144623, "epoch": 0.43272, "grad_norm": 2.1784465312957764, "learning_rate": 2.8375750300120053e-05, "loss": 0.6637, "mean_token_accuracy": 0.7971231043338776, "num_tokens": 561134064.0, "step": 54090 }, { "entropy": 0.6472405016422271, "epoch": 0.4328, "grad_norm": 5.064034938812256, "learning_rate": 2.8371748699479794e-05, "loss": 0.6479, "mean_token_accuracy": 0.8192877054214478, "num_tokens": 561178801.0, "step": 54100 }, { "entropy": 0.6599346399307251, "epoch": 0.43288, "grad_norm": 2.502291679382324, "learning_rate": 2.8367747098839537e-05, "loss": 0.6608, "mean_token_accuracy": 0.791084623336792, "num_tokens": 561342417.0, "step": 54110 }, { "entropy": 0.751614561676979, "epoch": 0.43296, "grad_norm": 3.7564697265625, "learning_rate": 2.8363745498199278e-05, "loss": 0.7376, "mean_token_accuracy": 0.7968299090862274, "num_tokens": 561423286.0, "step": 54120 }, { "entropy": 0.6843616247177124, "epoch": 0.43304, "grad_norm": 1.7874239683151245, "learning_rate": 2.8359743897559028e-05, "loss": 0.6709, "mean_token_accuracy": 0.8070968806743621, "num_tokens": 561516836.0, "step": 54130 }, { "entropy": 0.6942179262638092, "epoch": 0.43312, "grad_norm": 1.8873335123062134, "learning_rate": 2.835574229691877e-05, "loss": 0.6927, "mean_token_accuracy": 0.7861264288425446, "num_tokens": 561668446.0, "step": 54140 }, { "entropy": 0.6414608597755432, "epoch": 0.4332, "grad_norm": 3.9565465450286865, "learning_rate": 2.8351740696278512e-05, "loss": 0.6367, "mean_token_accuracy": 0.8253090739250183, "num_tokens": 561714199.0, "step": 54150 }, { "entropy": 0.6115969538688659, "epoch": 0.43328, "grad_norm": 1.578507900238037, "learning_rate": 2.834773909563826e-05, "loss": 0.6113, "mean_token_accuracy": 0.8047691762447358, "num_tokens": 561878039.0, "step": 54160 }, { "entropy": 0.6964025974273682, "epoch": 0.43336, "grad_norm": 2.9223732948303223, "learning_rate": 2.8343737494998003e-05, "loss": 0.6967, "mean_token_accuracy": 0.7999995589256287, "num_tokens": 561967572.0, "step": 54170 }, { "entropy": 0.7043707132339477, "epoch": 0.43344, "grad_norm": 1.7329152822494507, "learning_rate": 2.8339735894357743e-05, "loss": 0.6941, "mean_token_accuracy": 0.8017815411090851, "num_tokens": 562061393.0, "step": 54180 }, { "entropy": 0.6504238963127136, "epoch": 0.43352, "grad_norm": 2.130033016204834, "learning_rate": 2.8335734293717487e-05, "loss": 0.6415, "mean_token_accuracy": 0.7992504179477692, "num_tokens": 562197851.0, "step": 54190 }, { "entropy": 0.7346269279718399, "epoch": 0.4336, "grad_norm": 4.6009368896484375, "learning_rate": 2.8331732693077234e-05, "loss": 0.7332, "mean_token_accuracy": 0.8043490529060364, "num_tokens": 562238752.0, "step": 54200 }, { "entropy": 0.6368505358695984, "epoch": 0.43368, "grad_norm": 1.7222957611083984, "learning_rate": 2.8327731092436978e-05, "loss": 0.6388, "mean_token_accuracy": 0.7976296782493592, "num_tokens": 562400449.0, "step": 54210 }, { "entropy": 0.6056838452816009, "epoch": 0.43376, "grad_norm": 3.3735265731811523, "learning_rate": 2.8323729491796718e-05, "loss": 0.6045, "mean_token_accuracy": 0.8261923730373383, "num_tokens": 562473724.0, "step": 54220 }, { "entropy": 0.6826184272766114, "epoch": 0.43384, "grad_norm": 2.092472553253174, "learning_rate": 2.8319727891156462e-05, "loss": 0.6907, "mean_token_accuracy": 0.8005540847778321, "num_tokens": 562566978.0, "step": 54230 }, { "entropy": 0.6770190954208374, "epoch": 0.43392, "grad_norm": 3.12219500541687, "learning_rate": 2.831572629051621e-05, "loss": 0.6687, "mean_token_accuracy": 0.7923814952373505, "num_tokens": 562705395.0, "step": 54240 }, { "entropy": 0.5777842283248902, "epoch": 0.434, "grad_norm": 4.2152557373046875, "learning_rate": 2.8311724689875953e-05, "loss": 0.578, "mean_token_accuracy": 0.8381675124168396, "num_tokens": 562742099.0, "step": 54250 }, { "entropy": 0.6369244515895843, "epoch": 0.43408, "grad_norm": 1.6667494773864746, "learning_rate": 2.8307723089235693e-05, "loss": 0.6308, "mean_token_accuracy": 0.7983577072620391, "num_tokens": 562905929.0, "step": 54260 }, { "entropy": 0.6809191644191742, "epoch": 0.43416, "grad_norm": 2.8639719486236572, "learning_rate": 2.830372148859544e-05, "loss": 0.6903, "mean_token_accuracy": 0.8065424084663391, "num_tokens": 562985519.0, "step": 54270 }, { "entropy": 0.6799829781055451, "epoch": 0.43424, "grad_norm": 2.1968162059783936, "learning_rate": 2.8299719887955184e-05, "loss": 0.6726, "mean_token_accuracy": 0.8052019894123077, "num_tokens": 563078480.0, "step": 54280 }, { "entropy": 0.7454510569572449, "epoch": 0.43432, "grad_norm": 2.46195650100708, "learning_rate": 2.8295718287314928e-05, "loss": 0.7361, "mean_token_accuracy": 0.7800271093845368, "num_tokens": 563217909.0, "step": 54290 }, { "entropy": 0.7287124574184418, "epoch": 0.4344, "grad_norm": 5.2334184646606445, "learning_rate": 2.8291716686674668e-05, "loss": 0.7251, "mean_token_accuracy": 0.8061529576778412, "num_tokens": 563253083.0, "step": 54300 }, { "entropy": 0.6429554164409638, "epoch": 0.43448, "grad_norm": 2.126836061477661, "learning_rate": 2.8287715086034415e-05, "loss": 0.647, "mean_token_accuracy": 0.7960876286029815, "num_tokens": 563416795.0, "step": 54310 }, { "entropy": 0.6533930689096451, "epoch": 0.43456, "grad_norm": 3.0259318351745605, "learning_rate": 2.828371348539416e-05, "loss": 0.6447, "mean_token_accuracy": 0.815314245223999, "num_tokens": 563500304.0, "step": 54320 }, { "entropy": 0.6646231532096862, "epoch": 0.43464, "grad_norm": 1.6870800256729126, "learning_rate": 2.8279711884753902e-05, "loss": 0.6726, "mean_token_accuracy": 0.8075805246829987, "num_tokens": 563595458.0, "step": 54330 }, { "entropy": 0.7264810681343079, "epoch": 0.43472, "grad_norm": 2.81884765625, "learning_rate": 2.827571028411365e-05, "loss": 0.7195, "mean_token_accuracy": 0.7846909463405609, "num_tokens": 563722711.0, "step": 54340 }, { "entropy": 0.7215853273868561, "epoch": 0.4348, "grad_norm": 4.203418254852295, "learning_rate": 2.827170868347339e-05, "loss": 0.6933, "mean_token_accuracy": 0.8178810775279999, "num_tokens": 563756586.0, "step": 54350 }, { "entropy": 0.6727319955825806, "epoch": 0.43488, "grad_norm": 1.5142134428024292, "learning_rate": 2.8267707082833134e-05, "loss": 0.6731, "mean_token_accuracy": 0.7897044479846954, "num_tokens": 563920426.0, "step": 54360 }, { "entropy": 0.5981236129999161, "epoch": 0.43496, "grad_norm": 2.258087635040283, "learning_rate": 2.8263705482192877e-05, "loss": 0.5932, "mean_token_accuracy": 0.8276039302349091, "num_tokens": 564006942.0, "step": 54370 }, { "entropy": 0.8042637228965759, "epoch": 0.43504, "grad_norm": 2.5228710174560547, "learning_rate": 2.8259703881552624e-05, "loss": 0.8069, "mean_token_accuracy": 0.7767291724681854, "num_tokens": 564101624.0, "step": 54380 }, { "entropy": 0.6997182548046113, "epoch": 0.43512, "grad_norm": 2.4034619331359863, "learning_rate": 2.8255702280912365e-05, "loss": 0.6926, "mean_token_accuracy": 0.7873366117477417, "num_tokens": 564239073.0, "step": 54390 }, { "entropy": 0.6653899729251862, "epoch": 0.4352, "grad_norm": 3.9591174125671387, "learning_rate": 2.825170068027211e-05, "loss": 0.6538, "mean_token_accuracy": 0.8228933155536652, "num_tokens": 564275584.0, "step": 54400 }, { "entropy": 0.6498513340950012, "epoch": 0.43528, "grad_norm": 1.3310680389404297, "learning_rate": 2.8247699079631856e-05, "loss": 0.6483, "mean_token_accuracy": 0.7941194534301758, "num_tokens": 564439424.0, "step": 54410 }, { "entropy": 0.5994804173707962, "epoch": 0.43536, "grad_norm": 3.1389219760894775, "learning_rate": 2.82436974789916e-05, "loss": 0.5809, "mean_token_accuracy": 0.8263863742351532, "num_tokens": 564521194.0, "step": 54420 }, { "entropy": 0.6760534882545471, "epoch": 0.43544, "grad_norm": 1.4430058002471924, "learning_rate": 2.823969587835134e-05, "loss": 0.6708, "mean_token_accuracy": 0.807055938243866, "num_tokens": 564612603.0, "step": 54430 }, { "entropy": 0.6601543307304383, "epoch": 0.43552, "grad_norm": 2.399785280227661, "learning_rate": 2.8235694277711083e-05, "loss": 0.668, "mean_token_accuracy": 0.7984581530094147, "num_tokens": 564749830.0, "step": 54440 }, { "entropy": 0.7374581456184387, "epoch": 0.4356, "grad_norm": 5.349143981933594, "learning_rate": 2.823169267707083e-05, "loss": 0.7263, "mean_token_accuracy": 0.8104393899440765, "num_tokens": 564785841.0, "step": 54450 }, { "entropy": 0.6800575077533721, "epoch": 0.43568, "grad_norm": 1.9336870908737183, "learning_rate": 2.8227691076430574e-05, "loss": 0.6752, "mean_token_accuracy": 0.7894667685031891, "num_tokens": 564949500.0, "step": 54460 }, { "entropy": 0.6850882589817047, "epoch": 0.43576, "grad_norm": 3.2913730144500732, "learning_rate": 2.8223689475790314e-05, "loss": 0.6815, "mean_token_accuracy": 0.8023960888385773, "num_tokens": 565028592.0, "step": 54470 }, { "entropy": 0.7218836367130279, "epoch": 0.43584, "grad_norm": 1.7735649347305298, "learning_rate": 2.8219687875150065e-05, "loss": 0.7301, "mean_token_accuracy": 0.7973542988300324, "num_tokens": 565121213.0, "step": 54480 }, { "entropy": 0.6669366776943206, "epoch": 0.43592, "grad_norm": 1.9041330814361572, "learning_rate": 2.8215686274509805e-05, "loss": 0.6657, "mean_token_accuracy": 0.7959933817386627, "num_tokens": 565266890.0, "step": 54490 }, { "entropy": 0.7248986572027206, "epoch": 0.436, "grad_norm": 4.2643208503723145, "learning_rate": 2.821168467386955e-05, "loss": 0.7147, "mean_token_accuracy": 0.8100713193416595, "num_tokens": 565309211.0, "step": 54500 }, { "entropy": 0.6179283857345581, "epoch": 0.43608, "grad_norm": 1.302668571472168, "learning_rate": 2.820768307322929e-05, "loss": 0.6174, "mean_token_accuracy": 0.802474993467331, "num_tokens": 565473047.0, "step": 54510 }, { "entropy": 0.615666338801384, "epoch": 0.43616, "grad_norm": 2.8927857875823975, "learning_rate": 2.820368147258904e-05, "loss": 0.6104, "mean_token_accuracy": 0.8284643650054931, "num_tokens": 565542092.0, "step": 54520 }, { "entropy": 0.7336043119430542, "epoch": 0.43624, "grad_norm": 2.0536458492279053, "learning_rate": 2.819967987194878e-05, "loss": 0.7383, "mean_token_accuracy": 0.7942773640155792, "num_tokens": 565635029.0, "step": 54530 }, { "entropy": 0.684371018409729, "epoch": 0.43632, "grad_norm": 2.3945748805999756, "learning_rate": 2.8195678271308524e-05, "loss": 0.674, "mean_token_accuracy": 0.7950979351997376, "num_tokens": 565782024.0, "step": 54540 }, { "entropy": 0.7899927735328675, "epoch": 0.4364, "grad_norm": 4.168496131896973, "learning_rate": 2.819167667066827e-05, "loss": 0.7812, "mean_token_accuracy": 0.7992257654666901, "num_tokens": 565823564.0, "step": 54550 }, { "entropy": 0.6618624866008759, "epoch": 0.43648, "grad_norm": 1.5959371328353882, "learning_rate": 2.8187675070028015e-05, "loss": 0.6565, "mean_token_accuracy": 0.7959269404411315, "num_tokens": 565985328.0, "step": 54560 }, { "entropy": 0.6929536283016204, "epoch": 0.43656, "grad_norm": 4.113441467285156, "learning_rate": 2.8183673469387755e-05, "loss": 0.6911, "mean_token_accuracy": 0.8050774931907654, "num_tokens": 566059974.0, "step": 54570 }, { "entropy": 0.7480387628078461, "epoch": 0.43664, "grad_norm": 2.1003713607788086, "learning_rate": 2.81796718687475e-05, "loss": 0.7387, "mean_token_accuracy": 0.7910386741161346, "num_tokens": 566154141.0, "step": 54580 }, { "entropy": 0.7028233110904694, "epoch": 0.43672, "grad_norm": 4.429018974304199, "learning_rate": 2.8175670268107246e-05, "loss": 0.6994, "mean_token_accuracy": 0.7882965385913849, "num_tokens": 566298810.0, "step": 54590 }, { "entropy": 0.7147275656461716, "epoch": 0.4368, "grad_norm": 5.420651912689209, "learning_rate": 2.817166866746699e-05, "loss": 0.7081, "mean_token_accuracy": 0.8114259421825409, "num_tokens": 566340838.0, "step": 54600 }, { "entropy": 0.6193817377090454, "epoch": 0.43688, "grad_norm": 1.6107707023620605, "learning_rate": 2.816766706682673e-05, "loss": 0.619, "mean_token_accuracy": 0.8009587228298187, "num_tokens": 566504678.0, "step": 54610 }, { "entropy": 0.6340424358844757, "epoch": 0.43696, "grad_norm": 2.957489252090454, "learning_rate": 2.8163665466186477e-05, "loss": 0.6271, "mean_token_accuracy": 0.8133575320243835, "num_tokens": 566592239.0, "step": 54620 }, { "entropy": 0.696477210521698, "epoch": 0.43704, "grad_norm": 3.2308292388916016, "learning_rate": 2.815966386554622e-05, "loss": 0.6839, "mean_token_accuracy": 0.8065255165100098, "num_tokens": 566686518.0, "step": 54630 }, { "entropy": 0.6844488084316254, "epoch": 0.43712, "grad_norm": 3.4672627449035645, "learning_rate": 2.8155662264905964e-05, "loss": 0.6844, "mean_token_accuracy": 0.795584374666214, "num_tokens": 566821472.0, "step": 54640 }, { "entropy": 0.6323912650346756, "epoch": 0.4372, "grad_norm": 4.620676517486572, "learning_rate": 2.8151660664265705e-05, "loss": 0.6127, "mean_token_accuracy": 0.827002763748169, "num_tokens": 566862638.0, "step": 54650 }, { "entropy": 0.6137083113193512, "epoch": 0.43728, "grad_norm": 1.9600080251693726, "learning_rate": 2.8147659063625452e-05, "loss": 0.6151, "mean_token_accuracy": 0.8004091322422028, "num_tokens": 567026478.0, "step": 54660 }, { "entropy": 0.7210587441921235, "epoch": 0.43736, "grad_norm": 3.21441912651062, "learning_rate": 2.8143657462985196e-05, "loss": 0.7182, "mean_token_accuracy": 0.797627592086792, "num_tokens": 567119491.0, "step": 54670 }, { "entropy": 0.7277274310588837, "epoch": 0.43744, "grad_norm": 1.7290849685668945, "learning_rate": 2.813965586234494e-05, "loss": 0.7125, "mean_token_accuracy": 0.7996485888957977, "num_tokens": 567212027.0, "step": 54680 }, { "entropy": 0.6618791222572327, "epoch": 0.43752, "grad_norm": 3.0932369232177734, "learning_rate": 2.8135654261704686e-05, "loss": 0.6781, "mean_token_accuracy": 0.7916409909725189, "num_tokens": 567361262.0, "step": 54690 }, { "entropy": 0.7696364343166351, "epoch": 0.4376, "grad_norm": 4.471238136291504, "learning_rate": 2.8131652661064427e-05, "loss": 0.767, "mean_token_accuracy": 0.8061314940452575, "num_tokens": 567400370.0, "step": 54700 }, { "entropy": 0.604970532655716, "epoch": 0.43768, "grad_norm": 2.3856663703918457, "learning_rate": 2.812765106042417e-05, "loss": 0.6034, "mean_token_accuracy": 0.8071395337581635, "num_tokens": 567564158.0, "step": 54710 }, { "entropy": 0.6950815826654434, "epoch": 0.43776, "grad_norm": 3.24584698677063, "learning_rate": 2.8123649459783914e-05, "loss": 0.6843, "mean_token_accuracy": 0.8043416976928711, "num_tokens": 567646283.0, "step": 54720 }, { "entropy": 0.7567066848278046, "epoch": 0.43784, "grad_norm": 2.4127824306488037, "learning_rate": 2.811964785914366e-05, "loss": 0.7479, "mean_token_accuracy": 0.7913560509681702, "num_tokens": 567739810.0, "step": 54730 }, { "entropy": 0.6887143909931183, "epoch": 0.43792, "grad_norm": 2.13942289352417, "learning_rate": 2.81156462585034e-05, "loss": 0.6971, "mean_token_accuracy": 0.7872442126274108, "num_tokens": 567886532.0, "step": 54740 }, { "entropy": 0.691374534368515, "epoch": 0.438, "grad_norm": 4.170211315155029, "learning_rate": 2.8111644657863145e-05, "loss": 0.6876, "mean_token_accuracy": 0.8112572610378266, "num_tokens": 567931787.0, "step": 54750 }, { "entropy": 0.6822041273117065, "epoch": 0.43808, "grad_norm": 1.9171996116638184, "learning_rate": 2.8107643057222892e-05, "loss": 0.6783, "mean_token_accuracy": 0.7875488638877869, "num_tokens": 568095627.0, "step": 54760 }, { "entropy": 0.7367985904216766, "epoch": 0.43816, "grad_norm": 4.1540961265563965, "learning_rate": 2.8103641456582636e-05, "loss": 0.7196, "mean_token_accuracy": 0.7965340197086335, "num_tokens": 568179193.0, "step": 54770 }, { "entropy": 0.6445921391248703, "epoch": 0.43824, "grad_norm": 1.8208730220794678, "learning_rate": 2.8099639855942376e-05, "loss": 0.6584, "mean_token_accuracy": 0.8101401209831238, "num_tokens": 568272616.0, "step": 54780 }, { "entropy": 0.7374309062957763, "epoch": 0.43832, "grad_norm": 3.0126540660858154, "learning_rate": 2.809563825530212e-05, "loss": 0.7315, "mean_token_accuracy": 0.7794933140277862, "num_tokens": 568409773.0, "step": 54790 }, { "entropy": 0.6467891931533813, "epoch": 0.4384, "grad_norm": 4.620591640472412, "learning_rate": 2.8091636654661867e-05, "loss": 0.637, "mean_token_accuracy": 0.8251047432422638, "num_tokens": 568450780.0, "step": 54800 }, { "entropy": 0.6463239550590515, "epoch": 0.43848, "grad_norm": 1.4681777954101562, "learning_rate": 2.808763505402161e-05, "loss": 0.651, "mean_token_accuracy": 0.7931057691574097, "num_tokens": 568614620.0, "step": 54810 }, { "entropy": 0.6956274092197419, "epoch": 0.43856, "grad_norm": 3.612934112548828, "learning_rate": 2.808363345338135e-05, "loss": 0.6824, "mean_token_accuracy": 0.8057862401008606, "num_tokens": 568695643.0, "step": 54820 }, { "entropy": 0.7227219045162201, "epoch": 0.43864, "grad_norm": 1.6124567985534668, "learning_rate": 2.8079631852741095e-05, "loss": 0.7185, "mean_token_accuracy": 0.7958428800106049, "num_tokens": 568789936.0, "step": 54830 }, { "entropy": 0.6558770716190339, "epoch": 0.43872, "grad_norm": 3.032499313354492, "learning_rate": 2.8075630252100842e-05, "loss": 0.6594, "mean_token_accuracy": 0.7990320563316345, "num_tokens": 568921461.0, "step": 54840 }, { "entropy": 0.6360956400632858, "epoch": 0.4388, "grad_norm": 4.131659507751465, "learning_rate": 2.8071628651460586e-05, "loss": 0.6358, "mean_token_accuracy": 0.8281497657299042, "num_tokens": 568954741.0, "step": 54850 }, { "entropy": 0.6744830071926117, "epoch": 0.43888, "grad_norm": 3.2187278270721436, "learning_rate": 2.8067627050820326e-05, "loss": 0.6721, "mean_token_accuracy": 0.7900097787380218, "num_tokens": 569118581.0, "step": 54860 }, { "entropy": 0.6658056050539016, "epoch": 0.43896, "grad_norm": 2.3594794273376465, "learning_rate": 2.8063625450180077e-05, "loss": 0.6607, "mean_token_accuracy": 0.8102604985237122, "num_tokens": 569195190.0, "step": 54870 }, { "entropy": 0.6887578010559082, "epoch": 0.43904, "grad_norm": 1.8507699966430664, "learning_rate": 2.8059623849539817e-05, "loss": 0.6949, "mean_token_accuracy": 0.801706337928772, "num_tokens": 569287156.0, "step": 54880 }, { "entropy": 0.7317592382431031, "epoch": 0.43912, "grad_norm": 2.12809157371521, "learning_rate": 2.805562224889956e-05, "loss": 0.7221, "mean_token_accuracy": 0.7798229813575744, "num_tokens": 569428304.0, "step": 54890 }, { "entropy": 0.6663982272148132, "epoch": 0.4392, "grad_norm": 4.092324256896973, "learning_rate": 2.80516206482593e-05, "loss": 0.6507, "mean_token_accuracy": 0.8257465481758117, "num_tokens": 569465496.0, "step": 54900 }, { "entropy": 0.6338774561882019, "epoch": 0.43928, "grad_norm": 1.922805666923523, "learning_rate": 2.804761904761905e-05, "loss": 0.631, "mean_token_accuracy": 0.7996693134307862, "num_tokens": 569628476.0, "step": 54910 }, { "entropy": 0.7508569836616517, "epoch": 0.43936, "grad_norm": 2.683655261993408, "learning_rate": 2.8043617446978792e-05, "loss": 0.7496, "mean_token_accuracy": 0.7905464112758637, "num_tokens": 569701995.0, "step": 54920 }, { "entropy": 0.6612761080265045, "epoch": 0.43944, "grad_norm": 1.8017760515213013, "learning_rate": 2.8039615846338535e-05, "loss": 0.6654, "mean_token_accuracy": 0.8068254590034485, "num_tokens": 569794929.0, "step": 54930 }, { "entropy": 0.7354738533496856, "epoch": 0.43952, "grad_norm": 2.62319278717041, "learning_rate": 2.8035614245698283e-05, "loss": 0.7334, "mean_token_accuracy": 0.7829894363880158, "num_tokens": 569928169.0, "step": 54940 }, { "entropy": 0.7337763190269471, "epoch": 0.4396, "grad_norm": 5.31417989730835, "learning_rate": 2.8031612645058026e-05, "loss": 0.7286, "mean_token_accuracy": 0.8062245965003967, "num_tokens": 569963439.0, "step": 54950 }, { "entropy": 0.6200597524642945, "epoch": 0.43968, "grad_norm": 2.089747428894043, "learning_rate": 2.8027611044417767e-05, "loss": 0.6112, "mean_token_accuracy": 0.8025791108608246, "num_tokens": 570127116.0, "step": 54960 }, { "entropy": 0.6445700943470001, "epoch": 0.43976, "grad_norm": 5.118545055389404, "learning_rate": 2.802360944377751e-05, "loss": 0.6418, "mean_token_accuracy": 0.8153103172779084, "num_tokens": 570200721.0, "step": 54970 }, { "entropy": 0.7075453251600266, "epoch": 0.43984, "grad_norm": 1.4755555391311646, "learning_rate": 2.8019607843137257e-05, "loss": 0.7033, "mean_token_accuracy": 0.801959103345871, "num_tokens": 570293209.0, "step": 54980 }, { "entropy": 0.6276440322399139, "epoch": 0.43992, "grad_norm": 3.754487991333008, "learning_rate": 2.8015606242497e-05, "loss": 0.6207, "mean_token_accuracy": 0.8091209053993225, "num_tokens": 570427713.0, "step": 54990 }, { "entropy": 0.6598863631486893, "epoch": 0.44, "grad_norm": 5.734046459197998, "learning_rate": 2.801160464185674e-05, "loss": 0.6516, "mean_token_accuracy": 0.8225716948509216, "num_tokens": 570468500.0, "step": 55000 }, { "entropy": 0.6386459231376648, "epoch": 0.44008, "grad_norm": 2.1950414180755615, "learning_rate": 2.8007603041216492e-05, "loss": 0.6372, "mean_token_accuracy": 0.7966435551643372, "num_tokens": 570630676.0, "step": 55010 }, { "entropy": 0.698923671245575, "epoch": 0.44016, "grad_norm": 3.55183482170105, "learning_rate": 2.8003601440576232e-05, "loss": 0.6837, "mean_token_accuracy": 0.8067326366901397, "num_tokens": 570704293.0, "step": 55020 }, { "entropy": 0.695481127500534, "epoch": 0.44024, "grad_norm": 1.6882026195526123, "learning_rate": 2.7999599839935976e-05, "loss": 0.701, "mean_token_accuracy": 0.7998678088188171, "num_tokens": 570795922.0, "step": 55030 }, { "entropy": 0.6310112833976745, "epoch": 0.44032, "grad_norm": 2.714029550552368, "learning_rate": 2.7995598239295716e-05, "loss": 0.629, "mean_token_accuracy": 0.8030120730400085, "num_tokens": 570939808.0, "step": 55040 }, { "entropy": 0.6711225628852844, "epoch": 0.4404, "grad_norm": 3.602233409881592, "learning_rate": 2.7991596638655467e-05, "loss": 0.6474, "mean_token_accuracy": 0.8290734946727752, "num_tokens": 570979128.0, "step": 55050 }, { "entropy": 0.6245810627937317, "epoch": 0.44048, "grad_norm": 1.5038431882858276, "learning_rate": 2.7987595038015207e-05, "loss": 0.6233, "mean_token_accuracy": 0.8043945014476777, "num_tokens": 571140386.0, "step": 55060 }, { "entropy": 0.72182657122612, "epoch": 0.44056, "grad_norm": 4.01569128036499, "learning_rate": 2.798359343737495e-05, "loss": 0.7186, "mean_token_accuracy": 0.8046181440353394, "num_tokens": 571215130.0, "step": 55070 }, { "entropy": 0.6902652204036712, "epoch": 0.44064, "grad_norm": 1.5698022842407227, "learning_rate": 2.7979591836734698e-05, "loss": 0.6856, "mean_token_accuracy": 0.8018265962600708, "num_tokens": 571307580.0, "step": 55080 }, { "entropy": 0.6849178791046142, "epoch": 0.44072, "grad_norm": 2.026474714279175, "learning_rate": 2.797559023609444e-05, "loss": 0.6783, "mean_token_accuracy": 0.7920527756214142, "num_tokens": 571448954.0, "step": 55090 }, { "entropy": 0.7871632754802704, "epoch": 0.4408, "grad_norm": 4.442068099975586, "learning_rate": 2.7971588635454182e-05, "loss": 0.7944, "mean_token_accuracy": 0.790200275182724, "num_tokens": 571493473.0, "step": 55100 }, { "entropy": 0.6860476851463317, "epoch": 0.44088, "grad_norm": 2.5530498027801514, "learning_rate": 2.7967587034813926e-05, "loss": 0.6768, "mean_token_accuracy": 0.788855642080307, "num_tokens": 571657313.0, "step": 55110 }, { "entropy": 0.6731288850307464, "epoch": 0.44096, "grad_norm": 3.5805623531341553, "learning_rate": 2.7963585434173673e-05, "loss": 0.6633, "mean_token_accuracy": 0.8089004635810852, "num_tokens": 571748741.0, "step": 55120 }, { "entropy": 0.6985828459262848, "epoch": 0.44104, "grad_norm": 2.3288192749023438, "learning_rate": 2.7959583833533417e-05, "loss": 0.6825, "mean_token_accuracy": 0.8027049660682678, "num_tokens": 571842515.0, "step": 55130 }, { "entropy": 0.6928346157073975, "epoch": 0.44112, "grad_norm": 2.4911224842071533, "learning_rate": 2.7955582232893157e-05, "loss": 0.6928, "mean_token_accuracy": 0.7869644045829773, "num_tokens": 571986415.0, "step": 55140 }, { "entropy": 0.6238755404949188, "epoch": 0.4412, "grad_norm": 4.604879379272461, "learning_rate": 2.7951580632252904e-05, "loss": 0.6227, "mean_token_accuracy": 0.8305119454860688, "num_tokens": 572029624.0, "step": 55150 }, { "entropy": 0.6786661326885224, "epoch": 0.44128, "grad_norm": 1.9048618078231812, "learning_rate": 2.7947579031612648e-05, "loss": 0.679, "mean_token_accuracy": 0.7873919606208801, "num_tokens": 572193445.0, "step": 55160 }, { "entropy": 0.6944296360015869, "epoch": 0.44136, "grad_norm": 3.43906307220459, "learning_rate": 2.794357743097239e-05, "loss": 0.6828, "mean_token_accuracy": 0.8064414262771606, "num_tokens": 572269407.0, "step": 55170 }, { "entropy": 0.676831966638565, "epoch": 0.44144, "grad_norm": 1.456135869026184, "learning_rate": 2.793957583033213e-05, "loss": 0.6983, "mean_token_accuracy": 0.8022107124328614, "num_tokens": 572361877.0, "step": 55180 }, { "entropy": 0.6740504801273346, "epoch": 0.44152, "grad_norm": 3.12070631980896, "learning_rate": 2.793557422969188e-05, "loss": 0.6715, "mean_token_accuracy": 0.7930472314357757, "num_tokens": 572495787.0, "step": 55190 }, { "entropy": 0.7290012091398239, "epoch": 0.4416, "grad_norm": 4.632167816162109, "learning_rate": 2.7931572629051623e-05, "loss": 0.7131, "mean_token_accuracy": 0.8135158121585846, "num_tokens": 572530702.0, "step": 55200 }, { "entropy": 0.622934365272522, "epoch": 0.44168, "grad_norm": 2.3042519092559814, "learning_rate": 2.7927571028411366e-05, "loss": 0.6217, "mean_token_accuracy": 0.8009037673473358, "num_tokens": 572694542.0, "step": 55210 }, { "entropy": 0.6815413653850555, "epoch": 0.44176, "grad_norm": 3.007690191268921, "learning_rate": 2.7923569427771113e-05, "loss": 0.6713, "mean_token_accuracy": 0.8016779661178589, "num_tokens": 572791453.0, "step": 55220 }, { "entropy": 0.6558058619499206, "epoch": 0.44184, "grad_norm": 1.555757761001587, "learning_rate": 2.7919567827130854e-05, "loss": 0.6652, "mean_token_accuracy": 0.8040595114231109, "num_tokens": 572886899.0, "step": 55230 }, { "entropy": 0.6402872979640961, "epoch": 0.44192, "grad_norm": 1.793834924697876, "learning_rate": 2.7915566226490597e-05, "loss": 0.6379, "mean_token_accuracy": 0.7996286392211914, "num_tokens": 573035026.0, "step": 55240 }, { "entropy": 0.6654545158147812, "epoch": 0.442, "grad_norm": 6.255444526672363, "learning_rate": 2.791156462585034e-05, "loss": 0.6651, "mean_token_accuracy": 0.8204156398773194, "num_tokens": 573075823.0, "step": 55250 }, { "entropy": 0.6418148636817932, "epoch": 0.44208, "grad_norm": 2.520613193511963, "learning_rate": 2.7907563025210088e-05, "loss": 0.6365, "mean_token_accuracy": 0.7987480461597443, "num_tokens": 573239046.0, "step": 55260 }, { "entropy": 0.6550824910402298, "epoch": 0.44216, "grad_norm": 2.681777238845825, "learning_rate": 2.790356142456983e-05, "loss": 0.644, "mean_token_accuracy": 0.8170536756515503, "num_tokens": 573313327.0, "step": 55270 }, { "entropy": 0.6905095279216766, "epoch": 0.44224, "grad_norm": 1.4554095268249512, "learning_rate": 2.7899559823929572e-05, "loss": 0.6769, "mean_token_accuracy": 0.8058952867984772, "num_tokens": 573407484.0, "step": 55280 }, { "entropy": 0.682085445523262, "epoch": 0.44232, "grad_norm": 2.479304552078247, "learning_rate": 2.789555822328932e-05, "loss": 0.6801, "mean_token_accuracy": 0.7921823799610138, "num_tokens": 573537231.0, "step": 55290 }, { "entropy": 0.6992878556251526, "epoch": 0.4424, "grad_norm": 5.319050312042236, "learning_rate": 2.7891556622649063e-05, "loss": 0.6898, "mean_token_accuracy": 0.8153695285320282, "num_tokens": 573574141.0, "step": 55300 }, { "entropy": 0.6569351732730866, "epoch": 0.44248, "grad_norm": 2.0733072757720947, "learning_rate": 2.7887555022008803e-05, "loss": 0.6544, "mean_token_accuracy": 0.792629474401474, "num_tokens": 573737981.0, "step": 55310 }, { "entropy": 0.6175833940505981, "epoch": 0.44256, "grad_norm": 3.5586535930633545, "learning_rate": 2.7883553421368547e-05, "loss": 0.6124, "mean_token_accuracy": 0.8199154615402222, "num_tokens": 573822010.0, "step": 55320 }, { "entropy": 0.7116624712944031, "epoch": 0.44264, "grad_norm": 1.386143684387207, "learning_rate": 2.7879551820728294e-05, "loss": 0.7012, "mean_token_accuracy": 0.8013136625289917, "num_tokens": 573915358.0, "step": 55330 }, { "entropy": 0.7444394886493683, "epoch": 0.44272, "grad_norm": 4.179113388061523, "learning_rate": 2.7875550220088038e-05, "loss": 0.7365, "mean_token_accuracy": 0.7813419163227081, "num_tokens": 574053825.0, "step": 55340 }, { "entropy": 0.6022908747196197, "epoch": 0.4428, "grad_norm": 5.845221996307373, "learning_rate": 2.7871548619447778e-05, "loss": 0.6016, "mean_token_accuracy": 0.8285149812698365, "num_tokens": 574090936.0, "step": 55350 }, { "entropy": 0.6274353325366974, "epoch": 0.44288, "grad_norm": 1.6590520143508911, "learning_rate": 2.7867547018807522e-05, "loss": 0.6225, "mean_token_accuracy": 0.7992244780063629, "num_tokens": 574254776.0, "step": 55360 }, { "entropy": 0.6568230450153351, "epoch": 0.44296, "grad_norm": 3.4486749172210693, "learning_rate": 2.786354541816727e-05, "loss": 0.6552, "mean_token_accuracy": 0.8086520493030548, "num_tokens": 574346576.0, "step": 55370 }, { "entropy": 0.744742888212204, "epoch": 0.44304, "grad_norm": 3.0401148796081543, "learning_rate": 2.7859543817527013e-05, "loss": 0.7509, "mean_token_accuracy": 0.7882747769355773, "num_tokens": 574440074.0, "step": 55380 }, { "entropy": 0.6753243267536163, "epoch": 0.44312, "grad_norm": 2.8067874908447266, "learning_rate": 2.7855542216886753e-05, "loss": 0.6699, "mean_token_accuracy": 0.7969795346260071, "num_tokens": 574575812.0, "step": 55390 }, { "entropy": 0.7155551344156266, "epoch": 0.4432, "grad_norm": 4.886232376098633, "learning_rate": 2.7851540616246504e-05, "loss": 0.7158, "mean_token_accuracy": 0.8143512189388276, "num_tokens": 574611664.0, "step": 55400 }, { "entropy": 0.6574213922023773, "epoch": 0.44328, "grad_norm": 1.324187994003296, "learning_rate": 2.7847539015606244e-05, "loss": 0.6565, "mean_token_accuracy": 0.7920920848846436, "num_tokens": 574775504.0, "step": 55410 }, { "entropy": 0.6913735508918762, "epoch": 0.44336, "grad_norm": 2.9372947216033936, "learning_rate": 2.7843537414965988e-05, "loss": 0.6728, "mean_token_accuracy": 0.8048954486846924, "num_tokens": 574877131.0, "step": 55420 }, { "entropy": 0.6950988888740539, "epoch": 0.44344, "grad_norm": 1.9097580909729004, "learning_rate": 2.7839535814325728e-05, "loss": 0.701, "mean_token_accuracy": 0.8015814900398255, "num_tokens": 574972219.0, "step": 55430 }, { "entropy": 0.7158176124095916, "epoch": 0.44352, "grad_norm": 2.644826650619507, "learning_rate": 2.783553421368548e-05, "loss": 0.7116, "mean_token_accuracy": 0.787030029296875, "num_tokens": 575111475.0, "step": 55440 }, { "entropy": 0.6380693316459656, "epoch": 0.4436, "grad_norm": 4.412130355834961, "learning_rate": 2.783153261304522e-05, "loss": 0.6384, "mean_token_accuracy": 0.8278037965297699, "num_tokens": 575154649.0, "step": 55450 }, { "entropy": 0.6771803438663483, "epoch": 0.44368, "grad_norm": 1.5783441066741943, "learning_rate": 2.7827531012404962e-05, "loss": 0.677, "mean_token_accuracy": 0.790404486656189, "num_tokens": 575318001.0, "step": 55460 }, { "entropy": 0.7113046407699585, "epoch": 0.44376, "grad_norm": 4.249597072601318, "learning_rate": 2.782352941176471e-05, "loss": 0.7045, "mean_token_accuracy": 0.8034768879413605, "num_tokens": 575396618.0, "step": 55470 }, { "entropy": 0.745395052433014, "epoch": 0.44384, "grad_norm": 2.5763401985168457, "learning_rate": 2.7819527811124453e-05, "loss": 0.7276, "mean_token_accuracy": 0.7922476232051849, "num_tokens": 575490093.0, "step": 55480 }, { "entropy": 0.7018317401409149, "epoch": 0.44392, "grad_norm": 2.650383710861206, "learning_rate": 2.7815526210484194e-05, "loss": 0.706, "mean_token_accuracy": 0.7869484484195709, "num_tokens": 575633711.0, "step": 55490 }, { "entropy": 0.7167612969875335, "epoch": 0.444, "grad_norm": 4.121910095214844, "learning_rate": 2.7811524609843937e-05, "loss": 0.7082, "mean_token_accuracy": 0.8133208155632019, "num_tokens": 575672956.0, "step": 55500 }, { "entropy": 0.6283051311969757, "epoch": 0.44408, "grad_norm": 1.9931367635726929, "learning_rate": 2.7807523009203684e-05, "loss": 0.6187, "mean_token_accuracy": 0.8004763066768646, "num_tokens": 575836796.0, "step": 55510 }, { "entropy": 0.6763742476701736, "epoch": 0.44416, "grad_norm": 3.7320339679718018, "learning_rate": 2.7803521408563428e-05, "loss": 0.677, "mean_token_accuracy": 0.8060267448425293, "num_tokens": 575920036.0, "step": 55520 }, { "entropy": 0.7250619947910308, "epoch": 0.44424, "grad_norm": 1.9629243612289429, "learning_rate": 2.779951980792317e-05, "loss": 0.7208, "mean_token_accuracy": 0.7980219066143036, "num_tokens": 576013217.0, "step": 55530 }, { "entropy": 0.7444736897945404, "epoch": 0.44432, "grad_norm": 2.455876350402832, "learning_rate": 2.7795518207282916e-05, "loss": 0.7419, "mean_token_accuracy": 0.7786621630191803, "num_tokens": 576148626.0, "step": 55540 }, { "entropy": 0.6668260008096695, "epoch": 0.4444, "grad_norm": 5.107484817504883, "learning_rate": 2.779151660664266e-05, "loss": 0.654, "mean_token_accuracy": 0.8260885357856751, "num_tokens": 576184475.0, "step": 55550 }, { "entropy": 0.7019758105278016, "epoch": 0.44448, "grad_norm": 1.9879956245422363, "learning_rate": 2.7787515006002403e-05, "loss": 0.7102, "mean_token_accuracy": 0.7819125592708588, "num_tokens": 576348315.0, "step": 55560 }, { "entropy": 0.6441848784685135, "epoch": 0.44456, "grad_norm": 3.7434890270233154, "learning_rate": 2.7783513405362143e-05, "loss": 0.638, "mean_token_accuracy": 0.811512291431427, "num_tokens": 576437390.0, "step": 55570 }, { "entropy": 0.6934963881969451, "epoch": 0.44464, "grad_norm": 1.9152681827545166, "learning_rate": 2.777951180472189e-05, "loss": 0.6829, "mean_token_accuracy": 0.8074833929538727, "num_tokens": 576531874.0, "step": 55580 }, { "entropy": 0.7358788788318634, "epoch": 0.44472, "grad_norm": 3.378726005554199, "learning_rate": 2.7775510204081634e-05, "loss": 0.7317, "mean_token_accuracy": 0.7798699557781219, "num_tokens": 576675761.0, "step": 55590 }, { "entropy": 0.7577161580324173, "epoch": 0.4448, "grad_norm": 6.88375186920166, "learning_rate": 2.7771508603441378e-05, "loss": 0.753, "mean_token_accuracy": 0.8054669022560119, "num_tokens": 576721144.0, "step": 55600 }, { "entropy": 0.6617082893848419, "epoch": 0.44488, "grad_norm": 1.9669475555419922, "learning_rate": 2.7767507002801125e-05, "loss": 0.6625, "mean_token_accuracy": 0.7919699549674988, "num_tokens": 576884984.0, "step": 55610 }, { "entropy": 0.6462846577167511, "epoch": 0.44496, "grad_norm": 3.7959914207458496, "learning_rate": 2.7763505402160865e-05, "loss": 0.6402, "mean_token_accuracy": 0.8149145007133484, "num_tokens": 576966632.0, "step": 55620 }, { "entropy": 0.6918116211891174, "epoch": 0.44504, "grad_norm": 1.8841716051101685, "learning_rate": 2.775950380152061e-05, "loss": 0.6886, "mean_token_accuracy": 0.8073054373264312, "num_tokens": 577060047.0, "step": 55630 }, { "entropy": 0.6954351842403412, "epoch": 0.44512, "grad_norm": 2.046194076538086, "learning_rate": 2.7755502200880353e-05, "loss": 0.6824, "mean_token_accuracy": 0.792292469739914, "num_tokens": 577204895.0, "step": 55640 }, { "entropy": 0.7094561457633972, "epoch": 0.4452, "grad_norm": 4.614621639251709, "learning_rate": 2.77515006002401e-05, "loss": 0.7068, "mean_token_accuracy": 0.8075642883777618, "num_tokens": 577247712.0, "step": 55650 }, { "entropy": 0.6416839599609375, "epoch": 0.44528, "grad_norm": 2.4040791988372803, "learning_rate": 2.774749899959984e-05, "loss": 0.6377, "mean_token_accuracy": 0.8004213452339173, "num_tokens": 577411552.0, "step": 55660 }, { "entropy": 0.6189917623996735, "epoch": 0.44536, "grad_norm": 2.996678113937378, "learning_rate": 2.7743497398959584e-05, "loss": 0.6132, "mean_token_accuracy": 0.8207288801670074, "num_tokens": 577504774.0, "step": 55670 }, { "entropy": 0.738066029548645, "epoch": 0.44544, "grad_norm": 1.6377503871917725, "learning_rate": 2.773949579831933e-05, "loss": 0.7375, "mean_token_accuracy": 0.7881965100765228, "num_tokens": 577599205.0, "step": 55680 }, { "entropy": 0.7231986165046692, "epoch": 0.44552, "grad_norm": 2.2230544090270996, "learning_rate": 2.7735494197679075e-05, "loss": 0.7133, "mean_token_accuracy": 0.7813875615596771, "num_tokens": 577729288.0, "step": 55690 }, { "entropy": 0.7414454936981201, "epoch": 0.4456, "grad_norm": 4.467281341552734, "learning_rate": 2.7731492597038815e-05, "loss": 0.7297, "mean_token_accuracy": 0.8066278398036957, "num_tokens": 577765422.0, "step": 55700 }, { "entropy": 0.6403146862983704, "epoch": 0.44568, "grad_norm": 1.9315787553787231, "learning_rate": 2.772749099639856e-05, "loss": 0.6437, "mean_token_accuracy": 0.7960368812084198, "num_tokens": 577929262.0, "step": 55710 }, { "entropy": 0.7153452932834625, "epoch": 0.44576, "grad_norm": 3.2077677249908447, "learning_rate": 2.7723489395758306e-05, "loss": 0.7083, "mean_token_accuracy": 0.7946760535240174, "num_tokens": 578029566.0, "step": 55720 }, { "entropy": 0.7295752108097077, "epoch": 0.44584, "grad_norm": 2.843524217605591, "learning_rate": 2.771948779511805e-05, "loss": 0.72, "mean_token_accuracy": 0.7954494893550873, "num_tokens": 578124459.0, "step": 55730 }, { "entropy": 0.6726764440536499, "epoch": 0.44592, "grad_norm": 2.2754416465759277, "learning_rate": 2.771548619447779e-05, "loss": 0.6652, "mean_token_accuracy": 0.7948982000350953, "num_tokens": 578263403.0, "step": 55740 }, { "entropy": 0.6159518510103226, "epoch": 0.446, "grad_norm": 5.032502174377441, "learning_rate": 2.771148459383754e-05, "loss": 0.6117, "mean_token_accuracy": 0.8344861209392548, "num_tokens": 578304048.0, "step": 55750 }, { "entropy": 0.6627967536449433, "epoch": 0.44608, "grad_norm": 1.8118940591812134, "learning_rate": 2.770748299319728e-05, "loss": 0.6675, "mean_token_accuracy": 0.7887879967689514, "num_tokens": 578466250.0, "step": 55760 }, { "entropy": 0.6830525755882263, "epoch": 0.44616, "grad_norm": 3.5142998695373535, "learning_rate": 2.7703481392557024e-05, "loss": 0.6706, "mean_token_accuracy": 0.8096980094909668, "num_tokens": 578531195.0, "step": 55770 }, { "entropy": 0.7008122563362121, "epoch": 0.44624, "grad_norm": 1.6589860916137695, "learning_rate": 2.7699479791916765e-05, "loss": 0.7151, "mean_token_accuracy": 0.802330207824707, "num_tokens": 578622911.0, "step": 55780 }, { "entropy": 0.6766036868095398, "epoch": 0.44632, "grad_norm": 2.6281228065490723, "learning_rate": 2.7695478191276515e-05, "loss": 0.6668, "mean_token_accuracy": 0.7948897361755372, "num_tokens": 578767045.0, "step": 55790 }, { "entropy": 0.6070546984672547, "epoch": 0.4464, "grad_norm": 4.2794342041015625, "learning_rate": 2.7691476590636255e-05, "loss": 0.6047, "mean_token_accuracy": 0.8336291313171387, "num_tokens": 578808351.0, "step": 55800 }, { "entropy": 0.6960186958312988, "epoch": 0.44648, "grad_norm": 2.6652045249938965, "learning_rate": 2.7687474989996e-05, "loss": 0.6908, "mean_token_accuracy": 0.7855998694896698, "num_tokens": 578971914.0, "step": 55810 }, { "entropy": 0.6211029946804046, "epoch": 0.44656, "grad_norm": 3.0666518211364746, "learning_rate": 2.7683473389355746e-05, "loss": 0.6092, "mean_token_accuracy": 0.8217040061950683, "num_tokens": 579055585.0, "step": 55820 }, { "entropy": 0.7161609470844269, "epoch": 0.44664, "grad_norm": 1.7917335033416748, "learning_rate": 2.767947178871549e-05, "loss": 0.72, "mean_token_accuracy": 0.7963793873786926, "num_tokens": 579150251.0, "step": 55830 }, { "entropy": 0.6737475603818893, "epoch": 0.44672, "grad_norm": 3.28279709815979, "learning_rate": 2.767547018807523e-05, "loss": 0.6565, "mean_token_accuracy": 0.8011157810688019, "num_tokens": 579272675.0, "step": 55840 }, { "entropy": 0.5934045076370239, "epoch": 0.4468, "grad_norm": 4.713639259338379, "learning_rate": 2.7671468587434974e-05, "loss": 0.5865, "mean_token_accuracy": 0.8430930018424988, "num_tokens": 579307819.0, "step": 55850 }, { "entropy": 0.6664517819881439, "epoch": 0.44688, "grad_norm": 1.8082736730575562, "learning_rate": 2.766746698679472e-05, "loss": 0.6615, "mean_token_accuracy": 0.7909257471561432, "num_tokens": 579471659.0, "step": 55860 }, { "entropy": 0.759491091966629, "epoch": 0.44696, "grad_norm": 3.010246753692627, "learning_rate": 2.7663465386154465e-05, "loss": 0.7505, "mean_token_accuracy": 0.7866048693656922, "num_tokens": 579564227.0, "step": 55870 }, { "entropy": 0.6986628949642182, "epoch": 0.44704, "grad_norm": 1.4760901927947998, "learning_rate": 2.7659463785514205e-05, "loss": 0.7117, "mean_token_accuracy": 0.799528843164444, "num_tokens": 579658669.0, "step": 55880 }, { "entropy": 0.7386326789855957, "epoch": 0.44712, "grad_norm": 2.6838765144348145, "learning_rate": 2.765546218487395e-05, "loss": 0.7209, "mean_token_accuracy": 0.7838758528232574, "num_tokens": 579801824.0, "step": 55890 }, { "entropy": 0.7587802469730377, "epoch": 0.4472, "grad_norm": 4.860090732574463, "learning_rate": 2.7651460584233696e-05, "loss": 0.7634, "mean_token_accuracy": 0.8024385809898377, "num_tokens": 579843381.0, "step": 55900 }, { "entropy": 0.6615847468376159, "epoch": 0.44728, "grad_norm": 1.554770588874817, "learning_rate": 2.764745898359344e-05, "loss": 0.6578, "mean_token_accuracy": 0.7923905968666076, "num_tokens": 580007002.0, "step": 55910 }, { "entropy": 0.7055348008871078, "epoch": 0.44736, "grad_norm": 3.4923489093780518, "learning_rate": 2.764345738295318e-05, "loss": 0.6878, "mean_token_accuracy": 0.8099782586097717, "num_tokens": 580079295.0, "step": 55920 }, { "entropy": 0.7194471269845962, "epoch": 0.44744, "grad_norm": 1.7702261209487915, "learning_rate": 2.7639455782312927e-05, "loss": 0.7353, "mean_token_accuracy": 0.7908852934837342, "num_tokens": 580173168.0, "step": 55930 }, { "entropy": 0.6926711976528168, "epoch": 0.44752, "grad_norm": 2.979173183441162, "learning_rate": 2.763545418167267e-05, "loss": 0.687, "mean_token_accuracy": 0.7920665740966797, "num_tokens": 580311808.0, "step": 55940 }, { "entropy": 0.6327514976263047, "epoch": 0.4476, "grad_norm": 4.385632038116455, "learning_rate": 2.7631452581032415e-05, "loss": 0.6297, "mean_token_accuracy": 0.8290167868137359, "num_tokens": 580351885.0, "step": 55950 }, { "entropy": 0.6407612860202789, "epoch": 0.44768, "grad_norm": 1.446204662322998, "learning_rate": 2.7627450980392155e-05, "loss": 0.6389, "mean_token_accuracy": 0.7979176938533783, "num_tokens": 580515725.0, "step": 55960 }, { "entropy": 0.795578008890152, "epoch": 0.44776, "grad_norm": 2.9543545246124268, "learning_rate": 2.7623449379751902e-05, "loss": 0.7902, "mean_token_accuracy": 0.7751209616661072, "num_tokens": 580617545.0, "step": 55970 }, { "entropy": 0.7179414868354798, "epoch": 0.44784, "grad_norm": 2.081984758377075, "learning_rate": 2.7619447779111646e-05, "loss": 0.7368, "mean_token_accuracy": 0.7887963473796844, "num_tokens": 580711727.0, "step": 55980 }, { "entropy": 0.7278852164745331, "epoch": 0.44792, "grad_norm": 2.5133914947509766, "learning_rate": 2.761544617847139e-05, "loss": 0.7273, "mean_token_accuracy": 0.7815122365951538, "num_tokens": 580840158.0, "step": 55990 }, { "entropy": 0.6894770264625549, "epoch": 0.448, "grad_norm": 5.768731594085693, "learning_rate": 2.7611444577831137e-05, "loss": 0.6705, "mean_token_accuracy": 0.8243268966674805, "num_tokens": 580873636.0, "step": 56000 }, { "entropy": 0.6769317030906677, "epoch": 0.44808, "grad_norm": 2.303044080734253, "learning_rate": 2.7607442977190877e-05, "loss": 0.6788, "mean_token_accuracy": 0.787194675207138, "num_tokens": 581037476.0, "step": 56010 }, { "entropy": 0.6692675888538361, "epoch": 0.44816, "grad_norm": 2.915961742401123, "learning_rate": 2.760344137655062e-05, "loss": 0.6585, "mean_token_accuracy": 0.8116395831108093, "num_tokens": 581125171.0, "step": 56020 }, { "entropy": 0.6415495812892914, "epoch": 0.44824, "grad_norm": 1.902172565460205, "learning_rate": 2.7599439775910364e-05, "loss": 0.6256, "mean_token_accuracy": 0.8156184673309326, "num_tokens": 581218933.0, "step": 56030 }, { "entropy": 0.6955548524856567, "epoch": 0.44832, "grad_norm": 2.1420018672943115, "learning_rate": 2.759543817527011e-05, "loss": 0.7018, "mean_token_accuracy": 0.7851268947124481, "num_tokens": 581358589.0, "step": 56040 }, { "entropy": 0.7064301669597626, "epoch": 0.4484, "grad_norm": 5.322944641113281, "learning_rate": 2.7591436574629852e-05, "loss": 0.6968, "mean_token_accuracy": 0.8111237227916718, "num_tokens": 581400564.0, "step": 56050 }, { "entropy": 0.6313971877098083, "epoch": 0.44848, "grad_norm": 1.7004146575927734, "learning_rate": 2.7587434973989595e-05, "loss": 0.6294, "mean_token_accuracy": 0.7986443638801575, "num_tokens": 581564404.0, "step": 56060 }, { "entropy": 0.6432953506708146, "epoch": 0.44856, "grad_norm": 4.804252624511719, "learning_rate": 2.7583433373349343e-05, "loss": 0.6331, "mean_token_accuracy": 0.8145428597927094, "num_tokens": 581651962.0, "step": 56070 }, { "entropy": 0.7368758618831635, "epoch": 0.44864, "grad_norm": 2.388672113418579, "learning_rate": 2.7579431772709086e-05, "loss": 0.7307, "mean_token_accuracy": 0.7925692915916442, "num_tokens": 581744889.0, "step": 56080 }, { "entropy": 0.665696918964386, "epoch": 0.44872, "grad_norm": 3.675865888595581, "learning_rate": 2.7575430172068827e-05, "loss": 0.6585, "mean_token_accuracy": 0.7995155334472657, "num_tokens": 581876698.0, "step": 56090 }, { "entropy": 0.6478489756584167, "epoch": 0.4488, "grad_norm": 5.026772975921631, "learning_rate": 2.757142857142857e-05, "loss": 0.6513, "mean_token_accuracy": 0.8245112478733063, "num_tokens": 581911909.0, "step": 56100 }, { "entropy": 0.6522638916969299, "epoch": 0.44888, "grad_norm": 2.7095870971679688, "learning_rate": 2.7567426970788317e-05, "loss": 0.6492, "mean_token_accuracy": 0.7943392693996429, "num_tokens": 582075749.0, "step": 56110 }, { "entropy": 0.6273234933614731, "epoch": 0.44896, "grad_norm": 3.2297370433807373, "learning_rate": 2.756342537014806e-05, "loss": 0.6252, "mean_token_accuracy": 0.8162715315818787, "num_tokens": 582155436.0, "step": 56120 }, { "entropy": 0.7461933553218841, "epoch": 0.44904, "grad_norm": 1.6683109998703003, "learning_rate": 2.75594237695078e-05, "loss": 0.737, "mean_token_accuracy": 0.7913541913032531, "num_tokens": 582248675.0, "step": 56130 }, { "entropy": 0.6942728877067565, "epoch": 0.44912, "grad_norm": 2.461432933807373, "learning_rate": 2.7555422168867552e-05, "loss": 0.6885, "mean_token_accuracy": 0.789094614982605, "num_tokens": 582389766.0, "step": 56140 }, { "entropy": 0.6912375718355179, "epoch": 0.4492, "grad_norm": 6.317601203918457, "learning_rate": 2.7551420568227292e-05, "loss": 0.6637, "mean_token_accuracy": 0.8226071536540985, "num_tokens": 582426676.0, "step": 56150 }, { "entropy": 0.7143800377845764, "epoch": 0.44928, "grad_norm": 1.672910451889038, "learning_rate": 2.7547418967587036e-05, "loss": 0.7225, "mean_token_accuracy": 0.7779555499553681, "num_tokens": 582590516.0, "step": 56160 }, { "entropy": 0.6746039688587189, "epoch": 0.44936, "grad_norm": 2.7255406379699707, "learning_rate": 2.7543417366946776e-05, "loss": 0.6664, "mean_token_accuracy": 0.8125530898571014, "num_tokens": 582678745.0, "step": 56170 }, { "entropy": 0.6582074642181397, "epoch": 0.44944, "grad_norm": 1.9855891466140747, "learning_rate": 2.7539415766306527e-05, "loss": 0.6631, "mean_token_accuracy": 0.8068483889102935, "num_tokens": 582776060.0, "step": 56180 }, { "entropy": 0.679929119348526, "epoch": 0.44952, "grad_norm": 3.7448370456695557, "learning_rate": 2.7535414165666267e-05, "loss": 0.6725, "mean_token_accuracy": 0.796158391237259, "num_tokens": 582903578.0, "step": 56190 }, { "entropy": 0.7416290760040283, "epoch": 0.4496, "grad_norm": 5.8317341804504395, "learning_rate": 2.753141256502601e-05, "loss": 0.7314, "mean_token_accuracy": 0.8058753848075867, "num_tokens": 582936560.0, "step": 56200 }, { "entropy": 0.6699104487895966, "epoch": 0.44968, "grad_norm": 2.1383001804351807, "learning_rate": 2.7527410964385758e-05, "loss": 0.6754, "mean_token_accuracy": 0.7890266299247741, "num_tokens": 583100400.0, "step": 56210 }, { "entropy": 0.7008213877677918, "epoch": 0.44976, "grad_norm": 2.908052921295166, "learning_rate": 2.75234093637455e-05, "loss": 0.6877, "mean_token_accuracy": 0.8015306115150451, "num_tokens": 583190368.0, "step": 56220 }, { "entropy": 0.6911321997642517, "epoch": 0.44984, "grad_norm": 1.408753514289856, "learning_rate": 2.7519407763105242e-05, "loss": 0.6972, "mean_token_accuracy": 0.7983189284801483, "num_tokens": 583285422.0, "step": 56230 }, { "entropy": 0.6450729429721832, "epoch": 0.44992, "grad_norm": 3.5429272651672363, "learning_rate": 2.7515406162464986e-05, "loss": 0.6409, "mean_token_accuracy": 0.8045085787773132, "num_tokens": 583413852.0, "step": 56240 }, { "entropy": 0.65544593334198, "epoch": 0.45, "grad_norm": 4.393413543701172, "learning_rate": 2.7511404561824733e-05, "loss": 0.6628, "mean_token_accuracy": 0.8182673811912536, "num_tokens": 583447719.0, "step": 56250 }, { "entropy": 0.6559676438570022, "epoch": 0.45008, "grad_norm": 1.661795735359192, "learning_rate": 2.7507402961184476e-05, "loss": 0.6525, "mean_token_accuracy": 0.7928065478801727, "num_tokens": 583611559.0, "step": 56260 }, { "entropy": 0.7210573852062225, "epoch": 0.45016, "grad_norm": 2.911424160003662, "learning_rate": 2.7503401360544217e-05, "loss": 0.7098, "mean_token_accuracy": 0.7952528119087219, "num_tokens": 583701182.0, "step": 56270 }, { "entropy": 0.6814588725566864, "epoch": 0.45024, "grad_norm": 2.384087562561035, "learning_rate": 2.7499399759903964e-05, "loss": 0.6796, "mean_token_accuracy": 0.8062657535076141, "num_tokens": 583795576.0, "step": 56280 }, { "entropy": 0.6780976235866547, "epoch": 0.45032, "grad_norm": 3.5258607864379883, "learning_rate": 2.7495398159263708e-05, "loss": 0.6658, "mean_token_accuracy": 0.7979012489318847, "num_tokens": 583922595.0, "step": 56290 }, { "entropy": 0.6424573481082916, "epoch": 0.4504, "grad_norm": 4.364515781402588, "learning_rate": 2.749139655862345e-05, "loss": 0.6377, "mean_token_accuracy": 0.8288744032382965, "num_tokens": 583955935.0, "step": 56300 }, { "entropy": 0.6533990383148194, "epoch": 0.45048, "grad_norm": 1.86171555519104, "learning_rate": 2.748739495798319e-05, "loss": 0.6576, "mean_token_accuracy": 0.7908830046653748, "num_tokens": 584119775.0, "step": 56310 }, { "entropy": 0.6791076123714447, "epoch": 0.45056, "grad_norm": 3.1145918369293213, "learning_rate": 2.748339335734294e-05, "loss": 0.6685, "mean_token_accuracy": 0.8077464520931243, "num_tokens": 584207087.0, "step": 56320 }, { "entropy": 0.7062349259853363, "epoch": 0.45064, "grad_norm": 1.7787314653396606, "learning_rate": 2.7479391756702682e-05, "loss": 0.697, "mean_token_accuracy": 0.79952432513237, "num_tokens": 584302421.0, "step": 56330 }, { "entropy": 0.7093290209770202, "epoch": 0.45072, "grad_norm": 2.339108943939209, "learning_rate": 2.7475390156062426e-05, "loss": 0.7068, "mean_token_accuracy": 0.7835953295230865, "num_tokens": 584450875.0, "step": 56340 }, { "entropy": 0.6654702633619308, "epoch": 0.4508, "grad_norm": 4.96814489364624, "learning_rate": 2.7471388555422173e-05, "loss": 0.6469, "mean_token_accuracy": 0.8264869570732116, "num_tokens": 584491909.0, "step": 56350 }, { "entropy": 0.6594749391078949, "epoch": 0.45088, "grad_norm": 2.0569090843200684, "learning_rate": 2.7467386954781914e-05, "loss": 0.6627, "mean_token_accuracy": 0.7892586648464203, "num_tokens": 584655749.0, "step": 56360 }, { "entropy": 0.6086404740810394, "epoch": 0.45096, "grad_norm": 2.961560010910034, "learning_rate": 2.7463385354141657e-05, "loss": 0.6003, "mean_token_accuracy": 0.8225260853767395, "num_tokens": 584748207.0, "step": 56370 }, { "entropy": 0.6780577719211578, "epoch": 0.45104, "grad_norm": 1.3609905242919922, "learning_rate": 2.74593837535014e-05, "loss": 0.6799, "mean_token_accuracy": 0.8054323315620422, "num_tokens": 584841948.0, "step": 56380 }, { "entropy": 0.676335233449936, "epoch": 0.45112, "grad_norm": 4.063608646392822, "learning_rate": 2.7455382152861148e-05, "loss": 0.6784, "mean_token_accuracy": 0.7951229751110077, "num_tokens": 584985003.0, "step": 56390 }, { "entropy": 0.7359497666358947, "epoch": 0.4512, "grad_norm": 5.593918323516846, "learning_rate": 2.745138055222089e-05, "loss": 0.7307, "mean_token_accuracy": 0.8073175549507141, "num_tokens": 585021840.0, "step": 56400 }, { "entropy": 0.6453340947628021, "epoch": 0.45128, "grad_norm": 2.308415174484253, "learning_rate": 2.7447378951580632e-05, "loss": 0.6385, "mean_token_accuracy": 0.7999887704849243, "num_tokens": 585179534.0, "step": 56410 }, { "entropy": 0.6558765232563019, "epoch": 0.45136, "grad_norm": 3.905472755432129, "learning_rate": 2.744337735094038e-05, "loss": 0.6537, "mean_token_accuracy": 0.8126073002815246, "num_tokens": 585241991.0, "step": 56420 }, { "entropy": 0.7275026917457581, "epoch": 0.45144, "grad_norm": 1.5639296770095825, "learning_rate": 2.7439375750300123e-05, "loss": 0.7267, "mean_token_accuracy": 0.7959590435028077, "num_tokens": 585334024.0, "step": 56430 }, { "entropy": 0.715665516257286, "epoch": 0.45152, "grad_norm": 2.264019250869751, "learning_rate": 2.7435374149659863e-05, "loss": 0.7083, "mean_token_accuracy": 0.78950714468956, "num_tokens": 585471439.0, "step": 56440 }, { "entropy": 0.6354858845472335, "epoch": 0.4516, "grad_norm": 4.104480743408203, "learning_rate": 2.7431372549019607e-05, "loss": 0.6297, "mean_token_accuracy": 0.8262633502483367, "num_tokens": 585508270.0, "step": 56450 }, { "entropy": 0.6489991128444672, "epoch": 0.45168, "grad_norm": 1.5600348711013794, "learning_rate": 2.7427370948379354e-05, "loss": 0.6491, "mean_token_accuracy": 0.7977908670902252, "num_tokens": 585671342.0, "step": 56460 }, { "entropy": 0.6725571244955063, "epoch": 0.45176, "grad_norm": 3.6126062870025635, "learning_rate": 2.7423369347739098e-05, "loss": 0.6583, "mean_token_accuracy": 0.8116998970508575, "num_tokens": 585744891.0, "step": 56470 }, { "entropy": 0.7606111705303192, "epoch": 0.45184, "grad_norm": 1.9197148084640503, "learning_rate": 2.7419367747098838e-05, "loss": 0.7643, "mean_token_accuracy": 0.7896495401859284, "num_tokens": 585838051.0, "step": 56480 }, { "entropy": 0.665202385187149, "epoch": 0.45192, "grad_norm": 2.8078205585479736, "learning_rate": 2.7415366146458582e-05, "loss": 0.6681, "mean_token_accuracy": 0.7964402735233307, "num_tokens": 585976261.0, "step": 56490 }, { "entropy": 0.689787745475769, "epoch": 0.452, "grad_norm": 4.405261039733887, "learning_rate": 2.741136454581833e-05, "loss": 0.6839, "mean_token_accuracy": 0.8145834445953369, "num_tokens": 586020047.0, "step": 56500 }, { "entropy": 0.6624597251415253, "epoch": 0.45208, "grad_norm": 1.5857902765274048, "learning_rate": 2.7407362945178073e-05, "loss": 0.6526, "mean_token_accuracy": 0.7941744029521942, "num_tokens": 586183887.0, "step": 56510 }, { "entropy": 0.7056154906749725, "epoch": 0.45216, "grad_norm": 4.8334455490112305, "learning_rate": 2.7403361344537813e-05, "loss": 0.7102, "mean_token_accuracy": 0.8015896022319794, "num_tokens": 586270155.0, "step": 56520 }, { "entropy": 0.645148640871048, "epoch": 0.45224, "grad_norm": 1.3506190776824951, "learning_rate": 2.7399359743897564e-05, "loss": 0.6526, "mean_token_accuracy": 0.809473329782486, "num_tokens": 586364828.0, "step": 56530 }, { "entropy": 0.693512350320816, "epoch": 0.45232, "grad_norm": 2.9116263389587402, "learning_rate": 2.7395358143257304e-05, "loss": 0.6791, "mean_token_accuracy": 0.7899794757366181, "num_tokens": 586508967.0, "step": 56540 }, { "entropy": 0.6815805196762085, "epoch": 0.4524, "grad_norm": 5.181509017944336, "learning_rate": 2.7391356542617048e-05, "loss": 0.6692, "mean_token_accuracy": 0.816441011428833, "num_tokens": 586550065.0, "step": 56550 }, { "entropy": 0.6929409623146057, "epoch": 0.45248, "grad_norm": 2.2328882217407227, "learning_rate": 2.7387354941976788e-05, "loss": 0.6876, "mean_token_accuracy": 0.7846421658992767, "num_tokens": 586713905.0, "step": 56560 }, { "entropy": 0.6851302087306976, "epoch": 0.45256, "grad_norm": 3.0206663608551025, "learning_rate": 2.738335334133654e-05, "loss": 0.6726, "mean_token_accuracy": 0.8026072144508362, "num_tokens": 586815461.0, "step": 56570 }, { "entropy": 0.7576663076877594, "epoch": 0.45264, "grad_norm": 1.743998646736145, "learning_rate": 2.737935174069628e-05, "loss": 0.7801, "mean_token_accuracy": 0.7853425920009613, "num_tokens": 586909382.0, "step": 56580 }, { "entropy": 0.7299654006958007, "epoch": 0.45272, "grad_norm": 2.277822494506836, "learning_rate": 2.7375350140056022e-05, "loss": 0.7129, "mean_token_accuracy": 0.783483624458313, "num_tokens": 587048905.0, "step": 56590 }, { "entropy": 0.5884771943092346, "epoch": 0.4528, "grad_norm": 4.796167373657227, "learning_rate": 2.737134853941577e-05, "loss": 0.576, "mean_token_accuracy": 0.8370664596557618, "num_tokens": 587090013.0, "step": 56600 }, { "entropy": 0.6384181380271912, "epoch": 0.45288, "grad_norm": 2.1285617351531982, "learning_rate": 2.7367346938775513e-05, "loss": 0.6431, "mean_token_accuracy": 0.797844409942627, "num_tokens": 587253853.0, "step": 56610 }, { "entropy": 0.6529269695281983, "epoch": 0.45296, "grad_norm": 3.5988917350769043, "learning_rate": 2.7363345338135254e-05, "loss": 0.6474, "mean_token_accuracy": 0.8120740115642547, "num_tokens": 587338803.0, "step": 56620 }, { "entropy": 0.6550166606903076, "epoch": 0.45304, "grad_norm": 1.3500590324401855, "learning_rate": 2.7359343737494997e-05, "loss": 0.6509, "mean_token_accuracy": 0.8110752522945404, "num_tokens": 587432064.0, "step": 56630 }, { "entropy": 0.6468331158161164, "epoch": 0.45312, "grad_norm": 1.8739286661148071, "learning_rate": 2.7355342136854744e-05, "loss": 0.6485, "mean_token_accuracy": 0.7986145973205566, "num_tokens": 587579190.0, "step": 56640 }, { "entropy": 0.5986223578453064, "epoch": 0.4532, "grad_norm": 4.389915943145752, "learning_rate": 2.7351340536214488e-05, "loss": 0.5826, "mean_token_accuracy": 0.8364690542221069, "num_tokens": 587620170.0, "step": 56650 }, { "entropy": 0.6466200411319732, "epoch": 0.45328, "grad_norm": 2.9855644702911377, "learning_rate": 2.734733893557423e-05, "loss": 0.644, "mean_token_accuracy": 0.7974353671073914, "num_tokens": 587782659.0, "step": 56660 }, { "entropy": 0.641165754199028, "epoch": 0.45336, "grad_norm": 3.364234685897827, "learning_rate": 2.734333733493398e-05, "loss": 0.6249, "mean_token_accuracy": 0.8191370069980621, "num_tokens": 587854715.0, "step": 56670 }, { "entropy": 0.7045015245676041, "epoch": 0.45344, "grad_norm": 1.6610140800476074, "learning_rate": 2.733933573429372e-05, "loss": 0.6998, "mean_token_accuracy": 0.8015368759632111, "num_tokens": 587947728.0, "step": 56680 }, { "entropy": 0.6897850751876831, "epoch": 0.45352, "grad_norm": 2.5225937366485596, "learning_rate": 2.7335334133653463e-05, "loss": 0.6855, "mean_token_accuracy": 0.7895913064479828, "num_tokens": 588088421.0, "step": 56690 }, { "entropy": 0.6572534620761872, "epoch": 0.4536, "grad_norm": 4.692235946655273, "learning_rate": 2.7331332533013203e-05, "loss": 0.6482, "mean_token_accuracy": 0.8209780335426331, "num_tokens": 588127108.0, "step": 56700 }, { "entropy": 0.6353268682956695, "epoch": 0.45368, "grad_norm": 1.8999221324920654, "learning_rate": 2.7327330932372954e-05, "loss": 0.6357, "mean_token_accuracy": 0.7981863796710968, "num_tokens": 588290948.0, "step": 56710 }, { "entropy": 0.7479595601558685, "epoch": 0.45376, "grad_norm": 3.3266773223876953, "learning_rate": 2.7323329331732694e-05, "loss": 0.738, "mean_token_accuracy": 0.7897676348686218, "num_tokens": 588382376.0, "step": 56720 }, { "entropy": 0.6718631386756897, "epoch": 0.45384, "grad_norm": 1.5785845518112183, "learning_rate": 2.7319327731092438e-05, "loss": 0.6624, "mean_token_accuracy": 0.8080162286758423, "num_tokens": 588476962.0, "step": 56730 }, { "entropy": 0.6607308030128479, "epoch": 0.45392, "grad_norm": 3.3522796630859375, "learning_rate": 2.7315326130452185e-05, "loss": 0.666, "mean_token_accuracy": 0.7934022545814514, "num_tokens": 588614669.0, "step": 56740 }, { "entropy": 0.605771291255951, "epoch": 0.454, "grad_norm": 3.976864814758301, "learning_rate": 2.731132452981193e-05, "loss": 0.5912, "mean_token_accuracy": 0.8368227660655976, "num_tokens": 588654534.0, "step": 56750 }, { "entropy": 0.6528398334980011, "epoch": 0.45408, "grad_norm": 1.706484079360962, "learning_rate": 2.730732292917167e-05, "loss": 0.6542, "mean_token_accuracy": 0.7927053093910217, "num_tokens": 588816890.0, "step": 56760 }, { "entropy": 0.6341929316520691, "epoch": 0.45416, "grad_norm": 2.9754793643951416, "learning_rate": 2.7303321328531413e-05, "loss": 0.6288, "mean_token_accuracy": 0.8193788528442383, "num_tokens": 588896123.0, "step": 56770 }, { "entropy": 0.6504580557346344, "epoch": 0.45424, "grad_norm": 1.739629864692688, "learning_rate": 2.729931972789116e-05, "loss": 0.6378, "mean_token_accuracy": 0.8095587491989136, "num_tokens": 588990016.0, "step": 56780 }, { "entropy": 0.6429086595773696, "epoch": 0.45432, "grad_norm": 2.882275342941284, "learning_rate": 2.7295318127250903e-05, "loss": 0.641, "mean_token_accuracy": 0.8040838658809661, "num_tokens": 589114428.0, "step": 56790 }, { "entropy": 0.6226536393165588, "epoch": 0.4544, "grad_norm": 5.112702369689941, "learning_rate": 2.7291316526610644e-05, "loss": 0.6137, "mean_token_accuracy": 0.8347363591194152, "num_tokens": 589149957.0, "step": 56800 }, { "entropy": 0.6294266402721405, "epoch": 0.45448, "grad_norm": 1.5204591751098633, "learning_rate": 2.728731492597039e-05, "loss": 0.628, "mean_token_accuracy": 0.8004457890987396, "num_tokens": 589313797.0, "step": 56810 }, { "entropy": 0.7021155238151551, "epoch": 0.45456, "grad_norm": 2.9378163814544678, "learning_rate": 2.7283313325330135e-05, "loss": 0.7012, "mean_token_accuracy": 0.7958670854568481, "num_tokens": 589409243.0, "step": 56820 }, { "entropy": 0.7022665202617645, "epoch": 0.45464, "grad_norm": 1.945419430732727, "learning_rate": 2.7279311724689878e-05, "loss": 0.6813, "mean_token_accuracy": 0.8013947248458863, "num_tokens": 589503915.0, "step": 56830 }, { "entropy": 0.6647507548332214, "epoch": 0.45472, "grad_norm": 3.0228078365325928, "learning_rate": 2.727531012404962e-05, "loss": 0.6609, "mean_token_accuracy": 0.7951490581035614, "num_tokens": 589648344.0, "step": 56840 }, { "entropy": 0.6472451895475387, "epoch": 0.4548, "grad_norm": 4.350048542022705, "learning_rate": 2.7271308523409366e-05, "loss": 0.6456, "mean_token_accuracy": 0.8245830953121185, "num_tokens": 589685877.0, "step": 56850 }, { "entropy": 0.6670118391513824, "epoch": 0.45488, "grad_norm": 2.3271396160125732, "learning_rate": 2.726730692276911e-05, "loss": 0.6679, "mean_token_accuracy": 0.7932336926460266, "num_tokens": 589847478.0, "step": 56860 }, { "entropy": 0.7034402042627335, "epoch": 0.45496, "grad_norm": 3.8354077339172363, "learning_rate": 2.7263305322128853e-05, "loss": 0.6945, "mean_token_accuracy": 0.8064355254173279, "num_tokens": 589915214.0, "step": 56870 }, { "entropy": 0.6790139079093933, "epoch": 0.45504, "grad_norm": 1.3938465118408203, "learning_rate": 2.72593037214886e-05, "loss": 0.6762, "mean_token_accuracy": 0.8035619556903839, "num_tokens": 590009134.0, "step": 56880 }, { "entropy": 0.6811562359333039, "epoch": 0.45512, "grad_norm": 3.994872570037842, "learning_rate": 2.725530212084834e-05, "loss": 0.6712, "mean_token_accuracy": 0.7945676863193512, "num_tokens": 590153891.0, "step": 56890 }, { "entropy": 0.6288545250892639, "epoch": 0.4552, "grad_norm": 4.3559393882751465, "learning_rate": 2.7251300520208084e-05, "loss": 0.6219, "mean_token_accuracy": 0.8265578925609589, "num_tokens": 590196225.0, "step": 56900 }, { "entropy": 0.6358034431934356, "epoch": 0.45528, "grad_norm": 2.086804151535034, "learning_rate": 2.7247298919567828e-05, "loss": 0.6313, "mean_token_accuracy": 0.7984644651412964, "num_tokens": 590359712.0, "step": 56910 }, { "entropy": 0.573990136384964, "epoch": 0.45536, "grad_norm": 4.384173393249512, "learning_rate": 2.7243297318927575e-05, "loss": 0.574, "mean_token_accuracy": 0.8333340466022492, "num_tokens": 590429463.0, "step": 56920 }, { "entropy": 0.6728830873966217, "epoch": 0.45544, "grad_norm": 1.2963193655014038, "learning_rate": 2.7239295718287315e-05, "loss": 0.6659, "mean_token_accuracy": 0.8104761898517608, "num_tokens": 590521061.0, "step": 56930 }, { "entropy": 0.7020517766475678, "epoch": 0.45552, "grad_norm": 2.5069026947021484, "learning_rate": 2.723529411764706e-05, "loss": 0.699, "mean_token_accuracy": 0.7831700384616852, "num_tokens": 590670620.0, "step": 56940 }, { "entropy": 0.6699251115322113, "epoch": 0.4556, "grad_norm": 4.280128002166748, "learning_rate": 2.7231292517006806e-05, "loss": 0.6753, "mean_token_accuracy": 0.8156419813632965, "num_tokens": 590714385.0, "step": 56950 }, { "entropy": 0.6966283619403839, "epoch": 0.45568, "grad_norm": 1.7477842569351196, "learning_rate": 2.722729091636655e-05, "loss": 0.6934, "mean_token_accuracy": 0.7849395155906678, "num_tokens": 590875128.0, "step": 56960 }, { "entropy": 0.6516918540000916, "epoch": 0.45576, "grad_norm": 3.801126003265381, "learning_rate": 2.722328931572629e-05, "loss": 0.6362, "mean_token_accuracy": 0.8153157711029053, "num_tokens": 590948194.0, "step": 56970 }, { "entropy": 0.6780352771282196, "epoch": 0.45584, "grad_norm": 1.7070305347442627, "learning_rate": 2.7219287715086034e-05, "loss": 0.7018, "mean_token_accuracy": 0.8009169399738312, "num_tokens": 591039377.0, "step": 56980 }, { "entropy": 0.6908856689929962, "epoch": 0.45592, "grad_norm": 2.444883108139038, "learning_rate": 2.721528611444578e-05, "loss": 0.6812, "mean_token_accuracy": 0.7958712100982666, "num_tokens": 591176159.0, "step": 56990 }, { "entropy": 0.6420352071523666, "epoch": 0.456, "grad_norm": 5.44509220123291, "learning_rate": 2.7211284513805525e-05, "loss": 0.6406, "mean_token_accuracy": 0.8268548786640167, "num_tokens": 591217216.0, "step": 57000 }, { "entropy": 0.6231737971305847, "epoch": 0.45608, "grad_norm": 1.4433503150939941, "learning_rate": 2.7207282913165265e-05, "loss": 0.6177, "mean_token_accuracy": 0.8012945771217346, "num_tokens": 591381056.0, "step": 57010 }, { "entropy": 0.7315119624137878, "epoch": 0.45616, "grad_norm": 2.972447395324707, "learning_rate": 2.720328131252501e-05, "loss": 0.7237, "mean_token_accuracy": 0.791402792930603, "num_tokens": 591473355.0, "step": 57020 }, { "entropy": 0.7258749365806579, "epoch": 0.45624, "grad_norm": 2.4181368350982666, "learning_rate": 2.7199279711884756e-05, "loss": 0.7261, "mean_token_accuracy": 0.7922055661678314, "num_tokens": 591566288.0, "step": 57030 }, { "entropy": 0.6555361032485962, "epoch": 0.45632, "grad_norm": 2.874004364013672, "learning_rate": 2.71952781112445e-05, "loss": 0.6519, "mean_token_accuracy": 0.7993446350097656, "num_tokens": 591709908.0, "step": 57040 }, { "entropy": 0.6637606203556061, "epoch": 0.4564, "grad_norm": 5.140113830566406, "learning_rate": 2.719127651060424e-05, "loss": 0.6565, "mean_token_accuracy": 0.8185456216335296, "num_tokens": 591751768.0, "step": 57050 }, { "entropy": 0.6733482301235199, "epoch": 0.45648, "grad_norm": 1.6205580234527588, "learning_rate": 2.718727490996399e-05, "loss": 0.6722, "mean_token_accuracy": 0.788302743434906, "num_tokens": 591915285.0, "step": 57060 }, { "entropy": 0.6565291374921799, "epoch": 0.45656, "grad_norm": 3.138131618499756, "learning_rate": 2.718327330932373e-05, "loss": 0.6491, "mean_token_accuracy": 0.8181072890758514, "num_tokens": 591990563.0, "step": 57070 }, { "entropy": 0.729056316614151, "epoch": 0.45664, "grad_norm": 3.3916354179382324, "learning_rate": 2.7179271708683475e-05, "loss": 0.7312, "mean_token_accuracy": 0.7945886731147767, "num_tokens": 592081823.0, "step": 57080 }, { "entropy": 0.6643036663532257, "epoch": 0.45672, "grad_norm": 1.9118382930755615, "learning_rate": 2.7175270108043215e-05, "loss": 0.6638, "mean_token_accuracy": 0.7946889936923981, "num_tokens": 592229367.0, "step": 57090 }, { "entropy": 0.7005625486373901, "epoch": 0.4568, "grad_norm": 4.3260698318481445, "learning_rate": 2.7171268507402965e-05, "loss": 0.6852, "mean_token_accuracy": 0.817497730255127, "num_tokens": 592275871.0, "step": 57100 }, { "entropy": 0.6152383327484131, "epoch": 0.45688, "grad_norm": 1.956817865371704, "learning_rate": 2.7167266906762706e-05, "loss": 0.6135, "mean_token_accuracy": 0.8058613002300262, "num_tokens": 592436731.0, "step": 57110 }, { "entropy": 0.6397590309381485, "epoch": 0.45696, "grad_norm": 3.3119428157806396, "learning_rate": 2.716326530612245e-05, "loss": 0.6322, "mean_token_accuracy": 0.8189010620117188, "num_tokens": 592508451.0, "step": 57120 }, { "entropy": 0.7216467201709748, "epoch": 0.45704, "grad_norm": 1.5587538480758667, "learning_rate": 2.7159263705482196e-05, "loss": 0.7276, "mean_token_accuracy": 0.7921978890895843, "num_tokens": 592600595.0, "step": 57130 }, { "entropy": 0.7378726899623871, "epoch": 0.45712, "grad_norm": 2.222849130630493, "learning_rate": 2.715526210484194e-05, "loss": 0.7264, "mean_token_accuracy": 0.7836615920066834, "num_tokens": 592747937.0, "step": 57140 }, { "entropy": 0.6702654838562012, "epoch": 0.4572, "grad_norm": 4.989758014678955, "learning_rate": 2.715126050420168e-05, "loss": 0.6694, "mean_token_accuracy": 0.8154275596141816, "num_tokens": 592792376.0, "step": 57150 }, { "entropy": 0.6744053840637207, "epoch": 0.45728, "grad_norm": 1.6174300909042358, "learning_rate": 2.7147258903561424e-05, "loss": 0.6709, "mean_token_accuracy": 0.7875886619091034, "num_tokens": 592954261.0, "step": 57160 }, { "entropy": 0.5890054553747177, "epoch": 0.45736, "grad_norm": 4.5278496742248535, "learning_rate": 2.714325730292117e-05, "loss": 0.5817, "mean_token_accuracy": 0.8291429817676544, "num_tokens": 593030842.0, "step": 57170 }, { "entropy": 0.6922035813331604, "epoch": 0.45744, "grad_norm": 2.086487293243408, "learning_rate": 2.7139255702280915e-05, "loss": 0.6914, "mean_token_accuracy": 0.806738531589508, "num_tokens": 593124604.0, "step": 57180 }, { "entropy": 0.6722366809844971, "epoch": 0.45752, "grad_norm": 3.3646957874298096, "learning_rate": 2.7135254101640655e-05, "loss": 0.6694, "mean_token_accuracy": 0.7945730984210968, "num_tokens": 593262987.0, "step": 57190 }, { "entropy": 0.6885750025510788, "epoch": 0.4576, "grad_norm": 4.599200248718262, "learning_rate": 2.7131252501000402e-05, "loss": 0.6835, "mean_token_accuracy": 0.8131159901618957, "num_tokens": 593302338.0, "step": 57200 }, { "entropy": 0.6731752216815948, "epoch": 0.45768, "grad_norm": 1.5627790689468384, "learning_rate": 2.7127250900360146e-05, "loss": 0.6624, "mean_token_accuracy": 0.7896983444690704, "num_tokens": 593466178.0, "step": 57210 }, { "entropy": 0.6992650210857392, "epoch": 0.45776, "grad_norm": 3.3905029296875, "learning_rate": 2.712324929971989e-05, "loss": 0.6988, "mean_token_accuracy": 0.8007776260375976, "num_tokens": 593553759.0, "step": 57220 }, { "entropy": 0.729718017578125, "epoch": 0.45784, "grad_norm": 2.1695988178253174, "learning_rate": 2.711924769907963e-05, "loss": 0.7276, "mean_token_accuracy": 0.7907111823558808, "num_tokens": 593648061.0, "step": 57230 }, { "entropy": 0.7176078796386719, "epoch": 0.45792, "grad_norm": 2.6350717544555664, "learning_rate": 2.7115246098439377e-05, "loss": 0.7152, "mean_token_accuracy": 0.7843230128288269, "num_tokens": 593787725.0, "step": 57240 }, { "entropy": 0.6853373527526856, "epoch": 0.458, "grad_norm": 4.631483554840088, "learning_rate": 2.711124449779912e-05, "loss": 0.6838, "mean_token_accuracy": 0.8177623808383941, "num_tokens": 593829695.0, "step": 57250 }, { "entropy": 0.6909034430980683, "epoch": 0.45808, "grad_norm": 2.2028276920318604, "learning_rate": 2.7107242897158865e-05, "loss": 0.6823, "mean_token_accuracy": 0.7862664937973023, "num_tokens": 593993535.0, "step": 57260 }, { "entropy": 0.6412106692790985, "epoch": 0.45816, "grad_norm": 2.9497532844543457, "learning_rate": 2.7103241296518612e-05, "loss": 0.6467, "mean_token_accuracy": 0.8100188970565796, "num_tokens": 594081926.0, "step": 57270 }, { "entropy": 0.6769674181938171, "epoch": 0.45824, "grad_norm": 2.0176961421966553, "learning_rate": 2.7099239695878352e-05, "loss": 0.6661, "mean_token_accuracy": 0.8066766083240509, "num_tokens": 594176346.0, "step": 57280 }, { "entropy": 0.6979789555072784, "epoch": 0.45832, "grad_norm": 2.334465503692627, "learning_rate": 2.7095238095238096e-05, "loss": 0.6969, "mean_token_accuracy": 0.7906414628028869, "num_tokens": 594311430.0, "step": 57290 }, { "entropy": 0.716841584444046, "epoch": 0.4584, "grad_norm": 4.288904666900635, "learning_rate": 2.709123649459784e-05, "loss": 0.7067, "mean_token_accuracy": 0.8108080267906189, "num_tokens": 594342357.0, "step": 57300 }, { "entropy": 0.6878905177116394, "epoch": 0.45848, "grad_norm": 2.108041763305664, "learning_rate": 2.7087234893957587e-05, "loss": 0.6859, "mean_token_accuracy": 0.7881563007831573, "num_tokens": 594504668.0, "step": 57310 }, { "entropy": 0.6259435623884201, "epoch": 0.45856, "grad_norm": 3.916633367538452, "learning_rate": 2.7083233293317327e-05, "loss": 0.617, "mean_token_accuracy": 0.8235197067260742, "num_tokens": 594575600.0, "step": 57320 }, { "entropy": 0.67325000166893, "epoch": 0.45864, "grad_norm": 1.776684284210205, "learning_rate": 2.707923169267707e-05, "loss": 0.6755, "mean_token_accuracy": 0.8042945981025695, "num_tokens": 594669511.0, "step": 57330 }, { "entropy": 0.6489844977855682, "epoch": 0.45872, "grad_norm": 3.6692938804626465, "learning_rate": 2.7075230092036818e-05, "loss": 0.6497, "mean_token_accuracy": 0.7994626939296723, "num_tokens": 594805556.0, "step": 57340 }, { "entropy": 0.6441749215126038, "epoch": 0.4588, "grad_norm": 4.525753974914551, "learning_rate": 2.707122849139656e-05, "loss": 0.6273, "mean_token_accuracy": 0.8339492321014405, "num_tokens": 594840215.0, "step": 57350 }, { "entropy": 0.6783759891986847, "epoch": 0.45888, "grad_norm": 1.4922969341278076, "learning_rate": 2.7067226890756302e-05, "loss": 0.6776, "mean_token_accuracy": 0.790156751871109, "num_tokens": 595003128.0, "step": 57360 }, { "entropy": 0.6742280721664429, "epoch": 0.45896, "grad_norm": 3.4202873706817627, "learning_rate": 2.7063225290116046e-05, "loss": 0.6645, "mean_token_accuracy": 0.8116926193237305, "num_tokens": 595076008.0, "step": 57370 }, { "entropy": 0.6902198731899262, "epoch": 0.45904, "grad_norm": 1.9951688051223755, "learning_rate": 2.7059223689475793e-05, "loss": 0.6796, "mean_token_accuracy": 0.806923508644104, "num_tokens": 595168963.0, "step": 57380 }, { "entropy": 0.6384823501110077, "epoch": 0.45912, "grad_norm": 1.884855031967163, "learning_rate": 2.7055222088835536e-05, "loss": 0.6393, "mean_token_accuracy": 0.801499730348587, "num_tokens": 595306609.0, "step": 57390 }, { "entropy": 0.6540240108966827, "epoch": 0.4592, "grad_norm": 5.231439590454102, "learning_rate": 2.7051220488195277e-05, "loss": 0.6507, "mean_token_accuracy": 0.8260059237480164, "num_tokens": 595344544.0, "step": 57400 }, { "entropy": 0.6369234442710876, "epoch": 0.45928, "grad_norm": 2.293226480484009, "learning_rate": 2.7047218887555027e-05, "loss": 0.6329, "mean_token_accuracy": 0.7961529135704041, "num_tokens": 595508384.0, "step": 57410 }, { "entropy": 0.6699405312538147, "epoch": 0.45936, "grad_norm": 3.1172714233398438, "learning_rate": 2.7043217286914768e-05, "loss": 0.6686, "mean_token_accuracy": 0.8085690557956695, "num_tokens": 595607748.0, "step": 57420 }, { "entropy": 0.6843488335609436, "epoch": 0.45944, "grad_norm": 1.640949010848999, "learning_rate": 2.703921568627451e-05, "loss": 0.6845, "mean_token_accuracy": 0.801767086982727, "num_tokens": 595702603.0, "step": 57430 }, { "entropy": 0.6605508089065552, "epoch": 0.45952, "grad_norm": 2.056577682495117, "learning_rate": 2.703521408563425e-05, "loss": 0.6561, "mean_token_accuracy": 0.7965267241001129, "num_tokens": 595844117.0, "step": 57440 }, { "entropy": 0.7095784485340119, "epoch": 0.4596, "grad_norm": 4.38955545425415, "learning_rate": 2.7031212484994002e-05, "loss": 0.7265, "mean_token_accuracy": 0.8084512829780579, "num_tokens": 595886389.0, "step": 57450 }, { "entropy": 0.6669181287288666, "epoch": 0.45968, "grad_norm": 2.5595812797546387, "learning_rate": 2.7027210884353742e-05, "loss": 0.664, "mean_token_accuracy": 0.7898021459579467, "num_tokens": 596050229.0, "step": 57460 }, { "entropy": 0.6279004603624344, "epoch": 0.45976, "grad_norm": 3.590696096420288, "learning_rate": 2.7023209283713486e-05, "loss": 0.6222, "mean_token_accuracy": 0.8202442228794098, "num_tokens": 596139940.0, "step": 57470 }, { "entropy": 0.7301205575466156, "epoch": 0.45984, "grad_norm": 1.705823302268982, "learning_rate": 2.7019207683073233e-05, "loss": 0.7288, "mean_token_accuracy": 0.7944451689720153, "num_tokens": 596238813.0, "step": 57480 }, { "entropy": 0.6775190889835357, "epoch": 0.45992, "grad_norm": 2.9995923042297363, "learning_rate": 2.7015206082432977e-05, "loss": 0.6741, "mean_token_accuracy": 0.7985572993755341, "num_tokens": 596363084.0, "step": 57490 }, { "entropy": 0.7215378105640411, "epoch": 0.46, "grad_norm": 4.2823591232299805, "learning_rate": 2.7011204481792717e-05, "loss": 0.7124, "mean_token_accuracy": 0.8107360005378723, "num_tokens": 596399808.0, "step": 57500 }, { "entropy": 0.64555983543396, "epoch": 0.46008, "grad_norm": 2.105365753173828, "learning_rate": 2.700720288115246e-05, "loss": 0.6421, "mean_token_accuracy": 0.7968918323516846, "num_tokens": 596563457.0, "step": 57510 }, { "entropy": 0.6402596026659012, "epoch": 0.46016, "grad_norm": 3.2595043182373047, "learning_rate": 2.7003201280512208e-05, "loss": 0.6278, "mean_token_accuracy": 0.8242060005664825, "num_tokens": 596636839.0, "step": 57520 }, { "entropy": 0.7077139616012573, "epoch": 0.46024, "grad_norm": 1.8377896547317505, "learning_rate": 2.6999199679871952e-05, "loss": 0.7052, "mean_token_accuracy": 0.7957395017147064, "num_tokens": 596730511.0, "step": 57530 }, { "entropy": 0.709030294418335, "epoch": 0.46032, "grad_norm": 2.3798952102661133, "learning_rate": 2.6995198079231692e-05, "loss": 0.6978, "mean_token_accuracy": 0.7856214344501495, "num_tokens": 596874005.0, "step": 57540 }, { "entropy": 0.6139178216457367, "epoch": 0.4604, "grad_norm": 4.31465482711792, "learning_rate": 2.699119647859144e-05, "loss": 0.6075, "mean_token_accuracy": 0.8308703303337097, "num_tokens": 596913534.0, "step": 57550 }, { "entropy": 0.6225398480892181, "epoch": 0.46048, "grad_norm": 1.6634318828582764, "learning_rate": 2.6987194877951183e-05, "loss": 0.617, "mean_token_accuracy": 0.8012349307537079, "num_tokens": 597077354.0, "step": 57560 }, { "entropy": 0.7674554526805878, "epoch": 0.46056, "grad_norm": 3.14117431640625, "learning_rate": 2.6983193277310927e-05, "loss": 0.7634, "mean_token_accuracy": 0.7839680194854737, "num_tokens": 597165854.0, "step": 57570 }, { "entropy": 0.6518697500228882, "epoch": 0.46064, "grad_norm": 2.5564568042755127, "learning_rate": 2.6979191676670667e-05, "loss": 0.6475, "mean_token_accuracy": 0.8098453402519226, "num_tokens": 597261033.0, "step": 57580 }, { "entropy": 0.716539716720581, "epoch": 0.46072, "grad_norm": 2.8572676181793213, "learning_rate": 2.6975190076030414e-05, "loss": 0.7149, "mean_token_accuracy": 0.7846482574939728, "num_tokens": 597397280.0, "step": 57590 }, { "entropy": 0.6145051509141922, "epoch": 0.4608, "grad_norm": 3.9252047538757324, "learning_rate": 2.6971188475390158e-05, "loss": 0.6114, "mean_token_accuracy": 0.8311957120895386, "num_tokens": 597440612.0, "step": 57600 }, { "entropy": 0.6295566141605378, "epoch": 0.46088, "grad_norm": 1.3692363500595093, "learning_rate": 2.69671868747499e-05, "loss": 0.6301, "mean_token_accuracy": 0.7981253087520599, "num_tokens": 597604452.0, "step": 57610 }, { "entropy": 0.6741179555654526, "epoch": 0.46096, "grad_norm": 3.9170877933502197, "learning_rate": 2.6963185274109642e-05, "loss": 0.6647, "mean_token_accuracy": 0.8081070363521576, "num_tokens": 597695110.0, "step": 57620 }, { "entropy": 0.6786292433738709, "epoch": 0.46104, "grad_norm": 1.6115692853927612, "learning_rate": 2.695918367346939e-05, "loss": 0.671, "mean_token_accuracy": 0.8118571281433106, "num_tokens": 597790664.0, "step": 57630 }, { "entropy": 0.6978968203067779, "epoch": 0.46112, "grad_norm": 2.4892611503601074, "learning_rate": 2.6955182072829133e-05, "loss": 0.706, "mean_token_accuracy": 0.7853423535823822, "num_tokens": 597934067.0, "step": 57640 }, { "entropy": 0.6505186885595322, "epoch": 0.4612, "grad_norm": 4.999896049499512, "learning_rate": 2.6951180472188876e-05, "loss": 0.6354, "mean_token_accuracy": 0.826836109161377, "num_tokens": 597976763.0, "step": 57650 }, { "entropy": 0.6402724623680115, "epoch": 0.46128, "grad_norm": 1.6942774057388306, "learning_rate": 2.6947178871548623e-05, "loss": 0.6367, "mean_token_accuracy": 0.8007510960102081, "num_tokens": 598140603.0, "step": 57660 }, { "entropy": 0.6527774930000305, "epoch": 0.46136, "grad_norm": 3.260821580886841, "learning_rate": 2.6943177270908364e-05, "loss": 0.647, "mean_token_accuracy": 0.8102624297142029, "num_tokens": 598237848.0, "step": 57670 }, { "entropy": 0.7245939075946808, "epoch": 0.46144, "grad_norm": 2.1255886554718018, "learning_rate": 2.6939175670268108e-05, "loss": 0.7221, "mean_token_accuracy": 0.7942495763301849, "num_tokens": 598334476.0, "step": 57680 }, { "entropy": 0.6578009128570557, "epoch": 0.46152, "grad_norm": 2.116656541824341, "learning_rate": 2.693517406962785e-05, "loss": 0.6514, "mean_token_accuracy": 0.8023414075374603, "num_tokens": 598466316.0, "step": 57690 }, { "entropy": 0.6170989841222763, "epoch": 0.4616, "grad_norm": 3.728635311126709, "learning_rate": 2.69311724689876e-05, "loss": 0.6088, "mean_token_accuracy": 0.8313979923725128, "num_tokens": 598505298.0, "step": 57700 }, { "entropy": 0.6519715189933777, "epoch": 0.46168, "grad_norm": 1.3644911050796509, "learning_rate": 2.692717086834734e-05, "loss": 0.659, "mean_token_accuracy": 0.7935698628425598, "num_tokens": 598669138.0, "step": 57710 }, { "entropy": 0.6559308707714081, "epoch": 0.46176, "grad_norm": 2.884247303009033, "learning_rate": 2.6923169267707082e-05, "loss": 0.642, "mean_token_accuracy": 0.8132949948310852, "num_tokens": 598756942.0, "step": 57720 }, { "entropy": 0.7097295224666595, "epoch": 0.46184, "grad_norm": 2.335216522216797, "learning_rate": 2.691916766706683e-05, "loss": 0.7131, "mean_token_accuracy": 0.8027016341686248, "num_tokens": 598848929.0, "step": 57730 }, { "entropy": 0.6853990763425827, "epoch": 0.46192, "grad_norm": 3.130735158920288, "learning_rate": 2.6915166066426573e-05, "loss": 0.6859, "mean_token_accuracy": 0.7935595989227295, "num_tokens": 598993034.0, "step": 57740 }, { "entropy": 0.6623711466789246, "epoch": 0.462, "grad_norm": 4.827464580535889, "learning_rate": 2.6911164465786313e-05, "loss": 0.6586, "mean_token_accuracy": 0.8180162847042084, "num_tokens": 599035629.0, "step": 57750 }, { "entropy": 0.6621070802211761, "epoch": 0.46208, "grad_norm": 1.85184645652771, "learning_rate": 2.6907162865146057e-05, "loss": 0.6596, "mean_token_accuracy": 0.7917437136173249, "num_tokens": 599195795.0, "step": 57760 }, { "entropy": 0.6469086885452271, "epoch": 0.46216, "grad_norm": 3.660844087600708, "learning_rate": 2.6903161264505804e-05, "loss": 0.6361, "mean_token_accuracy": 0.8149675846099853, "num_tokens": 599264608.0, "step": 57770 }, { "entropy": 0.6718162298202515, "epoch": 0.46224, "grad_norm": 1.468314528465271, "learning_rate": 2.6899159663865548e-05, "loss": 0.665, "mean_token_accuracy": 0.809390127658844, "num_tokens": 599357084.0, "step": 57780 }, { "entropy": 0.606198662519455, "epoch": 0.46232, "grad_norm": 3.05824613571167, "learning_rate": 2.689515806322529e-05, "loss": 0.6077, "mean_token_accuracy": 0.8111180424690246, "num_tokens": 599498081.0, "step": 57790 }, { "entropy": 0.6323683679103851, "epoch": 0.4624, "grad_norm": 6.277272701263428, "learning_rate": 2.689115646258504e-05, "loss": 0.6173, "mean_token_accuracy": 0.8336600005626679, "num_tokens": 599535910.0, "step": 57800 }, { "entropy": 0.654081153869629, "epoch": 0.46248, "grad_norm": 1.6795490980148315, "learning_rate": 2.688715486194478e-05, "loss": 0.6498, "mean_token_accuracy": 0.7928156673908233, "num_tokens": 599699367.0, "step": 57810 }, { "entropy": 0.7254881858825684, "epoch": 0.46256, "grad_norm": 3.0886902809143066, "learning_rate": 2.6883153261304523e-05, "loss": 0.719, "mean_token_accuracy": 0.8008390545845032, "num_tokens": 599775171.0, "step": 57820 }, { "entropy": 0.6336312711238861, "epoch": 0.46264, "grad_norm": 2.208951234817505, "learning_rate": 2.6879151660664263e-05, "loss": 0.6481, "mean_token_accuracy": 0.8131009101867676, "num_tokens": 599869535.0, "step": 57830 }, { "entropy": 0.7181447446346283, "epoch": 0.46272, "grad_norm": 3.514486074447632, "learning_rate": 2.6875150060024014e-05, "loss": 0.7145, "mean_token_accuracy": 0.7859559416770935, "num_tokens": 600003153.0, "step": 57840 }, { "entropy": 0.6456370264291763, "epoch": 0.4628, "grad_norm": 4.203283786773682, "learning_rate": 2.6871148459383754e-05, "loss": 0.6211, "mean_token_accuracy": 0.8291166961193085, "num_tokens": 600044235.0, "step": 57850 }, { "entropy": 0.6975015044212342, "epoch": 0.46288, "grad_norm": 2.3435423374176025, "learning_rate": 2.6867146858743498e-05, "loss": 0.6971, "mean_token_accuracy": 0.7840192914009094, "num_tokens": 600208075.0, "step": 57860 }, { "entropy": 0.6743583172559738, "epoch": 0.46296, "grad_norm": 5.109890460968018, "learning_rate": 2.6863145258103245e-05, "loss": 0.6736, "mean_token_accuracy": 0.804866349697113, "num_tokens": 600307755.0, "step": 57870 }, { "entropy": 0.6466786056756973, "epoch": 0.46304, "grad_norm": 1.7516753673553467, "learning_rate": 2.685914365746299e-05, "loss": 0.6362, "mean_token_accuracy": 0.8141534149646759, "num_tokens": 600401268.0, "step": 57880 }, { "entropy": 0.6700060665607452, "epoch": 0.46312, "grad_norm": 2.799567937850952, "learning_rate": 2.685514205682273e-05, "loss": 0.6643, "mean_token_accuracy": 0.7940884113311768, "num_tokens": 600550977.0, "step": 57890 }, { "entropy": 0.7493816494941712, "epoch": 0.4632, "grad_norm": 4.273106575012207, "learning_rate": 2.6851140456182473e-05, "loss": 0.7417, "mean_token_accuracy": 0.7992721259593963, "num_tokens": 600594407.0, "step": 57900 }, { "entropy": 0.6648769915103913, "epoch": 0.46328, "grad_norm": 2.1440131664276123, "learning_rate": 2.684713885554222e-05, "loss": 0.6605, "mean_token_accuracy": 0.7948827683925629, "num_tokens": 600758247.0, "step": 57910 }, { "entropy": 0.6702032804489135, "epoch": 0.46336, "grad_norm": 4.0291948318481445, "learning_rate": 2.6843137254901963e-05, "loss": 0.6672, "mean_token_accuracy": 0.8051204979419708, "num_tokens": 600848773.0, "step": 57920 }, { "entropy": 0.6530974864959717, "epoch": 0.46344, "grad_norm": 2.8986244201660156, "learning_rate": 2.6839135654261704e-05, "loss": 0.6551, "mean_token_accuracy": 0.8095817148685456, "num_tokens": 600941151.0, "step": 57930 }, { "entropy": 0.7150258183479309, "epoch": 0.46352, "grad_norm": 2.445610523223877, "learning_rate": 2.683513405362145e-05, "loss": 0.7096, "mean_token_accuracy": 0.7871583521366119, "num_tokens": 601079988.0, "step": 57940 }, { "entropy": 0.6128179997205734, "epoch": 0.4636, "grad_norm": 5.861443996429443, "learning_rate": 2.6831132452981195e-05, "loss": 0.6127, "mean_token_accuracy": 0.8339645087718963, "num_tokens": 601119563.0, "step": 57950 }, { "entropy": 0.6333119213581085, "epoch": 0.46368, "grad_norm": 2.0772786140441895, "learning_rate": 2.6827130852340938e-05, "loss": 0.6327, "mean_token_accuracy": 0.8018495678901673, "num_tokens": 601277380.0, "step": 57960 }, { "entropy": 0.5269644230604171, "epoch": 0.46376, "grad_norm": 4.216315269470215, "learning_rate": 2.682312925170068e-05, "loss": 0.5155, "mean_token_accuracy": 0.8498015642166138, "num_tokens": 601345639.0, "step": 57970 }, { "entropy": 0.6841938018798828, "epoch": 0.46384, "grad_norm": 2.0686097145080566, "learning_rate": 2.6819127651060426e-05, "loss": 0.6828, "mean_token_accuracy": 0.8066856682300567, "num_tokens": 601437863.0, "step": 57980 }, { "entropy": 0.6579491913318634, "epoch": 0.46392, "grad_norm": 3.013996124267578, "learning_rate": 2.681512605042017e-05, "loss": 0.6562, "mean_token_accuracy": 0.8027407348155975, "num_tokens": 601559192.0, "step": 57990 }, { "entropy": 0.6784598588943481, "epoch": 0.464, "grad_norm": 4.078954219818115, "learning_rate": 2.6811124449779913e-05, "loss": 0.6583, "mean_token_accuracy": 0.8234084725379944, "num_tokens": 601595578.0, "step": 58000 }, { "entropy": 0.6750770926475524, "epoch": 0.46408, "grad_norm": 1.8711249828338623, "learning_rate": 2.680712284913966e-05, "loss": 0.6785, "mean_token_accuracy": 0.7886480271816254, "num_tokens": 601759418.0, "step": 58010 }, { "entropy": 0.6720607280731201, "epoch": 0.46416, "grad_norm": 2.4161622524261475, "learning_rate": 2.68031212484994e-05, "loss": 0.6572, "mean_token_accuracy": 0.807978355884552, "num_tokens": 601854588.0, "step": 58020 }, { "entropy": 0.6192223787307739, "epoch": 0.46424, "grad_norm": 2.1016299724578857, "learning_rate": 2.6799119647859144e-05, "loss": 0.6309, "mean_token_accuracy": 0.8177457928657532, "num_tokens": 601948469.0, "step": 58030 }, { "entropy": 0.6764066517353058, "epoch": 0.46432, "grad_norm": 2.7089123725891113, "learning_rate": 2.6795118047218888e-05, "loss": 0.6766, "mean_token_accuracy": 0.7905055463314057, "num_tokens": 602081497.0, "step": 58040 }, { "entropy": 0.6297292083501815, "epoch": 0.4644, "grad_norm": 4.942296981811523, "learning_rate": 2.6791116446578635e-05, "loss": 0.6231, "mean_token_accuracy": 0.8336930453777314, "num_tokens": 602122414.0, "step": 58050 }, { "entropy": 0.6681886553764343, "epoch": 0.46448, "grad_norm": 2.5237386226654053, "learning_rate": 2.6787114845938375e-05, "loss": 0.663, "mean_token_accuracy": 0.7895166099071502, "num_tokens": 602286054.0, "step": 58060 }, { "entropy": 0.7742725431919097, "epoch": 0.46456, "grad_norm": 3.7766928672790527, "learning_rate": 2.678311324529812e-05, "loss": 0.7672, "mean_token_accuracy": 0.7872662305831909, "num_tokens": 602366859.0, "step": 58070 }, { "entropy": 0.7195900082588196, "epoch": 0.46464, "grad_norm": 2.3388960361480713, "learning_rate": 2.6779111644657866e-05, "loss": 0.7175, "mean_token_accuracy": 0.7974194884300232, "num_tokens": 602460804.0, "step": 58080 }, { "entropy": 0.7109992265701294, "epoch": 0.46472, "grad_norm": 4.066165447235107, "learning_rate": 2.677511004401761e-05, "loss": 0.7074, "mean_token_accuracy": 0.7863510608673095, "num_tokens": 602603656.0, "step": 58090 }, { "entropy": 0.6909426122903823, "epoch": 0.4648, "grad_norm": 3.856100559234619, "learning_rate": 2.677110844337735e-05, "loss": 0.6839, "mean_token_accuracy": 0.8140327215194703, "num_tokens": 602644082.0, "step": 58100 }, { "entropy": 0.6508643090724945, "epoch": 0.46488, "grad_norm": 1.8409966230392456, "learning_rate": 2.6767106842737094e-05, "loss": 0.6523, "mean_token_accuracy": 0.7955428063869476, "num_tokens": 602807530.0, "step": 58110 }, { "entropy": 0.6713820040225983, "epoch": 0.46496, "grad_norm": 3.08182692527771, "learning_rate": 2.676310524209684e-05, "loss": 0.6666, "mean_token_accuracy": 0.8106829166412354, "num_tokens": 602886354.0, "step": 58120 }, { "entropy": 0.7409083485603333, "epoch": 0.46504, "grad_norm": 1.5445364713668823, "learning_rate": 2.6759103641456585e-05, "loss": 0.726, "mean_token_accuracy": 0.7933618724346161, "num_tokens": 602981127.0, "step": 58130 }, { "entropy": 0.6586531460285187, "epoch": 0.46512, "grad_norm": 2.722874402999878, "learning_rate": 2.6755102040816325e-05, "loss": 0.6705, "mean_token_accuracy": 0.793971163034439, "num_tokens": 603117161.0, "step": 58140 }, { "entropy": 0.7092076539993286, "epoch": 0.4652, "grad_norm": 4.8424153327941895, "learning_rate": 2.675110044017607e-05, "loss": 0.717, "mean_token_accuracy": 0.814543491601944, "num_tokens": 603152405.0, "step": 58150 }, { "entropy": 0.6683294534683227, "epoch": 0.46528, "grad_norm": 1.7436094284057617, "learning_rate": 2.6747098839535816e-05, "loss": 0.6627, "mean_token_accuracy": 0.7940034210681916, "num_tokens": 603316245.0, "step": 58160 }, { "entropy": 0.6321838170289993, "epoch": 0.46536, "grad_norm": 2.6592814922332764, "learning_rate": 2.674309723889556e-05, "loss": 0.6286, "mean_token_accuracy": 0.8107478022575378, "num_tokens": 603418786.0, "step": 58170 }, { "entropy": 0.6775927901268005, "epoch": 0.46544, "grad_norm": 1.9034391641616821, "learning_rate": 2.67390956382553e-05, "loss": 0.6753, "mean_token_accuracy": 0.8072957754135132, "num_tokens": 603513315.0, "step": 58180 }, { "entropy": 0.7470995724201203, "epoch": 0.46552, "grad_norm": 2.39764666557312, "learning_rate": 2.673509403761505e-05, "loss": 0.7369, "mean_token_accuracy": 0.7839819252490997, "num_tokens": 603645363.0, "step": 58190 }, { "entropy": 0.6615418046712875, "epoch": 0.4656, "grad_norm": 5.528515815734863, "learning_rate": 2.673109243697479e-05, "loss": 0.6528, "mean_token_accuracy": 0.8263258397579193, "num_tokens": 603681014.0, "step": 58200 }, { "entropy": 0.625086885690689, "epoch": 0.46568, "grad_norm": 2.079758882522583, "learning_rate": 2.6727090836334534e-05, "loss": 0.6221, "mean_token_accuracy": 0.799975574016571, "num_tokens": 603844854.0, "step": 58210 }, { "entropy": 0.6471798986196518, "epoch": 0.46576, "grad_norm": 2.6322011947631836, "learning_rate": 2.6723089235694275e-05, "loss": 0.6443, "mean_token_accuracy": 0.8155276119709015, "num_tokens": 603933510.0, "step": 58220 }, { "entropy": 0.6406762897968292, "epoch": 0.46584, "grad_norm": 1.3415436744689941, "learning_rate": 2.6719087635054025e-05, "loss": 0.6228, "mean_token_accuracy": 0.8175098598003387, "num_tokens": 604028662.0, "step": 58230 }, { "entropy": 0.6330578029155731, "epoch": 0.46592, "grad_norm": 2.599597215652466, "learning_rate": 2.6715086034413766e-05, "loss": 0.6367, "mean_token_accuracy": 0.8017204225063324, "num_tokens": 604171720.0, "step": 58240 }, { "entropy": 0.571043175458908, "epoch": 0.466, "grad_norm": 4.981161117553711, "learning_rate": 2.671108443377351e-05, "loss": 0.5578, "mean_token_accuracy": 0.8407777428627015, "num_tokens": 604211436.0, "step": 58250 }, { "entropy": 0.7032157838344574, "epoch": 0.46608, "grad_norm": 2.5748515129089355, "learning_rate": 2.6707082833133256e-05, "loss": 0.7048, "mean_token_accuracy": 0.7848576128482818, "num_tokens": 604373204.0, "step": 58260 }, { "entropy": 0.5948619037866593, "epoch": 0.46616, "grad_norm": 3.213306427001953, "learning_rate": 2.6703081232493e-05, "loss": 0.5855, "mean_token_accuracy": 0.8308329880237579, "num_tokens": 604444062.0, "step": 58270 }, { "entropy": 0.7599340379238129, "epoch": 0.46624, "grad_norm": 1.3264118432998657, "learning_rate": 2.669907963185274e-05, "loss": 0.7562, "mean_token_accuracy": 0.7918716609477997, "num_tokens": 604537390.0, "step": 58280 }, { "entropy": 0.7008382141590118, "epoch": 0.46632, "grad_norm": 2.6652212142944336, "learning_rate": 2.6695078031212484e-05, "loss": 0.6894, "mean_token_accuracy": 0.789758849143982, "num_tokens": 604674737.0, "step": 58290 }, { "entropy": 0.6930516391992569, "epoch": 0.4664, "grad_norm": 5.516651153564453, "learning_rate": 2.669107643057223e-05, "loss": 0.6946, "mean_token_accuracy": 0.8188594043254852, "num_tokens": 604712133.0, "step": 58300 }, { "entropy": 0.6325101405382156, "epoch": 0.46648, "grad_norm": 2.133617877960205, "learning_rate": 2.6687074829931975e-05, "loss": 0.6308, "mean_token_accuracy": 0.79944828748703, "num_tokens": 604875750.0, "step": 58310 }, { "entropy": 0.6256420165300369, "epoch": 0.46656, "grad_norm": 2.276965618133545, "learning_rate": 2.6683073229291715e-05, "loss": 0.6185, "mean_token_accuracy": 0.8196208059787751, "num_tokens": 604961639.0, "step": 58320 }, { "entropy": 0.6668230056762695, "epoch": 0.46664, "grad_norm": 1.704695701599121, "learning_rate": 2.6679071628651466e-05, "loss": 0.6604, "mean_token_accuracy": 0.806753295660019, "num_tokens": 605055321.0, "step": 58330 }, { "entropy": 0.6431080996990204, "epoch": 0.46672, "grad_norm": 3.1810271739959717, "learning_rate": 2.6675070028011206e-05, "loss": 0.6433, "mean_token_accuracy": 0.8025886416435242, "num_tokens": 605181713.0, "step": 58340 }, { "entropy": 0.6412559002637863, "epoch": 0.4668, "grad_norm": 5.423111915588379, "learning_rate": 2.667106842737095e-05, "loss": 0.6333, "mean_token_accuracy": 0.8298996984958649, "num_tokens": 605216035.0, "step": 58350 }, { "entropy": 0.6591946363449097, "epoch": 0.46688, "grad_norm": 1.7356979846954346, "learning_rate": 2.666706682673069e-05, "loss": 0.6664, "mean_token_accuracy": 0.788605272769928, "num_tokens": 605379875.0, "step": 58360 }, { "entropy": 0.7571341246366501, "epoch": 0.46696, "grad_norm": 3.1440677642822266, "learning_rate": 2.666306522609044e-05, "loss": 0.7454, "mean_token_accuracy": 0.7904739439487457, "num_tokens": 605477284.0, "step": 58370 }, { "entropy": 0.685069614648819, "epoch": 0.46704, "grad_norm": 1.3982363939285278, "learning_rate": 2.665906362545018e-05, "loss": 0.6725, "mean_token_accuracy": 0.8034827470779419, "num_tokens": 605571627.0, "step": 58380 }, { "entropy": 0.5992043673992157, "epoch": 0.46712, "grad_norm": 1.925756812095642, "learning_rate": 2.6655062024809925e-05, "loss": 0.5951, "mean_token_accuracy": 0.8141358315944671, "num_tokens": 605703517.0, "step": 58390 }, { "entropy": 0.6895612299442291, "epoch": 0.4672, "grad_norm": 4.667175769805908, "learning_rate": 2.6651060424169672e-05, "loss": 0.686, "mean_token_accuracy": 0.8167984068393708, "num_tokens": 605737547.0, "step": 58400 }, { "entropy": 0.6165287673473359, "epoch": 0.46728, "grad_norm": 1.5731886625289917, "learning_rate": 2.6647058823529416e-05, "loss": 0.6097, "mean_token_accuracy": 0.8073237121105195, "num_tokens": 605900858.0, "step": 58410 }, { "entropy": 0.768883329629898, "epoch": 0.46736, "grad_norm": 4.161581516265869, "learning_rate": 2.6643057222889156e-05, "loss": 0.7629, "mean_token_accuracy": 0.7906923651695251, "num_tokens": 605971558.0, "step": 58420 }, { "entropy": 0.6982562988996506, "epoch": 0.46744, "grad_norm": 1.6829208135604858, "learning_rate": 2.66390556222489e-05, "loss": 0.709, "mean_token_accuracy": 0.8038861334323884, "num_tokens": 606062707.0, "step": 58430 }, { "entropy": 0.6967018574476243, "epoch": 0.46752, "grad_norm": 2.0893521308898926, "learning_rate": 2.6635054021608647e-05, "loss": 0.6978, "mean_token_accuracy": 0.7869339466094971, "num_tokens": 606204857.0, "step": 58440 }, { "entropy": 0.6173605859279633, "epoch": 0.4676, "grad_norm": 4.09409236907959, "learning_rate": 2.663105242096839e-05, "loss": 0.6016, "mean_token_accuracy": 0.8327824711799622, "num_tokens": 606245518.0, "step": 58450 }, { "entropy": 0.6662492692470551, "epoch": 0.46768, "grad_norm": 1.951894760131836, "learning_rate": 2.662705082032813e-05, "loss": 0.6619, "mean_token_accuracy": 0.7925516664981842, "num_tokens": 606408085.0, "step": 58460 }, { "entropy": 0.6183348774909974, "epoch": 0.46776, "grad_norm": 3.2067229747772217, "learning_rate": 2.6623049219687878e-05, "loss": 0.6127, "mean_token_accuracy": 0.8244781255722046, "num_tokens": 606479736.0, "step": 58470 }, { "entropy": 0.6924177646636963, "epoch": 0.46784, "grad_norm": 1.843826174736023, "learning_rate": 2.661904761904762e-05, "loss": 0.6887, "mean_token_accuracy": 0.8003609001636505, "num_tokens": 606573118.0, "step": 58480 }, { "entropy": 0.6963006675243377, "epoch": 0.46792, "grad_norm": 2.519075870513916, "learning_rate": 2.6615046018407365e-05, "loss": 0.6915, "mean_token_accuracy": 0.7910870373249054, "num_tokens": 606704861.0, "step": 58490 }, { "entropy": 0.7578354895114898, "epoch": 0.468, "grad_norm": 5.213985919952393, "learning_rate": 2.6611044417767106e-05, "loss": 0.7581, "mean_token_accuracy": 0.8021922945976258, "num_tokens": 606741584.0, "step": 58500 }, { "entropy": 0.6764892041683197, "epoch": 0.46808, "grad_norm": 2.598275661468506, "learning_rate": 2.6607042817126853e-05, "loss": 0.6724, "mean_token_accuracy": 0.792244827747345, "num_tokens": 606903825.0, "step": 58510 }, { "entropy": 0.6807495296001435, "epoch": 0.46816, "grad_norm": 4.386172294616699, "learning_rate": 2.6603041216486596e-05, "loss": 0.6774, "mean_token_accuracy": 0.8108675956726075, "num_tokens": 606975012.0, "step": 58520 }, { "entropy": 0.672638738155365, "epoch": 0.46824, "grad_norm": 1.5431652069091797, "learning_rate": 2.659903961584634e-05, "loss": 0.6657, "mean_token_accuracy": 0.8111242532730103, "num_tokens": 607066524.0, "step": 58530 }, { "entropy": 0.6942709743976593, "epoch": 0.46832, "grad_norm": 2.2806966304779053, "learning_rate": 2.6595038015206087e-05, "loss": 0.6881, "mean_token_accuracy": 0.7911390662193298, "num_tokens": 607204660.0, "step": 58540 }, { "entropy": 0.7142965376377106, "epoch": 0.4684, "grad_norm": 4.363842487335205, "learning_rate": 2.6591036414565828e-05, "loss": 0.6912, "mean_token_accuracy": 0.8159947633743286, "num_tokens": 607243558.0, "step": 58550 }, { "entropy": 0.6705278396606446, "epoch": 0.46848, "grad_norm": 1.7536265850067139, "learning_rate": 2.658703481392557e-05, "loss": 0.673, "mean_token_accuracy": 0.7871580362319947, "num_tokens": 607407398.0, "step": 58560 }, { "entropy": 0.6257370769977569, "epoch": 0.46856, "grad_norm": 3.236295223236084, "learning_rate": 2.6583033213285315e-05, "loss": 0.6104, "mean_token_accuracy": 0.8249714851379395, "num_tokens": 607493076.0, "step": 58570 }, { "entropy": 0.7000867247581481, "epoch": 0.46864, "grad_norm": 1.7255748510360718, "learning_rate": 2.6579031612645062e-05, "loss": 0.7141, "mean_token_accuracy": 0.7999714612960815, "num_tokens": 607586457.0, "step": 58580 }, { "entropy": 0.6800116837024689, "epoch": 0.46872, "grad_norm": 2.4123361110687256, "learning_rate": 2.6575030012004802e-05, "loss": 0.6705, "mean_token_accuracy": 0.7939952850341797, "num_tokens": 607725840.0, "step": 58590 }, { "entropy": 0.7737940192222595, "epoch": 0.4688, "grad_norm": 4.960919380187988, "learning_rate": 2.6571028411364546e-05, "loss": 0.7843, "mean_token_accuracy": 0.8011262714862823, "num_tokens": 607764310.0, "step": 58600 }, { "entropy": 0.6160244703292846, "epoch": 0.46888, "grad_norm": 1.8674352169036865, "learning_rate": 2.6567026810724293e-05, "loss": 0.6112, "mean_token_accuracy": 0.8028014481067658, "num_tokens": 607927542.0, "step": 58610 }, { "entropy": 0.6640739262104034, "epoch": 0.46896, "grad_norm": 3.7624073028564453, "learning_rate": 2.6563025210084037e-05, "loss": 0.6582, "mean_token_accuracy": 0.8095837533473969, "num_tokens": 608005311.0, "step": 58620 }, { "entropy": 0.698111379146576, "epoch": 0.46904, "grad_norm": 2.199321985244751, "learning_rate": 2.6559023609443777e-05, "loss": 0.7021, "mean_token_accuracy": 0.7970265746116638, "num_tokens": 608099312.0, "step": 58630 }, { "entropy": 0.6633195608854294, "epoch": 0.46912, "grad_norm": 2.211303234100342, "learning_rate": 2.655502200880352e-05, "loss": 0.6453, "mean_token_accuracy": 0.8023530125617981, "num_tokens": 608229127.0, "step": 58640 }, { "entropy": 0.7640010774135589, "epoch": 0.4692, "grad_norm": 4.232137203216553, "learning_rate": 2.6551020408163268e-05, "loss": 0.7593, "mean_token_accuracy": 0.8005302786827088, "num_tokens": 608264392.0, "step": 58650 }, { "entropy": 0.6114979952573776, "epoch": 0.46928, "grad_norm": 2.185542106628418, "learning_rate": 2.6547018807523012e-05, "loss": 0.6073, "mean_token_accuracy": 0.8033341526985168, "num_tokens": 608428232.0, "step": 58660 }, { "entropy": 0.6597109913825989, "epoch": 0.46936, "grad_norm": 4.587719440460205, "learning_rate": 2.6543017206882752e-05, "loss": 0.6617, "mean_token_accuracy": 0.8052360236644744, "num_tokens": 608531429.0, "step": 58670 }, { "entropy": 0.7360830962657928, "epoch": 0.46944, "grad_norm": 1.4604793787002563, "learning_rate": 2.6539015606242496e-05, "loss": 0.7316, "mean_token_accuracy": 0.7903098165988922, "num_tokens": 608625273.0, "step": 58680 }, { "entropy": 0.6458802342414856, "epoch": 0.46952, "grad_norm": 4.296722888946533, "learning_rate": 2.6535014005602243e-05, "loss": 0.6387, "mean_token_accuracy": 0.8038572490215301, "num_tokens": 608754328.0, "step": 58690 }, { "entropy": 0.6888731628656387, "epoch": 0.4696, "grad_norm": 5.134754657745361, "learning_rate": 2.6531012404961987e-05, "loss": 0.6912, "mean_token_accuracy": 0.8179869413375854, "num_tokens": 608789638.0, "step": 58700 }, { "entropy": 0.6058568835258484, "epoch": 0.46968, "grad_norm": 1.686743974685669, "learning_rate": 2.6527010804321727e-05, "loss": 0.6048, "mean_token_accuracy": 0.8055251717567444, "num_tokens": 608953022.0, "step": 58710 }, { "entropy": 0.6612878382205963, "epoch": 0.46976, "grad_norm": 3.660738468170166, "learning_rate": 2.6523009203681477e-05, "loss": 0.6484, "mean_token_accuracy": 0.8097345530986786, "num_tokens": 609036400.0, "step": 58720 }, { "entropy": 0.721395081281662, "epoch": 0.46984, "grad_norm": 2.7010624408721924, "learning_rate": 2.6519007603041218e-05, "loss": 0.7294, "mean_token_accuracy": 0.78981454372406, "num_tokens": 609130657.0, "step": 58730 }, { "entropy": 0.660064697265625, "epoch": 0.46992, "grad_norm": 2.420569658279419, "learning_rate": 2.651500600240096e-05, "loss": 0.6557, "mean_token_accuracy": 0.7933099269866943, "num_tokens": 609276896.0, "step": 58740 }, { "entropy": 0.6509192734956741, "epoch": 0.47, "grad_norm": 4.7230682373046875, "learning_rate": 2.6511004401760702e-05, "loss": 0.6491, "mean_token_accuracy": 0.8259395718574524, "num_tokens": 609316163.0, "step": 58750 }, { "entropy": 0.6483204603195191, "epoch": 0.47008, "grad_norm": 2.7113454341888428, "learning_rate": 2.6507002801120452e-05, "loss": 0.6442, "mean_token_accuracy": 0.7984978079795837, "num_tokens": 609480003.0, "step": 58760 }, { "entropy": 0.655782225728035, "epoch": 0.47016, "grad_norm": 3.1447041034698486, "learning_rate": 2.6503001200480193e-05, "loss": 0.6445, "mean_token_accuracy": 0.809010487794876, "num_tokens": 609585714.0, "step": 58770 }, { "entropy": 0.7059383749961853, "epoch": 0.47024, "grad_norm": 3.183222770690918, "learning_rate": 2.6498999599839936e-05, "loss": 0.6914, "mean_token_accuracy": 0.80052649974823, "num_tokens": 609679748.0, "step": 58780 }, { "entropy": 0.6780659556388855, "epoch": 0.47032, "grad_norm": 2.328457832336426, "learning_rate": 2.6494997999199683e-05, "loss": 0.6664, "mean_token_accuracy": 0.7938514888286591, "num_tokens": 609827102.0, "step": 58790 }, { "entropy": 0.565862563252449, "epoch": 0.4704, "grad_norm": 4.712466239929199, "learning_rate": 2.6490996398559427e-05, "loss": 0.5541, "mean_token_accuracy": 0.8428116381168366, "num_tokens": 609873051.0, "step": 58800 }, { "entropy": 0.6664503276348114, "epoch": 0.47048, "grad_norm": 1.8519498109817505, "learning_rate": 2.6486994797919167e-05, "loss": 0.6678, "mean_token_accuracy": 0.7940226197242737, "num_tokens": 610035842.0, "step": 58810 }, { "entropy": 0.5913424491882324, "epoch": 0.47056, "grad_norm": 3.3234939575195312, "learning_rate": 2.648299319727891e-05, "loss": 0.5835, "mean_token_accuracy": 0.8303544819355011, "num_tokens": 610108436.0, "step": 58820 }, { "entropy": 0.673985606431961, "epoch": 0.47064, "grad_norm": 1.7258265018463135, "learning_rate": 2.6478991596638658e-05, "loss": 0.6866, "mean_token_accuracy": 0.8073321342468261, "num_tokens": 610199674.0, "step": 58830 }, { "entropy": 0.6802574038505554, "epoch": 0.47072, "grad_norm": 2.119713544845581, "learning_rate": 2.6474989995998402e-05, "loss": 0.6661, "mean_token_accuracy": 0.7974168360233307, "num_tokens": 610334296.0, "step": 58840 }, { "entropy": 0.6855286121368408, "epoch": 0.4708, "grad_norm": 6.846786022186279, "learning_rate": 2.6470988395358142e-05, "loss": 0.6749, "mean_token_accuracy": 0.8198169648647309, "num_tokens": 610371425.0, "step": 58850 }, { "entropy": 0.629852569103241, "epoch": 0.47088, "grad_norm": 2.69291090965271, "learning_rate": 2.646698679471789e-05, "loss": 0.6301, "mean_token_accuracy": 0.8001180708408355, "num_tokens": 610534908.0, "step": 58860 }, { "entropy": 0.645661062002182, "epoch": 0.47096, "grad_norm": 4.041208744049072, "learning_rate": 2.6462985194077633e-05, "loss": 0.63, "mean_token_accuracy": 0.8164476931095124, "num_tokens": 610615753.0, "step": 58870 }, { "entropy": 0.6733000159263611, "epoch": 0.47104, "grad_norm": 2.1828010082244873, "learning_rate": 2.6458983593437377e-05, "loss": 0.6699, "mean_token_accuracy": 0.8065796077251435, "num_tokens": 610710083.0, "step": 58880 }, { "entropy": 0.7090656757354736, "epoch": 0.47112, "grad_norm": 2.2653558254241943, "learning_rate": 2.6454981992797117e-05, "loss": 0.7057, "mean_token_accuracy": 0.7828175306320191, "num_tokens": 610855125.0, "step": 58890 }, { "entropy": 0.7134607076644898, "epoch": 0.4712, "grad_norm": 4.992192268371582, "learning_rate": 2.6450980392156864e-05, "loss": 0.7057, "mean_token_accuracy": 0.8135316967964172, "num_tokens": 610890583.0, "step": 58900 }, { "entropy": 0.659372353553772, "epoch": 0.47128, "grad_norm": 1.8923896551132202, "learning_rate": 2.6446978791516608e-05, "loss": 0.6548, "mean_token_accuracy": 0.7931312799453736, "num_tokens": 611054069.0, "step": 58910 }, { "entropy": 0.6968962520360946, "epoch": 0.47136, "grad_norm": 3.0940592288970947, "learning_rate": 2.6442977190876352e-05, "loss": 0.6931, "mean_token_accuracy": 0.8027955174446106, "num_tokens": 611132759.0, "step": 58920 }, { "entropy": 0.6827654570341111, "epoch": 0.47144, "grad_norm": 2.141324520111084, "learning_rate": 2.64389755902361e-05, "loss": 0.6864, "mean_token_accuracy": 0.8029484033584595, "num_tokens": 611227368.0, "step": 58930 }, { "entropy": 0.6595832049846649, "epoch": 0.47152, "grad_norm": 3.2961254119873047, "learning_rate": 2.643497398959584e-05, "loss": 0.6474, "mean_token_accuracy": 0.7983677566051484, "num_tokens": 611366431.0, "step": 58940 }, { "entropy": 0.6311177253723145, "epoch": 0.4716, "grad_norm": 5.321827411651611, "learning_rate": 2.6430972388955583e-05, "loss": 0.6282, "mean_token_accuracy": 0.8304627776145935, "num_tokens": 611405251.0, "step": 58950 }, { "entropy": 0.6341064512729645, "epoch": 0.47168, "grad_norm": 2.2606217861175537, "learning_rate": 2.6426970788315327e-05, "loss": 0.6361, "mean_token_accuracy": 0.7981993377208709, "num_tokens": 611568085.0, "step": 58960 }, { "entropy": 0.7001508861780167, "epoch": 0.47176, "grad_norm": 3.4423952102661133, "learning_rate": 2.6422969187675074e-05, "loss": 0.678, "mean_token_accuracy": 0.8091945230960846, "num_tokens": 611639991.0, "step": 58970 }, { "entropy": 0.7334588944911957, "epoch": 0.47184, "grad_norm": 1.5234061479568481, "learning_rate": 2.6418967587034814e-05, "loss": 0.745, "mean_token_accuracy": 0.7917143166065216, "num_tokens": 611732719.0, "step": 58980 }, { "entropy": 0.6823674321174622, "epoch": 0.47192, "grad_norm": 2.12276554107666, "learning_rate": 2.6414965986394558e-05, "loss": 0.6746, "mean_token_accuracy": 0.7918991088867188, "num_tokens": 611864132.0, "step": 58990 }, { "entropy": 0.6694253355264663, "epoch": 0.472, "grad_norm": 4.180976390838623, "learning_rate": 2.6410964385754305e-05, "loss": 0.6606, "mean_token_accuracy": 0.8223714709281922, "num_tokens": 611902410.0, "step": 59000 }, { "entropy": 0.6404807329177856, "epoch": 0.47208, "grad_norm": 1.709559440612793, "learning_rate": 2.640696278511405e-05, "loss": 0.6414, "mean_token_accuracy": 0.7940766990184784, "num_tokens": 612066250.0, "step": 59010 }, { "entropy": 0.6963200002908707, "epoch": 0.47216, "grad_norm": 3.46822190284729, "learning_rate": 2.640296118447379e-05, "loss": 0.6926, "mean_token_accuracy": 0.8015030562877655, "num_tokens": 612160382.0, "step": 59020 }, { "entropy": 0.5926203787326813, "epoch": 0.47224, "grad_norm": 1.4252285957336426, "learning_rate": 2.6398959583833533e-05, "loss": 0.5916, "mean_token_accuracy": 0.8279013395309448, "num_tokens": 612254570.0, "step": 59030 }, { "entropy": 0.7020713686943054, "epoch": 0.47232, "grad_norm": 3.024761915206909, "learning_rate": 2.639495798319328e-05, "loss": 0.6972, "mean_token_accuracy": 0.7869162082672119, "num_tokens": 612391830.0, "step": 59040 }, { "entropy": 0.6978690683841705, "epoch": 0.4724, "grad_norm": 4.570407390594482, "learning_rate": 2.6390956382553023e-05, "loss": 0.6846, "mean_token_accuracy": 0.8134807229042054, "num_tokens": 612428238.0, "step": 59050 }, { "entropy": 0.6975734531879425, "epoch": 0.47248, "grad_norm": 1.9004255533218384, "learning_rate": 2.6386954781912764e-05, "loss": 0.6967, "mean_token_accuracy": 0.7868716001510621, "num_tokens": 612591505.0, "step": 59060 }, { "entropy": 0.6124740719795227, "epoch": 0.47256, "grad_norm": 3.789551258087158, "learning_rate": 2.6382953181272514e-05, "loss": 0.6006, "mean_token_accuracy": 0.8203661501407623, "num_tokens": 612673784.0, "step": 59070 }, { "entropy": 0.748392516374588, "epoch": 0.47264, "grad_norm": 1.5509388446807861, "learning_rate": 2.6378951580632254e-05, "loss": 0.7359, "mean_token_accuracy": 0.7864316761493683, "num_tokens": 612768114.0, "step": 59080 }, { "entropy": 0.6909709572792053, "epoch": 0.47272, "grad_norm": 2.532416582107544, "learning_rate": 2.6374949979991998e-05, "loss": 0.6983, "mean_token_accuracy": 0.7898825109004974, "num_tokens": 612916142.0, "step": 59090 }, { "entropy": 0.6771565645933151, "epoch": 0.4728, "grad_norm": 4.638844966888428, "learning_rate": 2.637094837935174e-05, "loss": 0.6682, "mean_token_accuracy": 0.8141746103763581, "num_tokens": 612961799.0, "step": 59100 }, { "entropy": 0.6689971327781677, "epoch": 0.47288, "grad_norm": 1.8885711431503296, "learning_rate": 2.636694677871149e-05, "loss": 0.6628, "mean_token_accuracy": 0.7887043654918671, "num_tokens": 613124848.0, "step": 59110 }, { "entropy": 0.6730976194143296, "epoch": 0.47296, "grad_norm": 3.0787720680236816, "learning_rate": 2.636294517807123e-05, "loss": 0.6707, "mean_token_accuracy": 0.8129806280136108, "num_tokens": 613200770.0, "step": 59120 }, { "entropy": 0.6645958542823791, "epoch": 0.47304, "grad_norm": 1.4168037176132202, "learning_rate": 2.6358943577430973e-05, "loss": 0.6702, "mean_token_accuracy": 0.8059587895870208, "num_tokens": 613295532.0, "step": 59130 }, { "entropy": 0.6805965900421143, "epoch": 0.47312, "grad_norm": 3.6009368896484375, "learning_rate": 2.635494197679072e-05, "loss": 0.6756, "mean_token_accuracy": 0.795225465297699, "num_tokens": 613431187.0, "step": 59140 }, { "entropy": 0.7030490159988403, "epoch": 0.4732, "grad_norm": 4.419373989105225, "learning_rate": 2.6350940376150464e-05, "loss": 0.6993, "mean_token_accuracy": 0.8123882293701172, "num_tokens": 613466530.0, "step": 59150 }, { "entropy": 0.6382363021373749, "epoch": 0.47328, "grad_norm": 2.361516237258911, "learning_rate": 2.6346938775510204e-05, "loss": 0.6366, "mean_token_accuracy": 0.7986565709114075, "num_tokens": 613630370.0, "step": 59160 }, { "entropy": 0.6249480694532394, "epoch": 0.47336, "grad_norm": 2.975512981414795, "learning_rate": 2.6342937174869948e-05, "loss": 0.613, "mean_token_accuracy": 0.8230108976364136, "num_tokens": 613707889.0, "step": 59170 }, { "entropy": 0.6673639118671417, "epoch": 0.47344, "grad_norm": 2.425028085708618, "learning_rate": 2.6338935574229695e-05, "loss": 0.6734, "mean_token_accuracy": 0.8078582882881165, "num_tokens": 613800377.0, "step": 59180 }, { "entropy": 0.6554935157299042, "epoch": 0.47352, "grad_norm": 2.1267197132110596, "learning_rate": 2.633493397358944e-05, "loss": 0.6491, "mean_token_accuracy": 0.799557900428772, "num_tokens": 613931602.0, "step": 59190 }, { "entropy": 0.6948117792606354, "epoch": 0.4736, "grad_norm": 4.158293724060059, "learning_rate": 2.633093237294918e-05, "loss": 0.6931, "mean_token_accuracy": 0.813271576166153, "num_tokens": 613970627.0, "step": 59200 }, { "entropy": 0.6788432419300079, "epoch": 0.47368, "grad_norm": 1.501640796661377, "learning_rate": 2.6326930772308926e-05, "loss": 0.6763, "mean_token_accuracy": 0.7894418716430665, "num_tokens": 614134467.0, "step": 59210 }, { "entropy": 0.6428117334842682, "epoch": 0.47376, "grad_norm": 4.5418500900268555, "learning_rate": 2.632292917166867e-05, "loss": 0.6348, "mean_token_accuracy": 0.8144511580467224, "num_tokens": 614220491.0, "step": 59220 }, { "entropy": 0.6969691932201385, "epoch": 0.47384, "grad_norm": 1.7735778093338013, "learning_rate": 2.6318927571028414e-05, "loss": 0.7076, "mean_token_accuracy": 0.7975171148777008, "num_tokens": 614314839.0, "step": 59230 }, { "entropy": 0.6475715041160583, "epoch": 0.47392, "grad_norm": 1.8926126956939697, "learning_rate": 2.6314925970388154e-05, "loss": 0.6382, "mean_token_accuracy": 0.8028057873249054, "num_tokens": 614458862.0, "step": 59240 }, { "entropy": 0.735200035572052, "epoch": 0.474, "grad_norm": 4.811481952667236, "learning_rate": 2.63109243697479e-05, "loss": 0.7288, "mean_token_accuracy": 0.8110281229019165, "num_tokens": 614498360.0, "step": 59250 }, { "entropy": 0.652193260192871, "epoch": 0.47408, "grad_norm": 1.5364152193069458, "learning_rate": 2.6306922769107645e-05, "loss": 0.6534, "mean_token_accuracy": 0.7948522210121155, "num_tokens": 614662200.0, "step": 59260 }, { "entropy": 0.6108351469039917, "epoch": 0.47416, "grad_norm": 2.7661783695220947, "learning_rate": 2.630292116846739e-05, "loss": 0.6027, "mean_token_accuracy": 0.8211930453777313, "num_tokens": 614756379.0, "step": 59270 }, { "entropy": 0.6752935588359833, "epoch": 0.47424, "grad_norm": 1.4507319927215576, "learning_rate": 2.629891956782713e-05, "loss": 0.6779, "mean_token_accuracy": 0.7982758224010468, "num_tokens": 614851140.0, "step": 59280 }, { "entropy": 0.6511184692382812, "epoch": 0.47432, "grad_norm": 2.0901079177856445, "learning_rate": 2.6294917967186876e-05, "loss": 0.6455, "mean_token_accuracy": 0.802970951795578, "num_tokens": 614981326.0, "step": 59290 }, { "entropy": 0.7175331592559815, "epoch": 0.4744, "grad_norm": 3.9946694374084473, "learning_rate": 2.629091636654662e-05, "loss": 0.7012, "mean_token_accuracy": 0.8147868394851685, "num_tokens": 615017366.0, "step": 59300 }, { "entropy": 0.6144055843353271, "epoch": 0.47448, "grad_norm": 1.610459566116333, "learning_rate": 2.6286914765906363e-05, "loss": 0.6115, "mean_token_accuracy": 0.8026476204395294, "num_tokens": 615181181.0, "step": 59310 }, { "entropy": 0.6166752755641938, "epoch": 0.47456, "grad_norm": 3.0555777549743652, "learning_rate": 2.628291316526611e-05, "loss": 0.617, "mean_token_accuracy": 0.8204599261283875, "num_tokens": 615256561.0, "step": 59320 }, { "entropy": 0.7242561757564545, "epoch": 0.47464, "grad_norm": 1.6427215337753296, "learning_rate": 2.627891156462585e-05, "loss": 0.7159, "mean_token_accuracy": 0.800860995054245, "num_tokens": 615349326.0, "step": 59330 }, { "entropy": 0.6742319464683533, "epoch": 0.47472, "grad_norm": 3.483393430709839, "learning_rate": 2.6274909963985594e-05, "loss": 0.6706, "mean_token_accuracy": 0.7943444609642029, "num_tokens": 615472026.0, "step": 59340 }, { "entropy": 0.6850774854421615, "epoch": 0.4748, "grad_norm": 5.411218643188477, "learning_rate": 2.6270908363345338e-05, "loss": 0.6744, "mean_token_accuracy": 0.820462304353714, "num_tokens": 615508073.0, "step": 59350 }, { "entropy": 0.6527118802070617, "epoch": 0.47488, "grad_norm": 1.3841190338134766, "learning_rate": 2.6266906762705085e-05, "loss": 0.6485, "mean_token_accuracy": 0.7972520172595978, "num_tokens": 615671646.0, "step": 59360 }, { "entropy": 0.7017013311386109, "epoch": 0.47496, "grad_norm": 3.9477202892303467, "learning_rate": 2.6262905162064826e-05, "loss": 0.6974, "mean_token_accuracy": 0.8003413498401641, "num_tokens": 615752145.0, "step": 59370 }, { "entropy": 0.6939324259757995, "epoch": 0.47504, "grad_norm": 2.7248480319976807, "learning_rate": 2.625890356142457e-05, "loss": 0.6933, "mean_token_accuracy": 0.7999239027500152, "num_tokens": 615845260.0, "step": 59380 }, { "entropy": 0.6570449411869049, "epoch": 0.47512, "grad_norm": 1.7086048126220703, "learning_rate": 2.6254901960784316e-05, "loss": 0.6566, "mean_token_accuracy": 0.7974588930606842, "num_tokens": 615995184.0, "step": 59390 }, { "entropy": 0.6682213723659516, "epoch": 0.4752, "grad_norm": 5.30317497253418, "learning_rate": 2.625090036014406e-05, "loss": 0.6779, "mean_token_accuracy": 0.8126138031482697, "num_tokens": 616045447.0, "step": 59400 }, { "entropy": 0.6343091040849685, "epoch": 0.47528, "grad_norm": 1.931877851486206, "learning_rate": 2.62468987595038e-05, "loss": 0.6238, "mean_token_accuracy": 0.8016444027423859, "num_tokens": 616208665.0, "step": 59410 }, { "entropy": 0.6108599781990052, "epoch": 0.47536, "grad_norm": 3.21751070022583, "learning_rate": 2.6242897158863544e-05, "loss": 0.6009, "mean_token_accuracy": 0.8284789800643921, "num_tokens": 616287649.0, "step": 59420 }, { "entropy": 0.6641929417848587, "epoch": 0.47544, "grad_norm": 1.7396717071533203, "learning_rate": 2.623889555822329e-05, "loss": 0.6701, "mean_token_accuracy": 0.8055189669132232, "num_tokens": 616381379.0, "step": 59430 }, { "entropy": 0.6839167475700378, "epoch": 0.47552, "grad_norm": 2.5144290924072266, "learning_rate": 2.6234893957583035e-05, "loss": 0.6805, "mean_token_accuracy": 0.7914279818534851, "num_tokens": 616531913.0, "step": 59440 }, { "entropy": 0.7101698279380798, "epoch": 0.4756, "grad_norm": 4.245513439178467, "learning_rate": 2.6230892356942775e-05, "loss": 0.6947, "mean_token_accuracy": 0.8100892543792725, "num_tokens": 616577697.0, "step": 59450 }, { "entropy": 0.6715237736701966, "epoch": 0.47568, "grad_norm": 2.6837446689605713, "learning_rate": 2.6226890756302526e-05, "loss": 0.6653, "mean_token_accuracy": 0.7912580966949463, "num_tokens": 616741285.0, "step": 59460 }, { "entropy": 0.6906873047351837, "epoch": 0.47576, "grad_norm": 3.196409225463867, "learning_rate": 2.6222889155662266e-05, "loss": 0.681, "mean_token_accuracy": 0.8057127714157104, "num_tokens": 616818216.0, "step": 59470 }, { "entropy": 0.7225751042366028, "epoch": 0.47584, "grad_norm": 2.8745999336242676, "learning_rate": 2.621888755502201e-05, "loss": 0.7161, "mean_token_accuracy": 0.7953916549682617, "num_tokens": 616912487.0, "step": 59480 }, { "entropy": 0.6394951224327088, "epoch": 0.47592, "grad_norm": 2.7661240100860596, "learning_rate": 2.621488595438175e-05, "loss": 0.6406, "mean_token_accuracy": 0.8047308683395386, "num_tokens": 617056572.0, "step": 59490 }, { "entropy": 0.6661390066146851, "epoch": 0.476, "grad_norm": 4.342526912689209, "learning_rate": 2.62108843537415e-05, "loss": 0.643, "mean_token_accuracy": 0.8222570180892944, "num_tokens": 617094880.0, "step": 59500 }, { "entropy": 0.6218186318874359, "epoch": 0.47608, "grad_norm": 1.306370496749878, "learning_rate": 2.620688275310124e-05, "loss": 0.6189, "mean_token_accuracy": 0.7999755799770355, "num_tokens": 617258720.0, "step": 59510 }, { "entropy": 0.6500730127096176, "epoch": 0.47616, "grad_norm": 3.832256555557251, "learning_rate": 2.6202881152460985e-05, "loss": 0.6516, "mean_token_accuracy": 0.8135707795619964, "num_tokens": 617337574.0, "step": 59520 }, { "entropy": 0.610250586271286, "epoch": 0.47624, "grad_norm": 1.5078730583190918, "learning_rate": 2.6198879551820732e-05, "loss": 0.6045, "mean_token_accuracy": 0.8222699403762818, "num_tokens": 617430581.0, "step": 59530 }, { "entropy": 0.6585777431726456, "epoch": 0.47632, "grad_norm": 3.017913341522217, "learning_rate": 2.6194877951180475e-05, "loss": 0.6516, "mean_token_accuracy": 0.8009363651275635, "num_tokens": 617565172.0, "step": 59540 }, { "entropy": 0.6947599500417709, "epoch": 0.4764, "grad_norm": 4.685021877288818, "learning_rate": 2.6190876350540216e-05, "loss": 0.6998, "mean_token_accuracy": 0.8144859850406647, "num_tokens": 617605826.0, "step": 59550 }, { "entropy": 0.6587155580520629, "epoch": 0.47648, "grad_norm": 1.4979299306869507, "learning_rate": 2.618687474989996e-05, "loss": 0.6571, "mean_token_accuracy": 0.7928492963314057, "num_tokens": 617769666.0, "step": 59560 }, { "entropy": 0.6792428404092788, "epoch": 0.47656, "grad_norm": 3.0987346172332764, "learning_rate": 2.6182873149259707e-05, "loss": 0.6765, "mean_token_accuracy": 0.8101895570755004, "num_tokens": 617851179.0, "step": 59570 }, { "entropy": 0.6280478119850159, "epoch": 0.47664, "grad_norm": 1.7824300527572632, "learning_rate": 2.617887154861945e-05, "loss": 0.6219, "mean_token_accuracy": 0.8182410955429077, "num_tokens": 617944895.0, "step": 59580 }, { "entropy": 0.6430603563785553, "epoch": 0.47672, "grad_norm": 3.0462021827697754, "learning_rate": 2.617486994797919e-05, "loss": 0.64, "mean_token_accuracy": 0.8015980958938599, "num_tokens": 618071228.0, "step": 59590 }, { "entropy": 0.7511819362640381, "epoch": 0.4768, "grad_norm": 4.467182159423828, "learning_rate": 2.6170868347338938e-05, "loss": 0.7456, "mean_token_accuracy": 0.8048641324043274, "num_tokens": 618107031.0, "step": 59600 }, { "entropy": 0.6417478531599045, "epoch": 0.47688, "grad_norm": 2.106289863586426, "learning_rate": 2.616686674669868e-05, "loss": 0.6361, "mean_token_accuracy": 0.7998839795589447, "num_tokens": 618270871.0, "step": 59610 }, { "entropy": 0.6314331114292144, "epoch": 0.47696, "grad_norm": 2.575040102005005, "learning_rate": 2.6162865146058425e-05, "loss": 0.6168, "mean_token_accuracy": 0.8185856640338898, "num_tokens": 618376118.0, "step": 59620 }, { "entropy": 0.6663169264793396, "epoch": 0.47704, "grad_norm": 2.034067392349243, "learning_rate": 2.6158863545418166e-05, "loss": 0.6744, "mean_token_accuracy": 0.805137550830841, "num_tokens": 618471022.0, "step": 59630 }, { "entropy": 0.7016594052314759, "epoch": 0.47712, "grad_norm": 3.1974716186523438, "learning_rate": 2.6154861944777913e-05, "loss": 0.6882, "mean_token_accuracy": 0.7897249460220337, "num_tokens": 618605649.0, "step": 59640 }, { "entropy": 0.6617982357740402, "epoch": 0.4772, "grad_norm": 5.236884117126465, "learning_rate": 2.6150860344137656e-05, "loss": 0.6717, "mean_token_accuracy": 0.8237511098384858, "num_tokens": 618641195.0, "step": 59650 }, { "entropy": 0.6597342193126678, "epoch": 0.47728, "grad_norm": 1.4903039932250977, "learning_rate": 2.61468587434974e-05, "loss": 0.6577, "mean_token_accuracy": 0.7928126513957977, "num_tokens": 618805035.0, "step": 59660 }, { "entropy": 0.6404118746519089, "epoch": 0.47736, "grad_norm": 2.651212453842163, "learning_rate": 2.6142857142857147e-05, "loss": 0.6368, "mean_token_accuracy": 0.8132069826126098, "num_tokens": 618901852.0, "step": 59670 }, { "entropy": 0.6828946173191071, "epoch": 0.47744, "grad_norm": 2.152391195297241, "learning_rate": 2.6138855542216887e-05, "loss": 0.6702, "mean_token_accuracy": 0.8002413451671601, "num_tokens": 618998393.0, "step": 59680 }, { "entropy": 0.6716528952121734, "epoch": 0.47752, "grad_norm": 3.6436386108398438, "learning_rate": 2.613485394157663e-05, "loss": 0.6758, "mean_token_accuracy": 0.7971311986446381, "num_tokens": 619132720.0, "step": 59690 }, { "entropy": 0.7447121441364288, "epoch": 0.4776, "grad_norm": 5.343768119812012, "learning_rate": 2.6130852340936375e-05, "loss": 0.7369, "mean_token_accuracy": 0.8056201994419098, "num_tokens": 619171902.0, "step": 59700 }, { "entropy": 0.6285779714584351, "epoch": 0.47768, "grad_norm": 2.1073508262634277, "learning_rate": 2.6126850740296122e-05, "loss": 0.6249, "mean_token_accuracy": 0.7986152470111847, "num_tokens": 619335706.0, "step": 59710 }, { "entropy": 0.6312522709369659, "epoch": 0.47776, "grad_norm": 3.3955373764038086, "learning_rate": 2.6122849139655862e-05, "loss": 0.6276, "mean_token_accuracy": 0.8217517673969269, "num_tokens": 619424978.0, "step": 59720 }, { "entropy": 0.6411442697048187, "epoch": 0.47784, "grad_norm": 1.3717421293258667, "learning_rate": 2.6118847539015606e-05, "loss": 0.6402, "mean_token_accuracy": 0.8096321582794189, "num_tokens": 619520160.0, "step": 59730 }, { "entropy": 0.6766992270946502, "epoch": 0.47792, "grad_norm": 2.545731782913208, "learning_rate": 2.6114845938375353e-05, "loss": 0.6779, "mean_token_accuracy": 0.7952846229076386, "num_tokens": 619649005.0, "step": 59740 }, { "entropy": 0.7258336663246154, "epoch": 0.478, "grad_norm": 4.0006585121154785, "learning_rate": 2.6110844337735097e-05, "loss": 0.7146, "mean_token_accuracy": 0.8113423943519592, "num_tokens": 619686118.0, "step": 59750 }, { "entropy": 0.7025550365447998, "epoch": 0.47808, "grad_norm": 1.722172498703003, "learning_rate": 2.6106842737094837e-05, "loss": 0.6944, "mean_token_accuracy": 0.7872679531574249, "num_tokens": 619849958.0, "step": 59760 }, { "entropy": 0.6426585018634796, "epoch": 0.47816, "grad_norm": 3.9219202995300293, "learning_rate": 2.610284113645458e-05, "loss": 0.6522, "mean_token_accuracy": 0.8098128497600555, "num_tokens": 619941036.0, "step": 59770 }, { "entropy": 0.6962129950523377, "epoch": 0.47824, "grad_norm": 2.03426456451416, "learning_rate": 2.6098839535814328e-05, "loss": 0.6942, "mean_token_accuracy": 0.8031960666179657, "num_tokens": 620033738.0, "step": 59780 }, { "entropy": 0.7416230380535126, "epoch": 0.47832, "grad_norm": 3.1574172973632812, "learning_rate": 2.6094837935174072e-05, "loss": 0.7381, "mean_token_accuracy": 0.7781583666801453, "num_tokens": 620177195.0, "step": 59790 }, { "entropy": 0.6729081690311431, "epoch": 0.4784, "grad_norm": 4.8924713134765625, "learning_rate": 2.6090836334533812e-05, "loss": 0.6698, "mean_token_accuracy": 0.8193308711051941, "num_tokens": 620218001.0, "step": 59800 }, { "entropy": 0.6504601299762726, "epoch": 0.47848, "grad_norm": 1.8805434703826904, "learning_rate": 2.6086834733893556e-05, "loss": 0.6481, "mean_token_accuracy": 0.7956155359745025, "num_tokens": 620381841.0, "step": 59810 }, { "entropy": 0.6739622831344605, "epoch": 0.47856, "grad_norm": 2.8966760635375977, "learning_rate": 2.6082833133253303e-05, "loss": 0.6608, "mean_token_accuracy": 0.8086280345916748, "num_tokens": 620461980.0, "step": 59820 }, { "entropy": 0.7173678040504455, "epoch": 0.47864, "grad_norm": 1.9220900535583496, "learning_rate": 2.6078831532613047e-05, "loss": 0.7193, "mean_token_accuracy": 0.7982868671417236, "num_tokens": 620555999.0, "step": 59830 }, { "entropy": 0.6614553213119507, "epoch": 0.47872, "grad_norm": 2.6307568550109863, "learning_rate": 2.6074829931972787e-05, "loss": 0.6658, "mean_token_accuracy": 0.79535773396492, "num_tokens": 620698655.0, "step": 59840 }, { "entropy": 0.6359871387481689, "epoch": 0.4788, "grad_norm": 4.404543876647949, "learning_rate": 2.6070828331332537e-05, "loss": 0.6462, "mean_token_accuracy": 0.8231445729732514, "num_tokens": 620736719.0, "step": 59850 }, { "entropy": 0.6872326910495759, "epoch": 0.47888, "grad_norm": 2.5821373462677, "learning_rate": 2.6066826730692278e-05, "loss": 0.6748, "mean_token_accuracy": 0.7927882313728333, "num_tokens": 620900559.0, "step": 59860 }, { "entropy": 0.7072861313819885, "epoch": 0.47896, "grad_norm": 3.0190656185150146, "learning_rate": 2.606282513005202e-05, "loss": 0.6873, "mean_token_accuracy": 0.8061353385448455, "num_tokens": 620982755.0, "step": 59870 }, { "entropy": 0.7168325334787369, "epoch": 0.47904, "grad_norm": 1.275860071182251, "learning_rate": 2.6058823529411762e-05, "loss": 0.7257, "mean_token_accuracy": 0.7953524172306061, "num_tokens": 621075968.0, "step": 59880 }, { "entropy": 0.6752283990383148, "epoch": 0.47912, "grad_norm": 2.3958120346069336, "learning_rate": 2.6054821928771512e-05, "loss": 0.6658, "mean_token_accuracy": 0.7984547436237335, "num_tokens": 621199416.0, "step": 59890 }, { "entropy": 0.7315541297197342, "epoch": 0.4792, "grad_norm": 6.453472137451172, "learning_rate": 2.6050820328131253e-05, "loss": 0.7411, "mean_token_accuracy": 0.8110078930854797, "num_tokens": 621235906.0, "step": 59900 }, { "entropy": 0.6497233867645263, "epoch": 0.47928, "grad_norm": 2.439659595489502, "learning_rate": 2.6046818727490996e-05, "loss": 0.6489, "mean_token_accuracy": 0.7930543601512909, "num_tokens": 621397914.0, "step": 59910 }, { "entropy": 0.6556126654148102, "epoch": 0.47936, "grad_norm": 3.074406147003174, "learning_rate": 2.6042817126850743e-05, "loss": 0.6433, "mean_token_accuracy": 0.8107444465160369, "num_tokens": 621466229.0, "step": 59920 }, { "entropy": 0.6555069565773011, "epoch": 0.47944, "grad_norm": 1.5385617017745972, "learning_rate": 2.6038815526210487e-05, "loss": 0.6437, "mean_token_accuracy": 0.8140006124973297, "num_tokens": 621557889.0, "step": 59930 }, { "entropy": 0.6607612133026123, "epoch": 0.47952, "grad_norm": 3.440002679824829, "learning_rate": 2.6034813925570227e-05, "loss": 0.6573, "mean_token_accuracy": 0.8032636761665344, "num_tokens": 621687753.0, "step": 59940 }, { "entropy": 0.6218266010284423, "epoch": 0.4796, "grad_norm": 5.145103454589844, "learning_rate": 2.603081232492997e-05, "loss": 0.6241, "mean_token_accuracy": 0.8254558622837067, "num_tokens": 621721553.0, "step": 59950 }, { "entropy": 0.6288398027420044, "epoch": 0.47968, "grad_norm": 1.6701970100402832, "learning_rate": 2.6026810724289718e-05, "loss": 0.6204, "mean_token_accuracy": 0.8000366389751434, "num_tokens": 621885393.0, "step": 59960 }, { "entropy": 0.5956594526767731, "epoch": 0.47976, "grad_norm": 3.16373872756958, "learning_rate": 2.6022809123649462e-05, "loss": 0.5966, "mean_token_accuracy": 0.822500205039978, "num_tokens": 621965163.0, "step": 59970 }, { "entropy": 0.6986241340637207, "epoch": 0.47984, "grad_norm": 2.587740182876587, "learning_rate": 2.6018807523009202e-05, "loss": 0.7064, "mean_token_accuracy": 0.7999425411224366, "num_tokens": 622059472.0, "step": 59980 }, { "entropy": 0.6416152358055115, "epoch": 0.47992, "grad_norm": 3.2164387702941895, "learning_rate": 2.6014805922368953e-05, "loss": 0.6362, "mean_token_accuracy": 0.7984353303909302, "num_tokens": 622207046.0, "step": 59990 }, { "entropy": 0.699173954129219, "epoch": 0.48, "grad_norm": 3.897608518600464, "learning_rate": 2.6010804321728693e-05, "loss": 0.6781, "mean_token_accuracy": 0.8198066532611847, "num_tokens": 622246794.0, "step": 60000 }, { "entropy": 0.654422253370285, "epoch": 0.48008, "grad_norm": 2.7212448120117188, "learning_rate": 2.6006802721088437e-05, "loss": 0.6553, "mean_token_accuracy": 0.793447732925415, "num_tokens": 622410634.0, "step": 60010 }, { "entropy": 0.60995654463768, "epoch": 0.48016, "grad_norm": 2.9678404331207275, "learning_rate": 2.6002801120448177e-05, "loss": 0.5983, "mean_token_accuracy": 0.8251031160354614, "num_tokens": 622489452.0, "step": 60020 }, { "entropy": 0.6949496626853943, "epoch": 0.48024, "grad_norm": 1.8300256729125977, "learning_rate": 2.5998799519807928e-05, "loss": 0.7011, "mean_token_accuracy": 0.8013212025165558, "num_tokens": 622582519.0, "step": 60030 }, { "entropy": 0.7074124157428742, "epoch": 0.48032, "grad_norm": 2.3616132736206055, "learning_rate": 2.5994797919167668e-05, "loss": 0.7002, "mean_token_accuracy": 0.7926465511322022, "num_tokens": 622705050.0, "step": 60040 }, { "entropy": 0.695035582780838, "epoch": 0.4804, "grad_norm": 4.841505527496338, "learning_rate": 2.599079631852741e-05, "loss": 0.6908, "mean_token_accuracy": 0.8178074359893799, "num_tokens": 622736279.0, "step": 60050 }, { "entropy": 0.657954341173172, "epoch": 0.48048, "grad_norm": 1.928248405456543, "learning_rate": 2.598679471788716e-05, "loss": 0.6523, "mean_token_accuracy": 0.7929094851016998, "num_tokens": 622899678.0, "step": 60060 }, { "entropy": 0.6853791773319244, "epoch": 0.48056, "grad_norm": 3.6872060298919678, "learning_rate": 2.5982793117246902e-05, "loss": 0.6662, "mean_token_accuracy": 0.811988753080368, "num_tokens": 622974939.0, "step": 60070 }, { "entropy": 0.7295688927173615, "epoch": 0.48064, "grad_norm": 2.755335569381714, "learning_rate": 2.5978791516606643e-05, "loss": 0.7302, "mean_token_accuracy": 0.7925171315670013, "num_tokens": 623067844.0, "step": 60080 }, { "entropy": 0.6839719474315643, "epoch": 0.48072, "grad_norm": 4.035314083099365, "learning_rate": 2.5974789915966387e-05, "loss": 0.681, "mean_token_accuracy": 0.793555223941803, "num_tokens": 623198653.0, "step": 60090 }, { "entropy": 0.6485968887805938, "epoch": 0.4808, "grad_norm": 5.626793384552002, "learning_rate": 2.5970788315326134e-05, "loss": 0.6521, "mean_token_accuracy": 0.823634248971939, "num_tokens": 623235372.0, "step": 60100 }, { "entropy": 0.675769105553627, "epoch": 0.48088, "grad_norm": 2.3848876953125, "learning_rate": 2.5966786714685877e-05, "loss": 0.6806, "mean_token_accuracy": 0.7887823760509491, "num_tokens": 623399212.0, "step": 60110 }, { "entropy": 0.6716840386390686, "epoch": 0.48096, "grad_norm": 3.2013590335845947, "learning_rate": 2.5962785114045618e-05, "loss": 0.6575, "mean_token_accuracy": 0.8042548477649689, "num_tokens": 623496185.0, "step": 60120 }, { "entropy": 0.7081140160560608, "epoch": 0.48104, "grad_norm": 1.9711993932724, "learning_rate": 2.5958783513405365e-05, "loss": 0.7007, "mean_token_accuracy": 0.8008740246295929, "num_tokens": 623591630.0, "step": 60130 }, { "entropy": 0.693169790506363, "epoch": 0.48112, "grad_norm": 2.0127980709075928, "learning_rate": 2.595478191276511e-05, "loss": 0.6964, "mean_token_accuracy": 0.7893802225589752, "num_tokens": 623739682.0, "step": 60140 }, { "entropy": 0.7047802567481994, "epoch": 0.4812, "grad_norm": 4.959932804107666, "learning_rate": 2.5950780312124852e-05, "loss": 0.71, "mean_token_accuracy": 0.8080256283283234, "num_tokens": 623780334.0, "step": 60150 }, { "entropy": 0.6810036897659302, "epoch": 0.48128, "grad_norm": 2.2990827560424805, "learning_rate": 2.5946778711484593e-05, "loss": 0.6742, "mean_token_accuracy": 0.7901731848716735, "num_tokens": 623942891.0, "step": 60160 }, { "entropy": 0.6890356421470643, "epoch": 0.48136, "grad_norm": 4.014347553253174, "learning_rate": 2.594277711084434e-05, "loss": 0.6756, "mean_token_accuracy": 0.8117249727249145, "num_tokens": 624013939.0, "step": 60170 }, { "entropy": 0.6832555532455444, "epoch": 0.48144, "grad_norm": 1.6472880840301514, "learning_rate": 2.5938775510204083e-05, "loss": 0.6856, "mean_token_accuracy": 0.8015100538730622, "num_tokens": 624105923.0, "step": 60180 }, { "entropy": 0.689279067516327, "epoch": 0.48152, "grad_norm": 2.514315128326416, "learning_rate": 2.5934773909563827e-05, "loss": 0.6908, "mean_token_accuracy": 0.7918782293796539, "num_tokens": 624239152.0, "step": 60190 }, { "entropy": 0.6464466422796249, "epoch": 0.4816, "grad_norm": 4.531768321990967, "learning_rate": 2.5930772308923574e-05, "loss": 0.6402, "mean_token_accuracy": 0.8265028357505798, "num_tokens": 624277536.0, "step": 60200 }, { "entropy": 0.6131896376609802, "epoch": 0.48168, "grad_norm": 1.8506656885147095, "learning_rate": 2.5926770708283314e-05, "loss": 0.6071, "mean_token_accuracy": 0.8044321596622467, "num_tokens": 624440532.0, "step": 60210 }, { "entropy": 0.726749700307846, "epoch": 0.48176, "grad_norm": 2.9589426517486572, "learning_rate": 2.5922769107643058e-05, "loss": 0.7167, "mean_token_accuracy": 0.7991234838962555, "num_tokens": 624523499.0, "step": 60220 }, { "entropy": 0.7250353991985321, "epoch": 0.48184, "grad_norm": 2.139944314956665, "learning_rate": 2.5918767507002802e-05, "loss": 0.7169, "mean_token_accuracy": 0.797137838602066, "num_tokens": 624616713.0, "step": 60230 }, { "entropy": 0.6512614667415619, "epoch": 0.48192, "grad_norm": 2.018606185913086, "learning_rate": 2.591476590636255e-05, "loss": 0.6495, "mean_token_accuracy": 0.7959393858909607, "num_tokens": 624767218.0, "step": 60240 }, { "entropy": 0.6610744416713714, "epoch": 0.482, "grad_norm": 5.384034633636475, "learning_rate": 2.591076430572229e-05, "loss": 0.6752, "mean_token_accuracy": 0.8133513808250428, "num_tokens": 624816008.0, "step": 60250 }, { "entropy": 0.6688408195972443, "epoch": 0.48208, "grad_norm": 1.9513041973114014, "learning_rate": 2.5906762705082033e-05, "loss": 0.6631, "mean_token_accuracy": 0.7917571425437927, "num_tokens": 624979611.0, "step": 60260 }, { "entropy": 0.6880227297544479, "epoch": 0.48216, "grad_norm": 3.7936697006225586, "learning_rate": 2.590276110444178e-05, "loss": 0.6832, "mean_token_accuracy": 0.805259644985199, "num_tokens": 625056858.0, "step": 60270 }, { "entropy": 0.7154501795768737, "epoch": 0.48224, "grad_norm": 2.0270984172821045, "learning_rate": 2.5898759503801524e-05, "loss": 0.7029, "mean_token_accuracy": 0.7965446352958679, "num_tokens": 625149610.0, "step": 60280 }, { "entropy": 0.7183961272239685, "epoch": 0.48232, "grad_norm": 2.0879456996917725, "learning_rate": 2.5894757903161264e-05, "loss": 0.7108, "mean_token_accuracy": 0.7802112758159637, "num_tokens": 625299018.0, "step": 60290 }, { "entropy": 0.6855530083179474, "epoch": 0.4824, "grad_norm": 5.796452045440674, "learning_rate": 2.5890756302521008e-05, "loss": 0.6902, "mean_token_accuracy": 0.8122883021831513, "num_tokens": 625343165.0, "step": 60300 }, { "entropy": 0.6316541582345963, "epoch": 0.48248, "grad_norm": 1.4668298959732056, "learning_rate": 2.5886754701880755e-05, "loss": 0.6278, "mean_token_accuracy": 0.8004763066768646, "num_tokens": 625507005.0, "step": 60310 }, { "entropy": 0.6495530039072037, "epoch": 0.48256, "grad_norm": 3.189734697341919, "learning_rate": 2.58827531012405e-05, "loss": 0.6353, "mean_token_accuracy": 0.8193647384643554, "num_tokens": 625586356.0, "step": 60320 }, { "entropy": 0.6781528890132904, "epoch": 0.48264, "grad_norm": 2.179922342300415, "learning_rate": 2.587875150060024e-05, "loss": 0.6845, "mean_token_accuracy": 0.8064682245254516, "num_tokens": 625678592.0, "step": 60330 }, { "entropy": 0.6662903666496277, "epoch": 0.48272, "grad_norm": 1.9847166538238525, "learning_rate": 2.587474989995999e-05, "loss": 0.6671, "mean_token_accuracy": 0.7948631405830383, "num_tokens": 625823879.0, "step": 60340 }, { "entropy": 0.7187945425510407, "epoch": 0.4828, "grad_norm": 4.355136871337891, "learning_rate": 2.587074829931973e-05, "loss": 0.7229, "mean_token_accuracy": 0.8119989693164825, "num_tokens": 625869790.0, "step": 60350 }, { "entropy": 0.6402855277061462, "epoch": 0.48288, "grad_norm": 1.7204716205596924, "learning_rate": 2.5866746698679474e-05, "loss": 0.633, "mean_token_accuracy": 0.8008914411067962, "num_tokens": 626033149.0, "step": 60360 }, { "entropy": 0.6713751465082168, "epoch": 0.48296, "grad_norm": 4.606080532073975, "learning_rate": 2.5862745098039214e-05, "loss": 0.6681, "mean_token_accuracy": 0.8059638381004334, "num_tokens": 626112023.0, "step": 60370 }, { "entropy": 0.6912060767412186, "epoch": 0.48304, "grad_norm": 1.6856249570846558, "learning_rate": 2.5858743497398964e-05, "loss": 0.6844, "mean_token_accuracy": 0.8011143505573273, "num_tokens": 626205244.0, "step": 60380 }, { "entropy": 0.6868738770484925, "epoch": 0.48312, "grad_norm": 1.713769793510437, "learning_rate": 2.5854741896758705e-05, "loss": 0.675, "mean_token_accuracy": 0.7898740947246552, "num_tokens": 626351384.0, "step": 60390 }, { "entropy": 0.7616458773612976, "epoch": 0.4832, "grad_norm": 4.341314315795898, "learning_rate": 2.585074029611845e-05, "loss": 0.7643, "mean_token_accuracy": 0.8016923904418946, "num_tokens": 626389501.0, "step": 60400 }, { "entropy": 0.6233104944229126, "epoch": 0.48328, "grad_norm": 1.9540355205535889, "learning_rate": 2.584673869547819e-05, "loss": 0.6198, "mean_token_accuracy": 0.8042440176010132, "num_tokens": 626553341.0, "step": 60410 }, { "entropy": 0.6404976010322571, "epoch": 0.48336, "grad_norm": 3.1986806392669678, "learning_rate": 2.584273709483794e-05, "loss": 0.6339, "mean_token_accuracy": 0.8143339455127716, "num_tokens": 626643615.0, "step": 60420 }, { "entropy": 0.7193897366523743, "epoch": 0.48344, "grad_norm": 1.955572247505188, "learning_rate": 2.583873549419768e-05, "loss": 0.7115, "mean_token_accuracy": 0.7953936994075775, "num_tokens": 626738145.0, "step": 60430 }, { "entropy": 0.673370224237442, "epoch": 0.48352, "grad_norm": 3.722048759460449, "learning_rate": 2.5834733893557423e-05, "loss": 0.6763, "mean_token_accuracy": 0.7943370938301086, "num_tokens": 626874141.0, "step": 60440 }, { "entropy": 0.6698231220245361, "epoch": 0.4836, "grad_norm": 3.86653733253479, "learning_rate": 2.583073229291717e-05, "loss": 0.6521, "mean_token_accuracy": 0.8234595715999603, "num_tokens": 626911788.0, "step": 60450 }, { "entropy": 0.6014222800731659, "epoch": 0.48368, "grad_norm": 1.2901018857955933, "learning_rate": 2.5826730692276914e-05, "loss": 0.5997, "mean_token_accuracy": 0.8066744089126587, "num_tokens": 627075628.0, "step": 60460 }, { "entropy": 0.6744979172945023, "epoch": 0.48376, "grad_norm": 3.069603204727173, "learning_rate": 2.5822729091636654e-05, "loss": 0.6701, "mean_token_accuracy": 0.8040855884552002, "num_tokens": 627176814.0, "step": 60470 }, { "entropy": 0.7077443987131119, "epoch": 0.48384, "grad_norm": 4.633269309997559, "learning_rate": 2.5818727490996398e-05, "loss": 0.7027, "mean_token_accuracy": 0.8004673182964325, "num_tokens": 627272046.0, "step": 60480 }, { "entropy": 0.70757497549057, "epoch": 0.48392, "grad_norm": 1.915460228919983, "learning_rate": 2.5814725890356145e-05, "loss": 0.7005, "mean_token_accuracy": 0.7850109994411468, "num_tokens": 627414708.0, "step": 60490 }, { "entropy": 0.6954103350639343, "epoch": 0.484, "grad_norm": 4.695506572723389, "learning_rate": 2.581072428971589e-05, "loss": 0.7021, "mean_token_accuracy": 0.812529593706131, "num_tokens": 627457835.0, "step": 60500 }, { "entropy": 0.6489770591259003, "epoch": 0.48408, "grad_norm": 1.9582792520523071, "learning_rate": 2.580672268907563e-05, "loss": 0.6439, "mean_token_accuracy": 0.7959849238395691, "num_tokens": 627620988.0, "step": 60510 }, { "entropy": 0.66464963555336, "epoch": 0.48416, "grad_norm": 2.7156381607055664, "learning_rate": 2.5802721088435376e-05, "loss": 0.6519, "mean_token_accuracy": 0.8136729598045349, "num_tokens": 627695366.0, "step": 60520 }, { "entropy": 0.618604964017868, "epoch": 0.48424, "grad_norm": 1.7725073099136353, "learning_rate": 2.579871948779512e-05, "loss": 0.6119, "mean_token_accuracy": 0.8209079802036285, "num_tokens": 627787381.0, "step": 60530 }, { "entropy": 0.7000254154205322, "epoch": 0.48432, "grad_norm": 2.8047409057617188, "learning_rate": 2.5794717887154864e-05, "loss": 0.691, "mean_token_accuracy": 0.7892709672451019, "num_tokens": 627932371.0, "step": 60540 }, { "entropy": 0.6917384564876556, "epoch": 0.4844, "grad_norm": 5.221975326538086, "learning_rate": 2.5790716286514604e-05, "loss": 0.6894, "mean_token_accuracy": 0.8145421802997589, "num_tokens": 627968977.0, "step": 60550 }, { "entropy": 0.5948905944824219, "epoch": 0.48448, "grad_norm": 2.0520856380462646, "learning_rate": 2.578671468587435e-05, "loss": 0.5915, "mean_token_accuracy": 0.8063141167163849, "num_tokens": 628132817.0, "step": 60560 }, { "entropy": 0.6416551530361175, "epoch": 0.48456, "grad_norm": 2.4835245609283447, "learning_rate": 2.5782713085234095e-05, "loss": 0.631, "mean_token_accuracy": 0.8163849413394928, "num_tokens": 628226326.0, "step": 60570 }, { "entropy": 0.6887041985988617, "epoch": 0.48464, "grad_norm": 2.239973306655884, "learning_rate": 2.577871148459384e-05, "loss": 0.6885, "mean_token_accuracy": 0.7987572491168976, "num_tokens": 628322410.0, "step": 60580 }, { "entropy": 0.6593100368976593, "epoch": 0.48472, "grad_norm": 2.4777252674102783, "learning_rate": 2.5774709883953586e-05, "loss": 0.6564, "mean_token_accuracy": 0.7966041564941406, "num_tokens": 628464900.0, "step": 60590 }, { "entropy": 0.5972219675779342, "epoch": 0.4848, "grad_norm": 5.234426975250244, "learning_rate": 2.5770708283313326e-05, "loss": 0.5882, "mean_token_accuracy": 0.8377853572368622, "num_tokens": 628506660.0, "step": 60600 }, { "entropy": 0.6215960025787354, "epoch": 0.48488, "grad_norm": 2.324458122253418, "learning_rate": 2.576670668267307e-05, "loss": 0.6222, "mean_token_accuracy": 0.8007155239582062, "num_tokens": 628669662.0, "step": 60610 }, { "entropy": 0.6908172607421875, "epoch": 0.48496, "grad_norm": 3.2872562408447266, "learning_rate": 2.5762705082032813e-05, "loss": 0.6794, "mean_token_accuracy": 0.8052700519561767, "num_tokens": 628750513.0, "step": 60620 }, { "entropy": 0.7281113684177398, "epoch": 0.48504, "grad_norm": 1.621991515159607, "learning_rate": 2.575870348139256e-05, "loss": 0.7267, "mean_token_accuracy": 0.7942057073116302, "num_tokens": 628843713.0, "step": 60630 }, { "entropy": 0.7135819375514985, "epoch": 0.48512, "grad_norm": 2.3050220012664795, "learning_rate": 2.57547018807523e-05, "loss": 0.7162, "mean_token_accuracy": 0.7856935918331146, "num_tokens": 628976774.0, "step": 60640 }, { "entropy": 0.659039306640625, "epoch": 0.4852, "grad_norm": 4.206763744354248, "learning_rate": 2.5750700280112045e-05, "loss": 0.6478, "mean_token_accuracy": 0.8231286823749542, "num_tokens": 629015288.0, "step": 60650 }, { "entropy": 0.6456834077835083, "epoch": 0.48528, "grad_norm": 2.418494939804077, "learning_rate": 2.5746698679471792e-05, "loss": 0.6477, "mean_token_accuracy": 0.796977287530899, "num_tokens": 629179128.0, "step": 60660 }, { "entropy": 0.649130892753601, "epoch": 0.48536, "grad_norm": 3.010709285736084, "learning_rate": 2.5742697078831535e-05, "loss": 0.6402, "mean_token_accuracy": 0.816287761926651, "num_tokens": 629264881.0, "step": 60670 }, { "entropy": 0.7137044787406921, "epoch": 0.48544, "grad_norm": 1.9888287782669067, "learning_rate": 2.5738695478191276e-05, "loss": 0.7047, "mean_token_accuracy": 0.7971965491771698, "num_tokens": 629357291.0, "step": 60680 }, { "entropy": 0.662259042263031, "epoch": 0.48552, "grad_norm": 3.0527095794677734, "learning_rate": 2.573469387755102e-05, "loss": 0.6653, "mean_token_accuracy": 0.796947717666626, "num_tokens": 629500438.0, "step": 60690 }, { "entropy": 0.649699541926384, "epoch": 0.4856, "grad_norm": 4.592152118682861, "learning_rate": 2.5730692276910767e-05, "loss": 0.6347, "mean_token_accuracy": 0.8234811663627625, "num_tokens": 629539578.0, "step": 60700 }, { "entropy": 0.6520035862922668, "epoch": 0.48568, "grad_norm": 1.6995670795440674, "learning_rate": 2.572669067627051e-05, "loss": 0.6505, "mean_token_accuracy": 0.7974719107151031, "num_tokens": 629703418.0, "step": 60710 }, { "entropy": 0.5930584579706192, "epoch": 0.48576, "grad_norm": 3.0712714195251465, "learning_rate": 2.572268907563025e-05, "loss": 0.5838, "mean_token_accuracy": 0.8268367886543274, "num_tokens": 629782575.0, "step": 60720 }, { "entropy": 0.6259035885334014, "epoch": 0.48584, "grad_norm": 1.6924384832382202, "learning_rate": 2.571868747499e-05, "loss": 0.6172, "mean_token_accuracy": 0.8183516085147857, "num_tokens": 629876620.0, "step": 60730 }, { "entropy": 0.6789948642253876, "epoch": 0.48592, "grad_norm": 2.5958831310272217, "learning_rate": 2.571468587434974e-05, "loss": 0.6865, "mean_token_accuracy": 0.7890527606010437, "num_tokens": 630022756.0, "step": 60740 }, { "entropy": 0.7151313692331314, "epoch": 0.486, "grad_norm": 4.878241062164307, "learning_rate": 2.5710684273709485e-05, "loss": 0.7075, "mean_token_accuracy": 0.8104170262813568, "num_tokens": 630063245.0, "step": 60750 }, { "entropy": 0.689256340265274, "epoch": 0.48608, "grad_norm": 2.0439679622650146, "learning_rate": 2.5706682673069225e-05, "loss": 0.6763, "mean_token_accuracy": 0.7907975196838379, "num_tokens": 630227085.0, "step": 60760 }, { "entropy": 0.6626453995704651, "epoch": 0.48616, "grad_norm": 4.432923316955566, "learning_rate": 2.5702681072428976e-05, "loss": 0.663, "mean_token_accuracy": 0.8089427947998047, "num_tokens": 630317266.0, "step": 60770 }, { "entropy": 0.7455607414245605, "epoch": 0.48624, "grad_norm": 2.905771255493164, "learning_rate": 2.5698679471788716e-05, "loss": 0.7432, "mean_token_accuracy": 0.7900298774242401, "num_tokens": 630410822.0, "step": 60780 }, { "entropy": 0.7070181190967559, "epoch": 0.48632, "grad_norm": 2.2306971549987793, "learning_rate": 2.569467787114846e-05, "loss": 0.6987, "mean_token_accuracy": 0.78502858877182, "num_tokens": 630554730.0, "step": 60790 }, { "entropy": 0.5894596397876739, "epoch": 0.4864, "grad_norm": 5.016933441162109, "learning_rate": 2.5690676270508207e-05, "loss": 0.5863, "mean_token_accuracy": 0.8347872853279114, "num_tokens": 630594271.0, "step": 60800 }, { "entropy": 0.6199175715446472, "epoch": 0.48648, "grad_norm": 1.357535481452942, "learning_rate": 2.568667466986795e-05, "loss": 0.6176, "mean_token_accuracy": 0.8026746451854706, "num_tokens": 630758111.0, "step": 60810 }, { "entropy": 0.6065685868263244, "epoch": 0.48656, "grad_norm": 3.0086474418640137, "learning_rate": 2.568267306922769e-05, "loss": 0.6021, "mean_token_accuracy": 0.8188140511512756, "num_tokens": 630850735.0, "step": 60820 }, { "entropy": 0.6786951363086701, "epoch": 0.48664, "grad_norm": 2.7696852684020996, "learning_rate": 2.5678671468587435e-05, "loss": 0.6762, "mean_token_accuracy": 0.8060827732086182, "num_tokens": 630946278.0, "step": 60830 }, { "entropy": 0.6546412289142609, "epoch": 0.48672, "grad_norm": 1.9155973196029663, "learning_rate": 2.5674669867947182e-05, "loss": 0.656, "mean_token_accuracy": 0.79577756524086, "num_tokens": 631093452.0, "step": 60840 }, { "entropy": 0.6864094197750091, "epoch": 0.4868, "grad_norm": 5.036472320556641, "learning_rate": 2.5670668267306926e-05, "loss": 0.6828, "mean_token_accuracy": 0.8190833628177643, "num_tokens": 631133675.0, "step": 60850 }, { "entropy": 0.6708236932754517, "epoch": 0.48688, "grad_norm": 1.4419736862182617, "learning_rate": 2.5666666666666666e-05, "loss": 0.6715, "mean_token_accuracy": 0.787194675207138, "num_tokens": 631297515.0, "step": 60860 }, { "entropy": 0.6858843564987183, "epoch": 0.48696, "grad_norm": 3.3391377925872803, "learning_rate": 2.5662665066026413e-05, "loss": 0.6719, "mean_token_accuracy": 0.8085824251174927, "num_tokens": 631382996.0, "step": 60870 }, { "entropy": 0.6617105603218079, "epoch": 0.48704, "grad_norm": 1.5766263008117676, "learning_rate": 2.5658663465386157e-05, "loss": 0.6711, "mean_token_accuracy": 0.8056246101856231, "num_tokens": 631477103.0, "step": 60880 }, { "entropy": 0.6861165463924408, "epoch": 0.48712, "grad_norm": 2.98248028755188, "learning_rate": 2.56546618647459e-05, "loss": 0.6771, "mean_token_accuracy": 0.7946921765804291, "num_tokens": 631608245.0, "step": 60890 }, { "entropy": 0.6901803076267242, "epoch": 0.4872, "grad_norm": 4.024087905883789, "learning_rate": 2.565066026410564e-05, "loss": 0.686, "mean_token_accuracy": 0.8194124281406403, "num_tokens": 631646730.0, "step": 60900 }, { "entropy": 0.6915896773338318, "epoch": 0.48728, "grad_norm": 2.1810989379882812, "learning_rate": 2.5646658663465388e-05, "loss": 0.6912, "mean_token_accuracy": 0.7846666038036346, "num_tokens": 631810570.0, "step": 60910 }, { "entropy": 0.6243165910243988, "epoch": 0.48736, "grad_norm": 3.3602466583251953, "learning_rate": 2.564265706282513e-05, "loss": 0.6213, "mean_token_accuracy": 0.8165643513202667, "num_tokens": 631916353.0, "step": 60920 }, { "entropy": 0.6551752209663391, "epoch": 0.48744, "grad_norm": 1.6441222429275513, "learning_rate": 2.5638655462184875e-05, "loss": 0.647, "mean_token_accuracy": 0.8090303897857666, "num_tokens": 632012331.0, "step": 60930 }, { "entropy": 0.7278768002986908, "epoch": 0.48752, "grad_norm": 2.059722900390625, "learning_rate": 2.5634653861544616e-05, "loss": 0.7199, "mean_token_accuracy": 0.7851823806762696, "num_tokens": 632150942.0, "step": 60940 }, { "entropy": 0.6711848825216293, "epoch": 0.4876, "grad_norm": 5.436330795288086, "learning_rate": 2.5630652260904363e-05, "loss": 0.6725, "mean_token_accuracy": 0.821826821565628, "num_tokens": 632192907.0, "step": 60950 }, { "entropy": 0.6207779586315155, "epoch": 0.48768, "grad_norm": 1.8004114627838135, "learning_rate": 2.5626650660264107e-05, "loss": 0.6183, "mean_token_accuracy": 0.800512033700943, "num_tokens": 632356703.0, "step": 60960 }, { "entropy": 0.6305449485778809, "epoch": 0.48776, "grad_norm": 3.333528995513916, "learning_rate": 2.562264905962385e-05, "loss": 0.6303, "mean_token_accuracy": 0.8155445456504822, "num_tokens": 632434741.0, "step": 60970 }, { "entropy": 0.6797397077083588, "epoch": 0.48784, "grad_norm": 1.9399782419204712, "learning_rate": 2.5618647458983597e-05, "loss": 0.6668, "mean_token_accuracy": 0.8089748203754425, "num_tokens": 632527933.0, "step": 60980 }, { "entropy": 0.6903063416481018, "epoch": 0.48792, "grad_norm": 3.9934144020080566, "learning_rate": 2.5614645858343338e-05, "loss": 0.6841, "mean_token_accuracy": 0.793779045343399, "num_tokens": 632662155.0, "step": 60990 }, { "entropy": 0.6328573644161224, "epoch": 0.488, "grad_norm": 4.066674709320068, "learning_rate": 2.561064425770308e-05, "loss": 0.6332, "mean_token_accuracy": 0.8258464217185975, "num_tokens": 632701217.0, "step": 61000 }, { "entropy": 0.6469392597675323, "epoch": 0.48808, "grad_norm": 1.4038935899734497, "learning_rate": 2.5606642657062825e-05, "loss": 0.6528, "mean_token_accuracy": 0.796427708864212, "num_tokens": 632865057.0, "step": 61010 }, { "entropy": 0.6841803848743438, "epoch": 0.48816, "grad_norm": 3.4586682319641113, "learning_rate": 2.5602641056422572e-05, "loss": 0.683, "mean_token_accuracy": 0.8109972417354584, "num_tokens": 632943418.0, "step": 61020 }, { "entropy": 0.6700054705142975, "epoch": 0.48824, "grad_norm": 2.023792266845703, "learning_rate": 2.5598639455782313e-05, "loss": 0.6617, "mean_token_accuracy": 0.8105735778808594, "num_tokens": 633035859.0, "step": 61030 }, { "entropy": 0.6343393385410309, "epoch": 0.48832, "grad_norm": 1.9055157899856567, "learning_rate": 2.5594637855142056e-05, "loss": 0.6304, "mean_token_accuracy": 0.8013386785984039, "num_tokens": 633189876.0, "step": 61040 }, { "entropy": 0.7144968509674072, "epoch": 0.4884, "grad_norm": 4.6955366134643555, "learning_rate": 2.5590636254501803e-05, "loss": 0.7057, "mean_token_accuracy": 0.8112360715866089, "num_tokens": 633234985.0, "step": 61050 }, { "entropy": 0.631145566701889, "epoch": 0.48848, "grad_norm": 1.648148775100708, "learning_rate": 2.5586634653861547e-05, "loss": 0.6417, "mean_token_accuracy": 0.7963910579681397, "num_tokens": 633398825.0, "step": 61060 }, { "entropy": 0.6087826669216156, "epoch": 0.48856, "grad_norm": 4.336842060089111, "learning_rate": 2.5582633053221287e-05, "loss": 0.6032, "mean_token_accuracy": 0.8223304390907288, "num_tokens": 633484019.0, "step": 61070 }, { "entropy": 0.6916113197803497, "epoch": 0.48864, "grad_norm": 2.6689043045043945, "learning_rate": 2.557863145258103e-05, "loss": 0.6887, "mean_token_accuracy": 0.8017820417881012, "num_tokens": 633576426.0, "step": 61080 }, { "entropy": 0.6396049678325653, "epoch": 0.48872, "grad_norm": 1.9989700317382812, "learning_rate": 2.5574629851940778e-05, "loss": 0.6267, "mean_token_accuracy": 0.8071407496929168, "num_tokens": 633716638.0, "step": 61090 }, { "entropy": 0.7063804686069488, "epoch": 0.4888, "grad_norm": 5.928951263427734, "learning_rate": 2.5570628251300522e-05, "loss": 0.7061, "mean_token_accuracy": 0.8104678571224213, "num_tokens": 633757193.0, "step": 61100 }, { "entropy": 0.7049766302108764, "epoch": 0.48888, "grad_norm": 2.122115135192871, "learning_rate": 2.5566626650660262e-05, "loss": 0.708, "mean_token_accuracy": 0.7805765688419342, "num_tokens": 633920531.0, "step": 61110 }, { "entropy": 0.7039755046367645, "epoch": 0.48896, "grad_norm": 2.9377453327178955, "learning_rate": 2.5562625050020013e-05, "loss": 0.6911, "mean_token_accuracy": 0.80340735912323, "num_tokens": 633997964.0, "step": 61120 }, { "entropy": 0.6723601460456848, "epoch": 0.48904, "grad_norm": 2.3815906047821045, "learning_rate": 2.5558623449379753e-05, "loss": 0.6706, "mean_token_accuracy": 0.8070603847503662, "num_tokens": 634091394.0, "step": 61130 }, { "entropy": 0.6928466081619262, "epoch": 0.48912, "grad_norm": 2.913569688796997, "learning_rate": 2.5554621848739497e-05, "loss": 0.6936, "mean_token_accuracy": 0.7914378046989441, "num_tokens": 634229226.0, "step": 61140 }, { "entropy": 0.7114175677299499, "epoch": 0.4892, "grad_norm": 4.667135715484619, "learning_rate": 2.5550620248099237e-05, "loss": 0.7064, "mean_token_accuracy": 0.8153332352638245, "num_tokens": 634264545.0, "step": 61150 }, { "entropy": 0.6664730072021484, "epoch": 0.48928, "grad_norm": 1.9698399305343628, "learning_rate": 2.5546618647458988e-05, "loss": 0.6599, "mean_token_accuracy": 0.7918295204639435, "num_tokens": 634428385.0, "step": 61160 }, { "entropy": 0.6949564158916474, "epoch": 0.48936, "grad_norm": 3.4715709686279297, "learning_rate": 2.5542617046818728e-05, "loss": 0.6954, "mean_token_accuracy": 0.8044065237045288, "num_tokens": 634513592.0, "step": 61170 }, { "entropy": 0.682863712310791, "epoch": 0.48944, "grad_norm": 2.004701852798462, "learning_rate": 2.553861544617847e-05, "loss": 0.6788, "mean_token_accuracy": 0.8037137389183044, "num_tokens": 634606926.0, "step": 61180 }, { "entropy": 0.7192264914512634, "epoch": 0.48952, "grad_norm": 2.697089910507202, "learning_rate": 2.553461384553822e-05, "loss": 0.7179, "mean_token_accuracy": 0.7823391556739807, "num_tokens": 634741898.0, "step": 61190 }, { "entropy": 0.7352488338947296, "epoch": 0.4896, "grad_norm": 4.547582149505615, "learning_rate": 2.5530612244897962e-05, "loss": 0.7183, "mean_token_accuracy": 0.8096485316753388, "num_tokens": 634782338.0, "step": 61200 }, { "entropy": 0.6120879948139191, "epoch": 0.48968, "grad_norm": 2.2640533447265625, "learning_rate": 2.5526610644257703e-05, "loss": 0.6058, "mean_token_accuracy": 0.8058112382888794, "num_tokens": 634942758.0, "step": 61210 }, { "entropy": 0.6091947466135025, "epoch": 0.48976, "grad_norm": 2.5859715938568115, "learning_rate": 2.5522609043617446e-05, "loss": 0.6003, "mean_token_accuracy": 0.8283100128173828, "num_tokens": 635016717.0, "step": 61220 }, { "entropy": 0.610378485918045, "epoch": 0.48984, "grad_norm": 1.6452275514602661, "learning_rate": 2.5518607442977194e-05, "loss": 0.6088, "mean_token_accuracy": 0.823932433128357, "num_tokens": 635111138.0, "step": 61230 }, { "entropy": 0.680195438861847, "epoch": 0.48992, "grad_norm": 2.2942216396331787, "learning_rate": 2.5514605842336937e-05, "loss": 0.6817, "mean_token_accuracy": 0.7949445962905883, "num_tokens": 635253165.0, "step": 61240 }, { "entropy": 0.621348237991333, "epoch": 0.49, "grad_norm": 4.403073310852051, "learning_rate": 2.5510604241696678e-05, "loss": 0.6038, "mean_token_accuracy": 0.828113317489624, "num_tokens": 635297926.0, "step": 61250 }, { "entropy": 0.6360652506351471, "epoch": 0.49008, "grad_norm": 1.3902777433395386, "learning_rate": 2.5506602641056425e-05, "loss": 0.6329, "mean_token_accuracy": 0.7972581863403321, "num_tokens": 635461766.0, "step": 61260 }, { "entropy": 0.6381272971630096, "epoch": 0.49016, "grad_norm": 3.081378936767578, "learning_rate": 2.550260104041617e-05, "loss": 0.6279, "mean_token_accuracy": 0.8219573736190796, "num_tokens": 635545087.0, "step": 61270 }, { "entropy": 0.6298235476016998, "epoch": 0.49024, "grad_norm": 2.3739125728607178, "learning_rate": 2.5498599439775912e-05, "loss": 0.6325, "mean_token_accuracy": 0.8100820183753967, "num_tokens": 635639737.0, "step": 61280 }, { "entropy": 0.6287094950675964, "epoch": 0.49032, "grad_norm": 2.0067508220672607, "learning_rate": 2.5494597839135652e-05, "loss": 0.6264, "mean_token_accuracy": 0.8019667744636536, "num_tokens": 635780162.0, "step": 61290 }, { "entropy": 0.7250434041023255, "epoch": 0.4904, "grad_norm": 4.890738010406494, "learning_rate": 2.54905962384954e-05, "loss": 0.7148, "mean_token_accuracy": 0.8101273655891419, "num_tokens": 635818301.0, "step": 61300 }, { "entropy": 0.6142259299755096, "epoch": 0.49048, "grad_norm": 1.554621696472168, "learning_rate": 2.5486594637855143e-05, "loss": 0.6151, "mean_token_accuracy": 0.8044733166694641, "num_tokens": 635981251.0, "step": 61310 }, { "entropy": 0.679982590675354, "epoch": 0.49056, "grad_norm": 3.3670637607574463, "learning_rate": 2.5482593037214887e-05, "loss": 0.6633, "mean_token_accuracy": 0.8120238900184631, "num_tokens": 636056079.0, "step": 61320 }, { "entropy": 0.6341813445091248, "epoch": 0.49064, "grad_norm": 1.368764877319336, "learning_rate": 2.5478591436574634e-05, "loss": 0.6341, "mean_token_accuracy": 0.811644321680069, "num_tokens": 636150016.0, "step": 61330 }, { "entropy": 0.653733566403389, "epoch": 0.49072, "grad_norm": 1.7780303955078125, "learning_rate": 2.5474589835934374e-05, "loss": 0.6492, "mean_token_accuracy": 0.801109105348587, "num_tokens": 636297696.0, "step": 61340 }, { "entropy": 0.6998235523700714, "epoch": 0.4908, "grad_norm": 6.5420098304748535, "learning_rate": 2.5470588235294118e-05, "loss": 0.7007, "mean_token_accuracy": 0.8076989710330963, "num_tokens": 636341156.0, "step": 61350 }, { "entropy": 0.6545989811420441, "epoch": 0.49088, "grad_norm": 2.9242968559265137, "learning_rate": 2.5466586634653862e-05, "loss": 0.6527, "mean_token_accuracy": 0.7937103152275086, "num_tokens": 636504996.0, "step": 61360 }, { "entropy": 0.6916651964187622, "epoch": 0.49096, "grad_norm": 3.0887844562530518, "learning_rate": 2.546258503401361e-05, "loss": 0.684, "mean_token_accuracy": 0.7996572911739349, "num_tokens": 636597375.0, "step": 61370 }, { "entropy": 0.7261339902877808, "epoch": 0.49104, "grad_norm": 1.6908127069473267, "learning_rate": 2.545858343337335e-05, "loss": 0.7148, "mean_token_accuracy": 0.8001236200332642, "num_tokens": 636689331.0, "step": 61380 }, { "entropy": 0.6726989090442658, "epoch": 0.49112, "grad_norm": 2.848219394683838, "learning_rate": 2.5454581832733093e-05, "loss": 0.6759, "mean_token_accuracy": 0.7924690246582031, "num_tokens": 636821859.0, "step": 61390 }, { "entropy": 0.675788426399231, "epoch": 0.4912, "grad_norm": 6.692137241363525, "learning_rate": 2.545058023209284e-05, "loss": 0.6574, "mean_token_accuracy": 0.8242148697376251, "num_tokens": 636861030.0, "step": 61400 }, { "entropy": 0.6752780973911285, "epoch": 0.49128, "grad_norm": 1.723454475402832, "learning_rate": 2.5446578631452584e-05, "loss": 0.6725, "mean_token_accuracy": 0.7890021979808808, "num_tokens": 637024870.0, "step": 61410 }, { "entropy": 0.6564089000225067, "epoch": 0.49136, "grad_norm": 3.8216006755828857, "learning_rate": 2.5442577030812324e-05, "loss": 0.6421, "mean_token_accuracy": 0.8165695548057557, "num_tokens": 637116499.0, "step": 61420 }, { "entropy": 0.7209795117378235, "epoch": 0.49144, "grad_norm": 3.4220707416534424, "learning_rate": 2.5438575430172068e-05, "loss": 0.7077, "mean_token_accuracy": 0.7968351721763611, "num_tokens": 637208294.0, "step": 61430 }, { "entropy": 0.7073401987552643, "epoch": 0.49152, "grad_norm": 2.4908182621002197, "learning_rate": 2.5434573829531815e-05, "loss": 0.7027, "mean_token_accuracy": 0.7857542872428894, "num_tokens": 637348107.0, "step": 61440 }, { "entropy": 0.6974665522575378, "epoch": 0.4916, "grad_norm": 3.938323497772217, "learning_rate": 2.543057222889156e-05, "loss": 0.6803, "mean_token_accuracy": 0.8160731673240662, "num_tokens": 637386352.0, "step": 61450 }, { "entropy": 0.6761105597019196, "epoch": 0.49168, "grad_norm": 2.2638401985168457, "learning_rate": 2.54265706282513e-05, "loss": 0.6717, "mean_token_accuracy": 0.7875488638877869, "num_tokens": 637550192.0, "step": 61460 }, { "entropy": 0.7370860040187835, "epoch": 0.49176, "grad_norm": 3.2954905033111572, "learning_rate": 2.5422569027611043e-05, "loss": 0.731, "mean_token_accuracy": 0.7959962904453277, "num_tokens": 637633781.0, "step": 61470 }, { "entropy": 0.6728029608726501, "epoch": 0.49184, "grad_norm": 1.4254897832870483, "learning_rate": 2.541856742697079e-05, "loss": 0.6682, "mean_token_accuracy": 0.8092615902423859, "num_tokens": 637726791.0, "step": 61480 }, { "entropy": 0.6761537015438079, "epoch": 0.49192, "grad_norm": 2.378528356552124, "learning_rate": 2.5414565826330534e-05, "loss": 0.6718, "mean_token_accuracy": 0.8012082636356354, "num_tokens": 637857525.0, "step": 61490 }, { "entropy": 0.6331121057271958, "epoch": 0.492, "grad_norm": 5.389688968658447, "learning_rate": 2.5410564225690274e-05, "loss": 0.6443, "mean_token_accuracy": 0.8286308884620667, "num_tokens": 637894294.0, "step": 61500 }, { "entropy": 0.6427782207727433, "epoch": 0.49208, "grad_norm": 3.487179756164551, "learning_rate": 2.5406562625050024e-05, "loss": 0.6406, "mean_token_accuracy": 0.795627748966217, "num_tokens": 638058134.0, "step": 61510 }, { "entropy": 0.6953422546386718, "epoch": 0.49216, "grad_norm": 3.5655863285064697, "learning_rate": 2.5402561024409765e-05, "loss": 0.689, "mean_token_accuracy": 0.8034306824207306, "num_tokens": 638144795.0, "step": 61520 }, { "entropy": 0.727120965719223, "epoch": 0.49224, "grad_norm": 1.850712537765503, "learning_rate": 2.539855942376951e-05, "loss": 0.7207, "mean_token_accuracy": 0.795568710565567, "num_tokens": 638238141.0, "step": 61530 }, { "entropy": 0.6832220435142518, "epoch": 0.49232, "grad_norm": 4.229941368103027, "learning_rate": 2.539455782312925e-05, "loss": 0.6827, "mean_token_accuracy": 0.7966186583042145, "num_tokens": 638363881.0, "step": 61540 }, { "entropy": 0.7400774657726288, "epoch": 0.4924, "grad_norm": 4.61052942276001, "learning_rate": 2.5390556222489e-05, "loss": 0.7268, "mean_token_accuracy": 0.8104795336723327, "num_tokens": 638399666.0, "step": 61550 }, { "entropy": 0.6494962215423584, "epoch": 0.49248, "grad_norm": 2.143777370452881, "learning_rate": 2.538655462184874e-05, "loss": 0.64, "mean_token_accuracy": 0.7957511723041535, "num_tokens": 638562010.0, "step": 61560 }, { "entropy": 0.6175577282905579, "epoch": 0.49256, "grad_norm": 4.1419453620910645, "learning_rate": 2.5382553021208483e-05, "loss": 0.6209, "mean_token_accuracy": 0.8235719263553619, "num_tokens": 638637490.0, "step": 61570 }, { "entropy": 0.6830419063568115, "epoch": 0.49264, "grad_norm": 1.863390564918518, "learning_rate": 2.537855142056823e-05, "loss": 0.6696, "mean_token_accuracy": 0.8080535590648651, "num_tokens": 638731074.0, "step": 61580 }, { "entropy": 0.6584850013256073, "epoch": 0.49272, "grad_norm": 3.516422748565674, "learning_rate": 2.5374549819927974e-05, "loss": 0.6573, "mean_token_accuracy": 0.7963442862033844, "num_tokens": 638866279.0, "step": 61590 }, { "entropy": 0.6584871709346771, "epoch": 0.4928, "grad_norm": 4.068294525146484, "learning_rate": 2.5370548219287714e-05, "loss": 0.6562, "mean_token_accuracy": 0.8260300993919373, "num_tokens": 638903985.0, "step": 61600 }, { "entropy": 0.7033856749534607, "epoch": 0.49288, "grad_norm": 3.083378553390503, "learning_rate": 2.5366546618647458e-05, "loss": 0.6982, "mean_token_accuracy": 0.7823827624320984, "num_tokens": 639067825.0, "step": 61610 }, { "entropy": 0.646616929769516, "epoch": 0.49296, "grad_norm": 3.1443302631378174, "learning_rate": 2.5362545018007205e-05, "loss": 0.6466, "mean_token_accuracy": 0.8133613646030426, "num_tokens": 639153458.0, "step": 61620 }, { "entropy": 0.5938169360160828, "epoch": 0.49304, "grad_norm": 1.321929693222046, "learning_rate": 2.535854341736695e-05, "loss": 0.5926, "mean_token_accuracy": 0.8245396912097931, "num_tokens": 639247006.0, "step": 61630 }, { "entropy": 0.7515891253948211, "epoch": 0.49312, "grad_norm": 2.5771961212158203, "learning_rate": 2.535454181672669e-05, "loss": 0.7491, "mean_token_accuracy": 0.7776521503925323, "num_tokens": 639384604.0, "step": 61640 }, { "entropy": 0.6384440034627914, "epoch": 0.4932, "grad_norm": 5.377412796020508, "learning_rate": 2.535054021608644e-05, "loss": 0.6299, "mean_token_accuracy": 0.8329555630683899, "num_tokens": 639424725.0, "step": 61650 }, { "entropy": 0.7300046920776367, "epoch": 0.49328, "grad_norm": 1.8516730070114136, "learning_rate": 2.534653861544618e-05, "loss": 0.7273, "mean_token_accuracy": 0.7778856635093689, "num_tokens": 639588263.0, "step": 61660 }, { "entropy": 0.609979921579361, "epoch": 0.49336, "grad_norm": 3.741305351257324, "learning_rate": 2.5342537014805924e-05, "loss": 0.6014, "mean_token_accuracy": 0.8243019342422485, "num_tokens": 639674436.0, "step": 61670 }, { "entropy": 0.6567520439624787, "epoch": 0.49344, "grad_norm": 1.8480271100997925, "learning_rate": 2.5338535414165664e-05, "loss": 0.6519, "mean_token_accuracy": 0.8092283964157104, "num_tokens": 639769693.0, "step": 61680 }, { "entropy": 0.6947817087173462, "epoch": 0.49352, "grad_norm": 2.734421491622925, "learning_rate": 2.5334533813525415e-05, "loss": 0.69, "mean_token_accuracy": 0.7932745099067688, "num_tokens": 639900265.0, "step": 61690 }, { "entropy": 0.6531631797552109, "epoch": 0.4936, "grad_norm": 4.876878261566162, "learning_rate": 2.5330532212885155e-05, "loss": 0.6407, "mean_token_accuracy": 0.8234670698642731, "num_tokens": 639934989.0, "step": 61700 }, { "entropy": 0.625449538230896, "epoch": 0.49368, "grad_norm": 1.2584747076034546, "learning_rate": 2.53265306122449e-05, "loss": 0.6269, "mean_token_accuracy": 0.7993649244308472, "num_tokens": 640098829.0, "step": 61710 }, { "entropy": 0.7251902461051941, "epoch": 0.49376, "grad_norm": 2.539231300354004, "learning_rate": 2.5322529011604646e-05, "loss": 0.7113, "mean_token_accuracy": 0.7928549945354462, "num_tokens": 640196247.0, "step": 61720 }, { "entropy": 0.6954361200332642, "epoch": 0.49384, "grad_norm": 1.698061466217041, "learning_rate": 2.531852741096439e-05, "loss": 0.7007, "mean_token_accuracy": 0.8000323712825775, "num_tokens": 640291357.0, "step": 61730 }, { "entropy": 0.6604079723358154, "epoch": 0.49392, "grad_norm": 4.161979675292969, "learning_rate": 2.531452581032413e-05, "loss": 0.6594, "mean_token_accuracy": 0.7979464709758759, "num_tokens": 640429817.0, "step": 61740 }, { "entropy": 0.6856221079826355, "epoch": 0.494, "grad_norm": 4.384637832641602, "learning_rate": 2.5310524209683873e-05, "loss": 0.6895, "mean_token_accuracy": 0.8151118993759155, "num_tokens": 640469012.0, "step": 61750 }, { "entropy": 0.6247875154018402, "epoch": 0.49408, "grad_norm": 1.9110960960388184, "learning_rate": 2.530652260904362e-05, "loss": 0.6194, "mean_token_accuracy": 0.8012212991714478, "num_tokens": 640632852.0, "step": 61760 }, { "entropy": 0.712902969121933, "epoch": 0.49416, "grad_norm": 3.7236571311950684, "learning_rate": 2.5302521008403364e-05, "loss": 0.7056, "mean_token_accuracy": 0.7976668298244476, "num_tokens": 640734649.0, "step": 61770 }, { "entropy": 0.703230232000351, "epoch": 0.49424, "grad_norm": 2.5924551486968994, "learning_rate": 2.5298519407763105e-05, "loss": 0.7056, "mean_token_accuracy": 0.7943346083164216, "num_tokens": 640829916.0, "step": 61780 }, { "entropy": 0.6527904093265533, "epoch": 0.49432, "grad_norm": 2.3813133239746094, "learning_rate": 2.529451780712285e-05, "loss": 0.6466, "mean_token_accuracy": 0.8030732572078705, "num_tokens": 640970428.0, "step": 61790 }, { "entropy": 0.7228342711925506, "epoch": 0.4944, "grad_norm": 4.628760814666748, "learning_rate": 2.5290516206482595e-05, "loss": 0.7158, "mean_token_accuracy": 0.8096573293209076, "num_tokens": 641005214.0, "step": 61800 }, { "entropy": 0.6358957409858703, "epoch": 0.49448, "grad_norm": 1.9925751686096191, "learning_rate": 2.528651460584234e-05, "loss": 0.6369, "mean_token_accuracy": 0.7969456791877747, "num_tokens": 641168968.0, "step": 61810 }, { "entropy": 0.6598421633243561, "epoch": 0.49456, "grad_norm": 3.8877108097076416, "learning_rate": 2.528251300520208e-05, "loss": 0.6492, "mean_token_accuracy": 0.8144867479801178, "num_tokens": 641248900.0, "step": 61820 }, { "entropy": 0.7025318384170532, "epoch": 0.49464, "grad_norm": 1.4696046113967896, "learning_rate": 2.5278511404561827e-05, "loss": 0.7031, "mean_token_accuracy": 0.8022305607795716, "num_tokens": 641340366.0, "step": 61830 }, { "entropy": 0.681985604763031, "epoch": 0.49472, "grad_norm": 2.065983295440674, "learning_rate": 2.527450980392157e-05, "loss": 0.6735, "mean_token_accuracy": 0.793894749879837, "num_tokens": 641480433.0, "step": 61840 }, { "entropy": 0.6720235168933868, "epoch": 0.4948, "grad_norm": 5.11932897567749, "learning_rate": 2.5270508203281314e-05, "loss": 0.652, "mean_token_accuracy": 0.8217333436012269, "num_tokens": 641520431.0, "step": 61850 }, { "entropy": 0.6453580468893051, "epoch": 0.49488, "grad_norm": 2.25927734375, "learning_rate": 2.526650660264106e-05, "loss": 0.652, "mean_token_accuracy": 0.7957315683364868, "num_tokens": 641684271.0, "step": 61860 }, { "entropy": 0.7657237440347672, "epoch": 0.49496, "grad_norm": 5.115231513977051, "learning_rate": 2.52625050020008e-05, "loss": 0.7551, "mean_token_accuracy": 0.7866011500358582, "num_tokens": 641777865.0, "step": 61870 }, { "entropy": 0.6775718629360199, "epoch": 0.49504, "grad_norm": 1.9866386651992798, "learning_rate": 2.5258503401360545e-05, "loss": 0.6812, "mean_token_accuracy": 0.8032029271125793, "num_tokens": 641871939.0, "step": 61880 }, { "entropy": 0.6574476838111878, "epoch": 0.49512, "grad_norm": 2.5653135776519775, "learning_rate": 2.525450180072029e-05, "loss": 0.6544, "mean_token_accuracy": 0.7972841560840607, "num_tokens": 642011414.0, "step": 61890 }, { "entropy": 0.7157049000263214, "epoch": 0.4952, "grad_norm": 4.738633155822754, "learning_rate": 2.5250500200080036e-05, "loss": 0.7017, "mean_token_accuracy": 0.8112557053565979, "num_tokens": 642050354.0, "step": 61900 }, { "entropy": 0.6356938242912292, "epoch": 0.49528, "grad_norm": 1.3552955389022827, "learning_rate": 2.5246498599439776e-05, "loss": 0.633, "mean_token_accuracy": 0.7981924831867218, "num_tokens": 642214194.0, "step": 61910 }, { "entropy": 0.6703832656145096, "epoch": 0.49536, "grad_norm": 3.8956546783447266, "learning_rate": 2.524249699879952e-05, "loss": 0.6577, "mean_token_accuracy": 0.8067448496818542, "num_tokens": 642306101.0, "step": 61920 }, { "entropy": 0.6699199557304383, "epoch": 0.49544, "grad_norm": 2.334649085998535, "learning_rate": 2.5238495398159267e-05, "loss": 0.6808, "mean_token_accuracy": 0.8029103577136993, "num_tokens": 642400477.0, "step": 61930 }, { "entropy": 0.6236468136310578, "epoch": 0.49552, "grad_norm": 2.774244785308838, "learning_rate": 2.523449379751901e-05, "loss": 0.6152, "mean_token_accuracy": 0.8099499881267548, "num_tokens": 642538221.0, "step": 61940 }, { "entropy": 0.6559976279735565, "epoch": 0.4956, "grad_norm": 4.147159576416016, "learning_rate": 2.523049219687875e-05, "loss": 0.6336, "mean_token_accuracy": 0.823469340801239, "num_tokens": 642577917.0, "step": 61950 }, { "entropy": 0.6433407068252563, "epoch": 0.49568, "grad_norm": 1.8873096704483032, "learning_rate": 2.5226490596238495e-05, "loss": 0.6453, "mean_token_accuracy": 0.7938080132007599, "num_tokens": 642741757.0, "step": 61960 }, { "entropy": 0.6246983885765076, "epoch": 0.49576, "grad_norm": 3.234269142150879, "learning_rate": 2.5222488995598242e-05, "loss": 0.6297, "mean_token_accuracy": 0.8134955406188965, "num_tokens": 642829296.0, "step": 61970 }, { "entropy": 0.7220762431621551, "epoch": 0.49584, "grad_norm": 1.9333019256591797, "learning_rate": 2.5218487394957986e-05, "loss": 0.7208, "mean_token_accuracy": 0.796004843711853, "num_tokens": 642920969.0, "step": 61980 }, { "entropy": 0.6779268503189086, "epoch": 0.49592, "grad_norm": 3.0002970695495605, "learning_rate": 2.5214485794317726e-05, "loss": 0.6728, "mean_token_accuracy": 0.7961925566196442, "num_tokens": 643052664.0, "step": 61990 }, { "entropy": 0.6185632586479187, "epoch": 0.496, "grad_norm": 4.332509994506836, "learning_rate": 2.5210484193677476e-05, "loss": 0.6164, "mean_token_accuracy": 0.8288644194602967, "num_tokens": 643092274.0, "step": 62000 }, { "entropy": 0.6498896956443787, "epoch": 0.49608, "grad_norm": 1.640539526939392, "learning_rate": 2.5206482593037217e-05, "loss": 0.6519, "mean_token_accuracy": 0.7972459733486176, "num_tokens": 643256114.0, "step": 62010 }, { "entropy": 0.6275479525327683, "epoch": 0.49616, "grad_norm": 3.5753157138824463, "learning_rate": 2.520248099239696e-05, "loss": 0.616, "mean_token_accuracy": 0.8177673280239105, "num_tokens": 643345532.0, "step": 62020 }, { "entropy": 0.6154300749301911, "epoch": 0.49624, "grad_norm": 1.4192078113555908, "learning_rate": 2.51984793917567e-05, "loss": 0.6141, "mean_token_accuracy": 0.8172683656215668, "num_tokens": 643439637.0, "step": 62030 }, { "entropy": 0.6200065612792969, "epoch": 0.49632, "grad_norm": 3.1881825923919678, "learning_rate": 2.519447779111645e-05, "loss": 0.6211, "mean_token_accuracy": 0.8048364639282226, "num_tokens": 643585187.0, "step": 62040 }, { "entropy": 0.739230626821518, "epoch": 0.4964, "grad_norm": 4.438470840454102, "learning_rate": 2.519047619047619e-05, "loss": 0.7289, "mean_token_accuracy": 0.8048080325126648, "num_tokens": 643627978.0, "step": 62050 }, { "entropy": 0.6529638826847076, "epoch": 0.49648, "grad_norm": 2.954427719116211, "learning_rate": 2.5186474589835935e-05, "loss": 0.6472, "mean_token_accuracy": 0.7932472050189971, "num_tokens": 643791422.0, "step": 62060 }, { "entropy": 0.6619887560606003, "epoch": 0.49656, "grad_norm": 3.073927164077759, "learning_rate": 2.5182472989195676e-05, "loss": 0.6684, "mean_token_accuracy": 0.805577951669693, "num_tokens": 643871131.0, "step": 62070 }, { "entropy": 0.6801847398281098, "epoch": 0.49664, "grad_norm": 1.478524923324585, "learning_rate": 2.5178471388555426e-05, "loss": 0.6673, "mean_token_accuracy": 0.8059222757816314, "num_tokens": 643966752.0, "step": 62080 }, { "entropy": 0.7295742928981781, "epoch": 0.49672, "grad_norm": 3.6017487049102783, "learning_rate": 2.5174469787915166e-05, "loss": 0.7239, "mean_token_accuracy": 0.7853277206420899, "num_tokens": 644089544.0, "step": 62090 }, { "entropy": 0.6095875263214111, "epoch": 0.4968, "grad_norm": 4.787042617797852, "learning_rate": 2.517046818727491e-05, "loss": 0.5838, "mean_token_accuracy": 0.8391948401927948, "num_tokens": 644126021.0, "step": 62100 }, { "entropy": 0.610832592844963, "epoch": 0.49688, "grad_norm": 1.925554633140564, "learning_rate": 2.5166466586634657e-05, "loss": 0.6152, "mean_token_accuracy": 0.8063568592071533, "num_tokens": 644289861.0, "step": 62110 }, { "entropy": 0.6493029177188874, "epoch": 0.49696, "grad_norm": 3.483116865158081, "learning_rate": 2.51624649859944e-05, "loss": 0.6472, "mean_token_accuracy": 0.8101941764354705, "num_tokens": 644375194.0, "step": 62120 }, { "entropy": 0.7063219606876373, "epoch": 0.49704, "grad_norm": 1.8459758758544922, "learning_rate": 2.515846338535414e-05, "loss": 0.6955, "mean_token_accuracy": 0.8030255794525146, "num_tokens": 644469048.0, "step": 62130 }, { "entropy": 0.6659756481647492, "epoch": 0.49712, "grad_norm": 2.186173439025879, "learning_rate": 2.5154461784713885e-05, "loss": 0.6533, "mean_token_accuracy": 0.8069819390773774, "num_tokens": 644587870.0, "step": 62140 }, { "entropy": 0.6931142747402191, "epoch": 0.4972, "grad_norm": 4.733301639556885, "learning_rate": 2.5150460184073632e-05, "loss": 0.6893, "mean_token_accuracy": 0.819972139596939, "num_tokens": 644619647.0, "step": 62150 }, { "entropy": 0.6498240470886231, "epoch": 0.49728, "grad_norm": 2.291694402694702, "learning_rate": 2.5146458583433376e-05, "loss": 0.6498, "mean_token_accuracy": 0.7946629226207733, "num_tokens": 644783487.0, "step": 62160 }, { "entropy": 0.6467716932296753, "epoch": 0.49736, "grad_norm": 3.7272531986236572, "learning_rate": 2.5142456982793116e-05, "loss": 0.6322, "mean_token_accuracy": 0.8160539507865906, "num_tokens": 644870678.0, "step": 62170 }, { "entropy": 0.6977856755256653, "epoch": 0.49744, "grad_norm": 1.8022809028625488, "learning_rate": 2.5138455382152863e-05, "loss": 0.6953, "mean_token_accuracy": 0.8023514568805694, "num_tokens": 644966084.0, "step": 62180 }, { "entropy": 0.694112229347229, "epoch": 0.49752, "grad_norm": 2.436896800994873, "learning_rate": 2.5134453781512607e-05, "loss": 0.696, "mean_token_accuracy": 0.7855395078659058, "num_tokens": 645114395.0, "step": 62190 }, { "entropy": 0.8057370483875275, "epoch": 0.4976, "grad_norm": 4.884848117828369, "learning_rate": 2.513045218087235e-05, "loss": 0.7919, "mean_token_accuracy": 0.7918983995914459, "num_tokens": 645156028.0, "step": 62200 }, { "entropy": 0.6560110628604889, "epoch": 0.49768, "grad_norm": 1.969847559928894, "learning_rate": 2.512645058023209e-05, "loss": 0.6478, "mean_token_accuracy": 0.7966841697692871, "num_tokens": 645319868.0, "step": 62210 }, { "entropy": 0.6700620472431182, "epoch": 0.49776, "grad_norm": 4.097486972808838, "learning_rate": 2.5122448979591838e-05, "loss": 0.6661, "mean_token_accuracy": 0.8078100800514221, "num_tokens": 645405838.0, "step": 62220 }, { "entropy": 0.6249860465526581, "epoch": 0.49784, "grad_norm": 2.154670238494873, "learning_rate": 2.5118447378951582e-05, "loss": 0.6211, "mean_token_accuracy": 0.8176392674446106, "num_tokens": 645500691.0, "step": 62230 }, { "entropy": 0.7415390074253082, "epoch": 0.49792, "grad_norm": 4.0447001457214355, "learning_rate": 2.5114445778311326e-05, "loss": 0.7334, "mean_token_accuracy": 0.7831809341907501, "num_tokens": 645641952.0, "step": 62240 }, { "entropy": 0.7356463253498078, "epoch": 0.498, "grad_norm": 5.431793689727783, "learning_rate": 2.5110444177671073e-05, "loss": 0.7432, "mean_token_accuracy": 0.8111236870288849, "num_tokens": 645680051.0, "step": 62250 }, { "entropy": 0.6299662172794342, "epoch": 0.49808, "grad_norm": 2.092302083969116, "learning_rate": 2.5106442577030813e-05, "loss": 0.6283, "mean_token_accuracy": 0.7999145090579987, "num_tokens": 645843891.0, "step": 62260 }, { "entropy": 0.6619101583957672, "epoch": 0.49816, "grad_norm": 4.193933010101318, "learning_rate": 2.5102440976390557e-05, "loss": 0.6526, "mean_token_accuracy": 0.811648690700531, "num_tokens": 645927906.0, "step": 62270 }, { "entropy": 0.7470445215702057, "epoch": 0.49824, "grad_norm": 1.86629319190979, "learning_rate": 2.50984393757503e-05, "loss": 0.7391, "mean_token_accuracy": 0.7880887627601624, "num_tokens": 646021377.0, "step": 62280 }, { "entropy": 0.6497988283634186, "epoch": 0.49832, "grad_norm": 2.87286376953125, "learning_rate": 2.5094437775110048e-05, "loss": 0.6477, "mean_token_accuracy": 0.7994925558567048, "num_tokens": 646162597.0, "step": 62290 }, { "entropy": 0.7061267793178558, "epoch": 0.4984, "grad_norm": 4.334192276000977, "learning_rate": 2.5090436174469788e-05, "loss": 0.7051, "mean_token_accuracy": 0.812660425901413, "num_tokens": 646203060.0, "step": 62300 }, { "entropy": 0.6814498782157898, "epoch": 0.49848, "grad_norm": 1.611499309539795, "learning_rate": 2.508643457382953e-05, "loss": 0.6778, "mean_token_accuracy": 0.786138254404068, "num_tokens": 646366900.0, "step": 62310 }, { "entropy": 0.6812349379062652, "epoch": 0.49856, "grad_norm": 3.579578399658203, "learning_rate": 2.508243297318928e-05, "loss": 0.6662, "mean_token_accuracy": 0.8086064219474792, "num_tokens": 646454346.0, "step": 62320 }, { "entropy": 0.7146902620792389, "epoch": 0.49864, "grad_norm": 1.712181568145752, "learning_rate": 2.5078431372549022e-05, "loss": 0.7144, "mean_token_accuracy": 0.7990976333618164, "num_tokens": 646546775.0, "step": 62330 }, { "entropy": 0.6708421111106873, "epoch": 0.49872, "grad_norm": 2.250138282775879, "learning_rate": 2.5074429771908763e-05, "loss": 0.67, "mean_token_accuracy": 0.7931948840618134, "num_tokens": 646682521.0, "step": 62340 }, { "entropy": 0.6086583703756332, "epoch": 0.4988, "grad_norm": 6.009667873382568, "learning_rate": 2.5070428171268506e-05, "loss": 0.6051, "mean_token_accuracy": 0.8348709225654602, "num_tokens": 646721959.0, "step": 62350 }, { "entropy": 0.6493963837623596, "epoch": 0.49888, "grad_norm": 1.5808123350143433, "learning_rate": 2.5066426570628254e-05, "loss": 0.6466, "mean_token_accuracy": 0.7934416234493256, "num_tokens": 646885799.0, "step": 62360 }, { "entropy": 0.6767644882202148, "epoch": 0.49896, "grad_norm": 3.2384064197540283, "learning_rate": 2.5062424969987997e-05, "loss": 0.667, "mean_token_accuracy": 0.8061435103416443, "num_tokens": 646978785.0, "step": 62370 }, { "entropy": 0.7038731873035431, "epoch": 0.49904, "grad_norm": 1.68943452835083, "learning_rate": 2.5058423369347738e-05, "loss": 0.6927, "mean_token_accuracy": 0.7976142227649688, "num_tokens": 647073570.0, "step": 62380 }, { "entropy": 0.7214114665985107, "epoch": 0.49912, "grad_norm": 3.5589962005615234, "learning_rate": 2.5054421768707488e-05, "loss": 0.7228, "mean_token_accuracy": 0.7851906597614289, "num_tokens": 647210875.0, "step": 62390 }, { "entropy": 0.7226861745119095, "epoch": 0.4992, "grad_norm": 4.537856101989746, "learning_rate": 2.505042016806723e-05, "loss": 0.7138, "mean_token_accuracy": 0.8099264323711395, "num_tokens": 647247287.0, "step": 62400 }, { "entropy": 0.6816055536270141, "epoch": 0.49928, "grad_norm": 2.1167142391204834, "learning_rate": 2.5046418567426972e-05, "loss": 0.6787, "mean_token_accuracy": 0.7864008367061615, "num_tokens": 647411127.0, "step": 62410 }, { "entropy": 0.5947805851697922, "epoch": 0.49936, "grad_norm": 3.330742835998535, "learning_rate": 2.5042416966786712e-05, "loss": 0.5837, "mean_token_accuracy": 0.824076771736145, "num_tokens": 647507121.0, "step": 62420 }, { "entropy": 0.6753363370895386, "epoch": 0.49944, "grad_norm": 1.8846253156661987, "learning_rate": 2.5038415366146463e-05, "loss": 0.6702, "mean_token_accuracy": 0.8011537015438079, "num_tokens": 647603133.0, "step": 62430 }, { "entropy": 0.6538568437099457, "epoch": 0.49952, "grad_norm": 2.1181018352508545, "learning_rate": 2.5034413765506203e-05, "loss": 0.6416, "mean_token_accuracy": 0.7993786215782166, "num_tokens": 647739629.0, "step": 62440 }, { "entropy": 0.6565844297409058, "epoch": 0.4996, "grad_norm": 4.173987865447998, "learning_rate": 2.5030412164865947e-05, "loss": 0.6424, "mean_token_accuracy": 0.8232928514480591, "num_tokens": 647779273.0, "step": 62450 }, { "entropy": 0.6540662288665772, "epoch": 0.49968, "grad_norm": 1.8864684104919434, "learning_rate": 2.5026410564225694e-05, "loss": 0.6525, "mean_token_accuracy": 0.7955355942249298, "num_tokens": 647941775.0, "step": 62460 }, { "entropy": 0.626799327135086, "epoch": 0.49976, "grad_norm": 4.16916036605835, "learning_rate": 2.5022408963585438e-05, "loss": 0.6103, "mean_token_accuracy": 0.8226892590522766, "num_tokens": 648019588.0, "step": 62470 }, { "entropy": 0.6893899142742157, "epoch": 0.49984, "grad_norm": 1.7702187299728394, "learning_rate": 2.5018407362945178e-05, "loss": 0.6942, "mean_token_accuracy": 0.8021574556827545, "num_tokens": 648114413.0, "step": 62480 }, { "entropy": 0.6919751048088074, "epoch": 0.49992, "grad_norm": 4.0777106285095215, "learning_rate": 2.5014405762304922e-05, "loss": 0.6799, "mean_token_accuracy": 0.7956336855888366, "num_tokens": 648249781.0, "step": 62490 }, { "entropy": 0.6595178276300431, "epoch": 0.5, "grad_norm": 4.1647844314575195, "learning_rate": 2.501040416166467e-05, "loss": 0.6528, "mean_token_accuracy": 0.8219697117805481, "num_tokens": 648290867.0, "step": 62500 }, { "entropy": 0.6507190644741059, "epoch": 0.50008, "grad_norm": 2.509931802749634, "learning_rate": 2.5006402561024413e-05, "loss": 0.6476, "mean_token_accuracy": 0.7969772934913635, "num_tokens": 648454707.0, "step": 62510 }, { "entropy": 0.592496594786644, "epoch": 0.50016, "grad_norm": 3.7420756816864014, "learning_rate": 2.5002400960384153e-05, "loss": 0.5937, "mean_token_accuracy": 0.8238984227180481, "num_tokens": 648544979.0, "step": 62520 }, { "entropy": 0.7512158453464508, "epoch": 0.50024, "grad_norm": 2.197646141052246, "learning_rate": 2.49983993597439e-05, "loss": 0.7472, "mean_token_accuracy": 0.7918177366256713, "num_tokens": 648640279.0, "step": 62530 }, { "entropy": 0.6568575620651245, "epoch": 0.50032, "grad_norm": 2.477297782897949, "learning_rate": 2.4994397759103644e-05, "loss": 0.6488, "mean_token_accuracy": 0.7974212110042572, "num_tokens": 648777665.0, "step": 62540 }, { "entropy": 0.6219391793012619, "epoch": 0.5004, "grad_norm": 5.481095314025879, "learning_rate": 2.4990396158463387e-05, "loss": 0.6216, "mean_token_accuracy": 0.8310849785804748, "num_tokens": 648817560.0, "step": 62550 }, { "entropy": 0.6187080562114715, "epoch": 0.50048, "grad_norm": 1.3329315185546875, "learning_rate": 2.498639455782313e-05, "loss": 0.6172, "mean_token_accuracy": 0.8019724011421203, "num_tokens": 648981400.0, "step": 62560 }, { "entropy": 0.618156087398529, "epoch": 0.50056, "grad_norm": 2.658069133758545, "learning_rate": 2.4982392957182875e-05, "loss": 0.5989, "mean_token_accuracy": 0.8314850687980652, "num_tokens": 649058175.0, "step": 62570 }, { "entropy": 0.6453337490558624, "epoch": 0.50064, "grad_norm": 2.4425580501556396, "learning_rate": 2.497839135654262e-05, "loss": 0.6501, "mean_token_accuracy": 0.8093362808227539, "num_tokens": 649152189.0, "step": 62580 }, { "entropy": 0.6522208273410797, "epoch": 0.50072, "grad_norm": 2.6286041736602783, "learning_rate": 2.4974389755902362e-05, "loss": 0.6534, "mean_token_accuracy": 0.799595159292221, "num_tokens": 649292289.0, "step": 62590 }, { "entropy": 0.6906574338674545, "epoch": 0.5008, "grad_norm": 4.176985263824463, "learning_rate": 2.4970388155262106e-05, "loss": 0.6995, "mean_token_accuracy": 0.8130217730998993, "num_tokens": 649332859.0, "step": 62600 }, { "entropy": 0.602646803855896, "epoch": 0.50088, "grad_norm": 2.218996524810791, "learning_rate": 2.496638655462185e-05, "loss": 0.5962, "mean_token_accuracy": 0.808225131034851, "num_tokens": 649496100.0, "step": 62610 }, { "entropy": 0.6622371166944504, "epoch": 0.50096, "grad_norm": 3.4745240211486816, "learning_rate": 2.4962384953981593e-05, "loss": 0.6731, "mean_token_accuracy": 0.810307914018631, "num_tokens": 649578293.0, "step": 62620 }, { "entropy": 0.6834226071834564, "epoch": 0.50104, "grad_norm": 1.3698704242706299, "learning_rate": 2.4958383353341337e-05, "loss": 0.6723, "mean_token_accuracy": 0.8032956302165986, "num_tokens": 649673028.0, "step": 62630 }, { "entropy": 0.679769879579544, "epoch": 0.50112, "grad_norm": 3.5876529216766357, "learning_rate": 2.495438175270108e-05, "loss": 0.6705, "mean_token_accuracy": 0.7967566609382629, "num_tokens": 649809332.0, "step": 62640 }, { "entropy": 0.7070474803447724, "epoch": 0.5012, "grad_norm": 5.833940029144287, "learning_rate": 2.4950380152060825e-05, "loss": 0.7181, "mean_token_accuracy": 0.8101207077503204, "num_tokens": 649849787.0, "step": 62650 }, { "entropy": 0.674557089805603, "epoch": 0.50128, "grad_norm": 2.067988157272339, "learning_rate": 2.494637855142057e-05, "loss": 0.6698, "mean_token_accuracy": 0.789405232667923, "num_tokens": 650013627.0, "step": 62660 }, { "entropy": 0.6273655831813812, "epoch": 0.50136, "grad_norm": 3.5669116973876953, "learning_rate": 2.4942376950780312e-05, "loss": 0.616, "mean_token_accuracy": 0.8165471494197846, "num_tokens": 650105863.0, "step": 62670 }, { "entropy": 0.7078760325908661, "epoch": 0.50144, "grad_norm": 1.5231961011886597, "learning_rate": 2.493837535014006e-05, "loss": 0.7153, "mean_token_accuracy": 0.7986895203590393, "num_tokens": 650201063.0, "step": 62680 }, { "entropy": 0.6308655709028244, "epoch": 0.50152, "grad_norm": 3.7285540103912354, "learning_rate": 2.49343737494998e-05, "loss": 0.6268, "mean_token_accuracy": 0.802936065196991, "num_tokens": 650340627.0, "step": 62690 }, { "entropy": 0.7923714101314545, "epoch": 0.5016, "grad_norm": 3.8616318702697754, "learning_rate": 2.4930372148859547e-05, "loss": 0.7989, "mean_token_accuracy": 0.7976057529449463, "num_tokens": 650381376.0, "step": 62700 }, { "entropy": 0.6614140391349792, "epoch": 0.50168, "grad_norm": 1.8729890584945679, "learning_rate": 2.4926370548219287e-05, "loss": 0.6543, "mean_token_accuracy": 0.7957865178585053, "num_tokens": 650545216.0, "step": 62710 }, { "entropy": 0.6194872230291366, "epoch": 0.50176, "grad_norm": 3.1371045112609863, "learning_rate": 2.4922368947579034e-05, "loss": 0.5972, "mean_token_accuracy": 0.8264498829841613, "num_tokens": 650630489.0, "step": 62720 }, { "entropy": 0.6772423923015595, "epoch": 0.50184, "grad_norm": 1.9568305015563965, "learning_rate": 2.4918367346938774e-05, "loss": 0.6859, "mean_token_accuracy": 0.8060913383960724, "num_tokens": 650723939.0, "step": 62730 }, { "entropy": 0.7272939860820771, "epoch": 0.50192, "grad_norm": 3.0279006958007812, "learning_rate": 2.491436574629852e-05, "loss": 0.7234, "mean_token_accuracy": 0.7851949214935303, "num_tokens": 650858270.0, "step": 62740 }, { "entropy": 0.6260966271162033, "epoch": 0.502, "grad_norm": 4.217706680297852, "learning_rate": 2.4910364145658262e-05, "loss": 0.6179, "mean_token_accuracy": 0.8318443834781647, "num_tokens": 650898059.0, "step": 62750 }, { "entropy": 0.6484017372131348, "epoch": 0.50208, "grad_norm": 2.556635618209839, "learning_rate": 2.490636254501801e-05, "loss": 0.6417, "mean_token_accuracy": 0.7986551403999329, "num_tokens": 651061205.0, "step": 62760 }, { "entropy": 0.6024895936250687, "epoch": 0.50216, "grad_norm": 5.583551406860352, "learning_rate": 2.4902360944377753e-05, "loss": 0.5974, "mean_token_accuracy": 0.8250176966190338, "num_tokens": 651139241.0, "step": 62770 }, { "entropy": 0.6733214378356933, "epoch": 0.50224, "grad_norm": 1.6560074090957642, "learning_rate": 2.4898359343737496e-05, "loss": 0.6639, "mean_token_accuracy": 0.8089574158191681, "num_tokens": 651231258.0, "step": 62780 }, { "entropy": 0.6759643316268921, "epoch": 0.50232, "grad_norm": 3.7673325538635254, "learning_rate": 2.489435774309724e-05, "loss": 0.672, "mean_token_accuracy": 0.7969219624996186, "num_tokens": 651367490.0, "step": 62790 }, { "entropy": 0.7036787211894989, "epoch": 0.5024, "grad_norm": 4.180985450744629, "learning_rate": 2.4890356142456984e-05, "loss": 0.6847, "mean_token_accuracy": 0.8141115486621857, "num_tokens": 651406815.0, "step": 62800 }, { "entropy": 0.6016811519861222, "epoch": 0.50248, "grad_norm": 1.7129725217819214, "learning_rate": 2.4886354541816727e-05, "loss": 0.6009, "mean_token_accuracy": 0.8076575398445129, "num_tokens": 651570655.0, "step": 62810 }, { "entropy": 0.6666102051734925, "epoch": 0.50256, "grad_norm": 3.3202004432678223, "learning_rate": 2.488235294117647e-05, "loss": 0.6508, "mean_token_accuracy": 0.8104542434215546, "num_tokens": 651665170.0, "step": 62820 }, { "entropy": 0.7062606692314148, "epoch": 0.50264, "grad_norm": 1.7306122779846191, "learning_rate": 2.4878351340536215e-05, "loss": 0.7077, "mean_token_accuracy": 0.7948248207569122, "num_tokens": 651758941.0, "step": 62830 }, { "entropy": 0.6951185822486877, "epoch": 0.50272, "grad_norm": 2.485482931137085, "learning_rate": 2.4874349739895962e-05, "loss": 0.7067, "mean_token_accuracy": 0.7836464643478394, "num_tokens": 651888713.0, "step": 62840 }, { "entropy": 0.6264256566762925, "epoch": 0.5028, "grad_norm": 4.235004425048828, "learning_rate": 2.4870348139255702e-05, "loss": 0.6221, "mean_token_accuracy": 0.8248792052268982, "num_tokens": 651923915.0, "step": 62850 }, { "entropy": 0.6492510914802552, "epoch": 0.50288, "grad_norm": 1.8772472143173218, "learning_rate": 2.486634653861545e-05, "loss": 0.6392, "mean_token_accuracy": 0.7952674627304077, "num_tokens": 652087755.0, "step": 62860 }, { "entropy": 0.6349306851625443, "epoch": 0.50296, "grad_norm": 2.8424670696258545, "learning_rate": 2.486234493797519e-05, "loss": 0.6294, "mean_token_accuracy": 0.8216066300868988, "num_tokens": 652165263.0, "step": 62870 }, { "entropy": 0.771054881811142, "epoch": 0.50304, "grad_norm": 1.9266026020050049, "learning_rate": 2.4858343337334937e-05, "loss": 0.7695, "mean_token_accuracy": 0.7845880150794983, "num_tokens": 652259372.0, "step": 62880 }, { "entropy": 0.7280334830284119, "epoch": 0.50312, "grad_norm": 3.12068772315979, "learning_rate": 2.4854341736694677e-05, "loss": 0.7168, "mean_token_accuracy": 0.7865588188171386, "num_tokens": 652390230.0, "step": 62890 }, { "entropy": 0.6495499193668366, "epoch": 0.5032, "grad_norm": 4.948546886444092, "learning_rate": 2.4850340136054424e-05, "loss": 0.6768, "mean_token_accuracy": 0.8207981288433075, "num_tokens": 652427980.0, "step": 62900 }, { "entropy": 0.6328753054141998, "epoch": 0.50328, "grad_norm": 2.08833384513855, "learning_rate": 2.4846338535414168e-05, "loss": 0.6308, "mean_token_accuracy": 0.7996152877807617, "num_tokens": 652591820.0, "step": 62910 }, { "entropy": 0.6073993891477585, "epoch": 0.50336, "grad_norm": 5.521646022796631, "learning_rate": 2.484233693477391e-05, "loss": 0.5889, "mean_token_accuracy": 0.8255387485027313, "num_tokens": 652678624.0, "step": 62920 }, { "entropy": 0.6935556113719941, "epoch": 0.50344, "grad_norm": 1.87576162815094, "learning_rate": 2.4838335334133655e-05, "loss": 0.6832, "mean_token_accuracy": 0.8002988219261169, "num_tokens": 652773940.0, "step": 62930 }, { "entropy": 0.6257556676864624, "epoch": 0.50352, "grad_norm": 3.634929656982422, "learning_rate": 2.48343337334934e-05, "loss": 0.6261, "mean_token_accuracy": 0.8041521966457367, "num_tokens": 652919245.0, "step": 62940 }, { "entropy": 0.6464643180370331, "epoch": 0.5036, "grad_norm": 4.749926567077637, "learning_rate": 2.4830332132853143e-05, "loss": 0.6341, "mean_token_accuracy": 0.823677271604538, "num_tokens": 652963637.0, "step": 62950 }, { "entropy": 0.6480957686901092, "epoch": 0.50368, "grad_norm": 1.8564776182174683, "learning_rate": 2.4826330532212886e-05, "loss": 0.64, "mean_token_accuracy": 0.79485222697258, "num_tokens": 653127477.0, "step": 62960 }, { "entropy": 0.7265507221221924, "epoch": 0.50376, "grad_norm": 3.555198907852173, "learning_rate": 2.482232893157263e-05, "loss": 0.7199, "mean_token_accuracy": 0.7991489350795746, "num_tokens": 653213195.0, "step": 62970 }, { "entropy": 0.6912430584430694, "epoch": 0.50384, "grad_norm": 1.8124953508377075, "learning_rate": 2.4818327330932374e-05, "loss": 0.7063, "mean_token_accuracy": 0.7991988599300385, "num_tokens": 653306449.0, "step": 62980 }, { "entropy": 0.6879709959030151, "epoch": 0.50392, "grad_norm": 1.895017147064209, "learning_rate": 2.4814325730292118e-05, "loss": 0.6755, "mean_token_accuracy": 0.7998588562011719, "num_tokens": 653441997.0, "step": 62990 }, { "entropy": 0.7312668114900589, "epoch": 0.504, "grad_norm": 5.153591632843018, "learning_rate": 2.481032412965186e-05, "loss": 0.7337, "mean_token_accuracy": 0.8045348465442658, "num_tokens": 653479438.0, "step": 63000 }, { "entropy": 0.6761442840099334, "epoch": 0.50408, "grad_norm": 2.6738929748535156, "learning_rate": 2.4806322529011605e-05, "loss": 0.6724, "mean_token_accuracy": 0.7876722872257232, "num_tokens": 653642706.0, "step": 63010 }, { "entropy": 0.6358866304159164, "epoch": 0.50416, "grad_norm": 3.1852645874023438, "learning_rate": 2.480232092837135e-05, "loss": 0.619, "mean_token_accuracy": 0.8200814247131347, "num_tokens": 653722323.0, "step": 63020 }, { "entropy": 0.7206106364727021, "epoch": 0.50424, "grad_norm": 1.3579366207122803, "learning_rate": 2.4798319327731092e-05, "loss": 0.7205, "mean_token_accuracy": 0.8003865480422974, "num_tokens": 653815845.0, "step": 63030 }, { "entropy": 0.7060769855976105, "epoch": 0.50432, "grad_norm": 2.3595170974731445, "learning_rate": 2.4794317727090836e-05, "loss": 0.7032, "mean_token_accuracy": 0.7953223884105682, "num_tokens": 653953731.0, "step": 63040 }, { "entropy": 0.6414134174585342, "epoch": 0.5044, "grad_norm": 3.805100917816162, "learning_rate": 2.479031612645058e-05, "loss": 0.625, "mean_token_accuracy": 0.8282906174659729, "num_tokens": 653989194.0, "step": 63050 }, { "entropy": 0.6301013946533203, "epoch": 0.50448, "grad_norm": 1.396146297454834, "learning_rate": 2.4786314525810324e-05, "loss": 0.6332, "mean_token_accuracy": 0.7993954658508301, "num_tokens": 654153034.0, "step": 63060 }, { "entropy": 0.6677318990230561, "epoch": 0.50456, "grad_norm": 3.1048898696899414, "learning_rate": 2.478231292517007e-05, "loss": 0.6576, "mean_token_accuracy": 0.8063987791538239, "num_tokens": 654244905.0, "step": 63070 }, { "entropy": 0.7072586596012116, "epoch": 0.50464, "grad_norm": 1.4625798463821411, "learning_rate": 2.477831132452981e-05, "loss": 0.7107, "mean_token_accuracy": 0.7999001145362854, "num_tokens": 654338398.0, "step": 63080 }, { "entropy": 0.7318182766437531, "epoch": 0.50472, "grad_norm": 1.7522121667861938, "learning_rate": 2.4774309723889558e-05, "loss": 0.7211, "mean_token_accuracy": 0.7833924651145935, "num_tokens": 654484257.0, "step": 63090 }, { "entropy": 0.6779126971960068, "epoch": 0.5048, "grad_norm": 4.5828752517700195, "learning_rate": 2.47703081232493e-05, "loss": 0.6744, "mean_token_accuracy": 0.8198012471199035, "num_tokens": 654526780.0, "step": 63100 }, { "entropy": 0.6205568552017212, "epoch": 0.50488, "grad_norm": 1.2870427370071411, "learning_rate": 2.4766306522609046e-05, "loss": 0.623, "mean_token_accuracy": 0.8023632228374481, "num_tokens": 654690620.0, "step": 63110 }, { "entropy": 0.6263888895511627, "epoch": 0.50496, "grad_norm": 2.7980334758758545, "learning_rate": 2.4762304921968786e-05, "loss": 0.6143, "mean_token_accuracy": 0.8234669446945191, "num_tokens": 654777733.0, "step": 63120 }, { "entropy": 0.7275731146335602, "epoch": 0.50504, "grad_norm": 2.2439260482788086, "learning_rate": 2.4758303321328533e-05, "loss": 0.7373, "mean_token_accuracy": 0.7929703593254089, "num_tokens": 654872952.0, "step": 63130 }, { "entropy": 0.7335789263248443, "epoch": 0.50512, "grad_norm": 3.4610257148742676, "learning_rate": 2.4754301720688277e-05, "loss": 0.7259, "mean_token_accuracy": 0.7840072512626648, "num_tokens": 655012581.0, "step": 63140 }, { "entropy": 0.6031538963317871, "epoch": 0.5052, "grad_norm": 4.3299479484558105, "learning_rate": 2.475030012004802e-05, "loss": 0.585, "mean_token_accuracy": 0.8367542684078216, "num_tokens": 655053765.0, "step": 63150 }, { "entropy": 0.667427408695221, "epoch": 0.50528, "grad_norm": 1.3422845602035522, "learning_rate": 2.4746298519407764e-05, "loss": 0.6694, "mean_token_accuracy": 0.7879763066768646, "num_tokens": 655217605.0, "step": 63160 }, { "entropy": 0.6666418313980103, "epoch": 0.50536, "grad_norm": 2.9940907955169678, "learning_rate": 2.4742296918767508e-05, "loss": 0.6657, "mean_token_accuracy": 0.8147114813327789, "num_tokens": 655299730.0, "step": 63170 }, { "entropy": 0.6984791338443757, "epoch": 0.50544, "grad_norm": 2.0644688606262207, "learning_rate": 2.473829531812725e-05, "loss": 0.6978, "mean_token_accuracy": 0.7972503900527954, "num_tokens": 655395143.0, "step": 63180 }, { "entropy": 0.6978683173656464, "epoch": 0.50552, "grad_norm": 3.053504228591919, "learning_rate": 2.4734293717486995e-05, "loss": 0.693, "mean_token_accuracy": 0.7872032344341278, "num_tokens": 655535581.0, "step": 63190 }, { "entropy": 0.640071177482605, "epoch": 0.5056, "grad_norm": 7.107872009277344, "learning_rate": 2.473029211684674e-05, "loss": 0.6435, "mean_token_accuracy": 0.8213276743888855, "num_tokens": 655580935.0, "step": 63200 }, { "entropy": 0.6490309596061706, "epoch": 0.50568, "grad_norm": 2.172421932220459, "learning_rate": 2.4726290516206486e-05, "loss": 0.6461, "mean_token_accuracy": 0.7982027292251587, "num_tokens": 655744311.0, "step": 63210 }, { "entropy": 0.7120152771472931, "epoch": 0.50576, "grad_norm": 2.816234827041626, "learning_rate": 2.4722288915566226e-05, "loss": 0.695, "mean_token_accuracy": 0.8052793323993683, "num_tokens": 655823285.0, "step": 63220 }, { "entropy": 0.7426279544830322, "epoch": 0.50584, "grad_norm": 2.1629345417022705, "learning_rate": 2.4718287314925974e-05, "loss": 0.7325, "mean_token_accuracy": 0.7943939447402955, "num_tokens": 655917699.0, "step": 63230 }, { "entropy": 0.6436823904514313, "epoch": 0.50592, "grad_norm": 2.4147636890411377, "learning_rate": 2.4714285714285714e-05, "loss": 0.6383, "mean_token_accuracy": 0.8026635646820068, "num_tokens": 656061849.0, "step": 63240 }, { "entropy": 0.5794288992881775, "epoch": 0.506, "grad_norm": 4.420234680175781, "learning_rate": 2.471028411364546e-05, "loss": 0.5681, "mean_token_accuracy": 0.8375838160514831, "num_tokens": 656105629.0, "step": 63250 }, { "entropy": 0.6189308047294617, "epoch": 0.50608, "grad_norm": 1.5924991369247437, "learning_rate": 2.47062825130052e-05, "loss": 0.6157, "mean_token_accuracy": 0.8057217836380005, "num_tokens": 656269469.0, "step": 63260 }, { "entropy": 0.6239898890256882, "epoch": 0.50616, "grad_norm": 3.3591275215148926, "learning_rate": 2.470228091236495e-05, "loss": 0.6215, "mean_token_accuracy": 0.8200311243534089, "num_tokens": 656352401.0, "step": 63270 }, { "entropy": 0.6794660985469818, "epoch": 0.50624, "grad_norm": 1.7854382991790771, "learning_rate": 2.469827931172469e-05, "loss": 0.6879, "mean_token_accuracy": 0.8063483834266663, "num_tokens": 656447227.0, "step": 63280 }, { "entropy": 0.6843123733997345, "epoch": 0.50632, "grad_norm": 2.21744704246521, "learning_rate": 2.4694277711084436e-05, "loss": 0.6698, "mean_token_accuracy": 0.7950030446052552, "num_tokens": 656585632.0, "step": 63290 }, { "entropy": 0.6502258598804473, "epoch": 0.5064, "grad_norm": 5.052165985107422, "learning_rate": 2.469027611044418e-05, "loss": 0.6371, "mean_token_accuracy": 0.8302311420440673, "num_tokens": 656621474.0, "step": 63300 }, { "entropy": 0.6015535295009613, "epoch": 0.50648, "grad_norm": 1.7619309425354004, "learning_rate": 2.4686274509803923e-05, "loss": 0.6055, "mean_token_accuracy": 0.8111452221870422, "num_tokens": 656781519.0, "step": 63310 }, { "entropy": 0.6004818499088287, "epoch": 0.50656, "grad_norm": 2.6183526515960693, "learning_rate": 2.4682272909163667e-05, "loss": 0.5962, "mean_token_accuracy": 0.8253316938877105, "num_tokens": 656851271.0, "step": 63320 }, { "entropy": 0.7032783806324006, "epoch": 0.50664, "grad_norm": 1.5739991664886475, "learning_rate": 2.467827130852341e-05, "loss": 0.7021, "mean_token_accuracy": 0.7997353196144104, "num_tokens": 656944452.0, "step": 63330 }, { "entropy": 0.69909707903862, "epoch": 0.50672, "grad_norm": 2.5370426177978516, "learning_rate": 2.4674269707883154e-05, "loss": 0.6985, "mean_token_accuracy": 0.7872322380542756, "num_tokens": 657084491.0, "step": 63340 }, { "entropy": 0.7234712779521942, "epoch": 0.5068, "grad_norm": 3.8588171005249023, "learning_rate": 2.4670268107242898e-05, "loss": 0.7138, "mean_token_accuracy": 0.8098562598228455, "num_tokens": 657121759.0, "step": 63350 }, { "entropy": 0.6071831166744233, "epoch": 0.50688, "grad_norm": 1.9239401817321777, "learning_rate": 2.4666266506602642e-05, "loss": 0.602, "mean_token_accuracy": 0.8076333343982697, "num_tokens": 657284880.0, "step": 63360 }, { "entropy": 0.6579135358333588, "epoch": 0.50696, "grad_norm": 3.609218120574951, "learning_rate": 2.466226490596239e-05, "loss": 0.6405, "mean_token_accuracy": 0.8157314836978913, "num_tokens": 657364621.0, "step": 63370 }, { "entropy": 0.6455841362476349, "epoch": 0.50704, "grad_norm": 2.729607343673706, "learning_rate": 2.465826330532213e-05, "loss": 0.6403, "mean_token_accuracy": 0.8145015895366668, "num_tokens": 657456723.0, "step": 63380 }, { "entropy": 0.6841565728187561, "epoch": 0.50712, "grad_norm": 3.2447197437286377, "learning_rate": 2.4654261704681876e-05, "loss": 0.6837, "mean_token_accuracy": 0.7929744780063629, "num_tokens": 657586998.0, "step": 63390 }, { "entropy": 0.7230515241622925, "epoch": 0.5072, "grad_norm": 6.015252113342285, "learning_rate": 2.4650260104041617e-05, "loss": 0.7142, "mean_token_accuracy": 0.807322371006012, "num_tokens": 657624446.0, "step": 63400 }, { "entropy": 0.6469758093357086, "epoch": 0.50728, "grad_norm": 2.453376293182373, "learning_rate": 2.4646258503401364e-05, "loss": 0.6426, "mean_token_accuracy": 0.7965154886245728, "num_tokens": 657788262.0, "step": 63410 }, { "entropy": 0.7155092537403107, "epoch": 0.50736, "grad_norm": 4.8023905754089355, "learning_rate": 2.4642256902761104e-05, "loss": 0.709, "mean_token_accuracy": 0.7969614565372467, "num_tokens": 657870429.0, "step": 63420 }, { "entropy": 0.7511203825473786, "epoch": 0.50744, "grad_norm": 2.3235905170440674, "learning_rate": 2.463825530212085e-05, "loss": 0.7406, "mean_token_accuracy": 0.7939981639385223, "num_tokens": 657964665.0, "step": 63430 }, { "entropy": 0.6431330144405365, "epoch": 0.50752, "grad_norm": 3.24204683303833, "learning_rate": 2.4634253701480595e-05, "loss": 0.6487, "mean_token_accuracy": 0.798414146900177, "num_tokens": 658106732.0, "step": 63440 }, { "entropy": 0.6746729999780655, "epoch": 0.5076, "grad_norm": 4.561697483062744, "learning_rate": 2.463025210084034e-05, "loss": 0.66, "mean_token_accuracy": 0.8253861546516419, "num_tokens": 658149717.0, "step": 63450 }, { "entropy": 0.6591333627700806, "epoch": 0.50768, "grad_norm": 2.559284210205078, "learning_rate": 2.4626250500200082e-05, "loss": 0.6587, "mean_token_accuracy": 0.7926539599895477, "num_tokens": 658313511.0, "step": 63460 }, { "entropy": 0.6193243443965912, "epoch": 0.50776, "grad_norm": 5.160760402679443, "learning_rate": 2.4622248899559826e-05, "loss": 0.6077, "mean_token_accuracy": 0.8210169196128845, "num_tokens": 658396135.0, "step": 63470 }, { "entropy": 0.7018122792243957, "epoch": 0.50784, "grad_norm": 1.9659196138381958, "learning_rate": 2.461824729891957e-05, "loss": 0.7062, "mean_token_accuracy": 0.7947976469993592, "num_tokens": 658490983.0, "step": 63480 }, { "entropy": 0.6967430472373962, "epoch": 0.50792, "grad_norm": 2.1364080905914307, "learning_rate": 2.4614245698279313e-05, "loss": 0.6875, "mean_token_accuracy": 0.7921563684940338, "num_tokens": 658632036.0, "step": 63490 }, { "entropy": 0.667278790473938, "epoch": 0.508, "grad_norm": 5.273202419281006, "learning_rate": 2.4610244097639057e-05, "loss": 0.6618, "mean_token_accuracy": 0.8243456840515136, "num_tokens": 658674066.0, "step": 63500 }, { "entropy": 0.6537495493888855, "epoch": 0.50808, "grad_norm": 2.2238080501556396, "learning_rate": 2.46062424969988e-05, "loss": 0.6544, "mean_token_accuracy": 0.7948376893997192, "num_tokens": 658837881.0, "step": 63510 }, { "entropy": 0.7921627551317215, "epoch": 0.50816, "grad_norm": 3.8640859127044678, "learning_rate": 2.4602240896358545e-05, "loss": 0.7902, "mean_token_accuracy": 0.7845274448394776, "num_tokens": 658920788.0, "step": 63520 }, { "entropy": 0.7010055184364319, "epoch": 0.50824, "grad_norm": 1.557446837425232, "learning_rate": 2.459823929571829e-05, "loss": 0.6962, "mean_token_accuracy": 0.800191468000412, "num_tokens": 659015430.0, "step": 63530 }, { "entropy": 0.6877174019813538, "epoch": 0.50832, "grad_norm": 2.395305633544922, "learning_rate": 2.4594237695078032e-05, "loss": 0.6759, "mean_token_accuracy": 0.7936456322669982, "num_tokens": 659157508.0, "step": 63540 }, { "entropy": 0.7240788280963898, "epoch": 0.5084, "grad_norm": 4.724473476409912, "learning_rate": 2.4590236094437776e-05, "loss": 0.7173, "mean_token_accuracy": 0.8026040196418762, "num_tokens": 659199357.0, "step": 63550 }, { "entropy": 0.6925354063510895, "epoch": 0.50848, "grad_norm": 1.908769965171814, "learning_rate": 2.458623449379752e-05, "loss": 0.6931, "mean_token_accuracy": 0.7862237453460693, "num_tokens": 659363197.0, "step": 63560 }, { "entropy": 0.7312471330165863, "epoch": 0.50856, "grad_norm": 3.627264976501465, "learning_rate": 2.4582232893157263e-05, "loss": 0.7224, "mean_token_accuracy": 0.7913082480430603, "num_tokens": 659454110.0, "step": 63570 }, { "entropy": 0.7384361505508423, "epoch": 0.50864, "grad_norm": 2.147883415222168, "learning_rate": 2.4578231292517007e-05, "loss": 0.74, "mean_token_accuracy": 0.7891759276390076, "num_tokens": 659547168.0, "step": 63580 }, { "entropy": 0.7095988929271698, "epoch": 0.50872, "grad_norm": 2.5082881450653076, "learning_rate": 2.457422969187675e-05, "loss": 0.7007, "mean_token_accuracy": 0.7905725538730621, "num_tokens": 659678366.0, "step": 63590 }, { "entropy": 0.7126847624778747, "epoch": 0.5088, "grad_norm": 5.040182113647461, "learning_rate": 2.4570228091236498e-05, "loss": 0.7116, "mean_token_accuracy": 0.8074079990386963, "num_tokens": 659713873.0, "step": 63600 }, { "entropy": 0.64402876496315, "epoch": 0.50888, "grad_norm": 1.687558650970459, "learning_rate": 2.4566226490596238e-05, "loss": 0.6397, "mean_token_accuracy": 0.7991802632808686, "num_tokens": 659871706.0, "step": 63610 }, { "entropy": 0.5551268458366394, "epoch": 0.50896, "grad_norm": 3.8189473152160645, "learning_rate": 2.4562224889955985e-05, "loss": 0.543, "mean_token_accuracy": 0.8401057839393615, "num_tokens": 659938633.0, "step": 63620 }, { "entropy": 0.6973935425281524, "epoch": 0.50904, "grad_norm": 1.4914908409118652, "learning_rate": 2.4558223289315725e-05, "loss": 0.6911, "mean_token_accuracy": 0.7983290910720825, "num_tokens": 660030800.0, "step": 63630 }, { "entropy": 0.6800607323646546, "epoch": 0.50912, "grad_norm": 2.5541698932647705, "learning_rate": 2.4554221688675473e-05, "loss": 0.6724, "mean_token_accuracy": 0.8001962959766388, "num_tokens": 660154487.0, "step": 63640 }, { "entropy": 0.6722219258546829, "epoch": 0.5092, "grad_norm": 4.561405658721924, "learning_rate": 2.4550220088035213e-05, "loss": 0.6704, "mean_token_accuracy": 0.8195937991142273, "num_tokens": 660190115.0, "step": 63650 }, { "entropy": 0.6096426367759704, "epoch": 0.50928, "grad_norm": 2.143825054168701, "learning_rate": 2.454621848739496e-05, "loss": 0.6061, "mean_token_accuracy": 0.8055691301822663, "num_tokens": 660353955.0, "step": 63660 }, { "entropy": 0.7057668268680573, "epoch": 0.50936, "grad_norm": 3.6405351161956787, "learning_rate": 2.4542216886754704e-05, "loss": 0.7067, "mean_token_accuracy": 0.8026822090148926, "num_tokens": 660446230.0, "step": 63670 }, { "entropy": 0.7246671915054321, "epoch": 0.50944, "grad_norm": 2.3057162761688232, "learning_rate": 2.4538215286114447e-05, "loss": 0.706, "mean_token_accuracy": 0.7984765589237213, "num_tokens": 660540163.0, "step": 63680 }, { "entropy": 0.6332258641719818, "epoch": 0.50952, "grad_norm": 2.035970687866211, "learning_rate": 2.453421368547419e-05, "loss": 0.6259, "mean_token_accuracy": 0.8060246467590332, "num_tokens": 660693710.0, "step": 63690 }, { "entropy": 0.6498572826385498, "epoch": 0.5096, "grad_norm": 5.208099365234375, "learning_rate": 2.4530212084833935e-05, "loss": 0.6481, "mean_token_accuracy": 0.8220962285995483, "num_tokens": 660734591.0, "step": 63700 }, { "entropy": 0.6184284627437592, "epoch": 0.50968, "grad_norm": 3.501396417617798, "learning_rate": 2.452621048419368e-05, "loss": 0.6189, "mean_token_accuracy": 0.8009342968463897, "num_tokens": 660898431.0, "step": 63710 }, { "entropy": 0.6910972356796264, "epoch": 0.50976, "grad_norm": 3.1387226581573486, "learning_rate": 2.4522208883553422e-05, "loss": 0.6833, "mean_token_accuracy": 0.8038836658000946, "num_tokens": 660985184.0, "step": 63720 }, { "entropy": 0.7377801656723022, "epoch": 0.50984, "grad_norm": 1.4962295293807983, "learning_rate": 2.4518207282913166e-05, "loss": 0.7365, "mean_token_accuracy": 0.7937036037445069, "num_tokens": 661080807.0, "step": 63730 }, { "entropy": 0.7230903506278992, "epoch": 0.50992, "grad_norm": 2.4847724437713623, "learning_rate": 2.4514205682272913e-05, "loss": 0.7165, "mean_token_accuracy": 0.7864681780338287, "num_tokens": 661211494.0, "step": 63740 }, { "entropy": 0.704805463552475, "epoch": 0.51, "grad_norm": 5.038666248321533, "learning_rate": 2.4510204081632653e-05, "loss": 0.6964, "mean_token_accuracy": 0.8148557424545289, "num_tokens": 661244140.0, "step": 63750 }, { "entropy": 0.629138171672821, "epoch": 0.51008, "grad_norm": 2.12693190574646, "learning_rate": 2.45062024809924e-05, "loss": 0.6309, "mean_token_accuracy": 0.8004737854003906, "num_tokens": 661407780.0, "step": 63760 }, { "entropy": 0.7379389524459838, "epoch": 0.51016, "grad_norm": 3.8226006031036377, "learning_rate": 2.450220088035214e-05, "loss": 0.735, "mean_token_accuracy": 0.7960776507854461, "num_tokens": 661482838.0, "step": 63770 }, { "entropy": 0.7071563482284546, "epoch": 0.51024, "grad_norm": 2.2427175045013428, "learning_rate": 2.4498199279711888e-05, "loss": 0.6973, "mean_token_accuracy": 0.796394145488739, "num_tokens": 661574182.0, "step": 63780 }, { "entropy": 0.6997338771820069, "epoch": 0.51032, "grad_norm": 2.5302786827087402, "learning_rate": 2.4494197679071628e-05, "loss": 0.6933, "mean_token_accuracy": 0.7860728859901428, "num_tokens": 661723499.0, "step": 63790 }, { "entropy": 0.614730441570282, "epoch": 0.5104, "grad_norm": 6.583508014678955, "learning_rate": 2.4490196078431375e-05, "loss": 0.6155, "mean_token_accuracy": 0.8312779545783997, "num_tokens": 661764592.0, "step": 63800 }, { "entropy": 0.6727806687355041, "epoch": 0.51048, "grad_norm": 1.3844494819641113, "learning_rate": 2.448619447779112e-05, "loss": 0.6673, "mean_token_accuracy": 0.7920676648616791, "num_tokens": 661928432.0, "step": 63810 }, { "entropy": 0.6234866946935653, "epoch": 0.51056, "grad_norm": 3.633371114730835, "learning_rate": 2.4482192877150863e-05, "loss": 0.6215, "mean_token_accuracy": 0.8140923082828522, "num_tokens": 662016208.0, "step": 63820 }, { "entropy": 0.6995743215084076, "epoch": 0.51064, "grad_norm": 2.168480396270752, "learning_rate": 2.4478191276510607e-05, "loss": 0.7063, "mean_token_accuracy": 0.8006924510002136, "num_tokens": 662109255.0, "step": 63830 }, { "entropy": 0.6896859228610992, "epoch": 0.51072, "grad_norm": 2.8612496852874756, "learning_rate": 2.447418967587035e-05, "loss": 0.6772, "mean_token_accuracy": 0.7941762030124664, "num_tokens": 662240477.0, "step": 63840 }, { "entropy": 0.614093816280365, "epoch": 0.5108, "grad_norm": 4.035105228424072, "learning_rate": 2.4470188075230094e-05, "loss": 0.6155, "mean_token_accuracy": 0.8282984554767608, "num_tokens": 662277862.0, "step": 63850 }, { "entropy": 0.632041084766388, "epoch": 0.51088, "grad_norm": 2.0764200687408447, "learning_rate": 2.4466186474589838e-05, "loss": 0.6281, "mean_token_accuracy": 0.7991195619106293, "num_tokens": 662441682.0, "step": 63860 }, { "entropy": 0.6315923392772674, "epoch": 0.51096, "grad_norm": 2.8248019218444824, "learning_rate": 2.446218487394958e-05, "loss": 0.6268, "mean_token_accuracy": 0.8201326429843903, "num_tokens": 662523529.0, "step": 63870 }, { "entropy": 0.7514929831027984, "epoch": 0.51104, "grad_norm": 1.9566717147827148, "learning_rate": 2.4458183273309325e-05, "loss": 0.7512, "mean_token_accuracy": 0.7884994506835937, "num_tokens": 662618601.0, "step": 63880 }, { "entropy": 0.6856038093566894, "epoch": 0.51112, "grad_norm": 2.2796599864959717, "learning_rate": 2.445418167266907e-05, "loss": 0.6783, "mean_token_accuracy": 0.7899438261985778, "num_tokens": 662772210.0, "step": 63890 }, { "entropy": 0.6737073123455047, "epoch": 0.5112, "grad_norm": 4.3008317947387695, "learning_rate": 2.4450180072028813e-05, "loss": 0.673, "mean_token_accuracy": 0.818065345287323, "num_tokens": 662815654.0, "step": 63900 }, { "entropy": 0.6231060981750488, "epoch": 0.51128, "grad_norm": 2.0362985134124756, "learning_rate": 2.4446178471388556e-05, "loss": 0.6241, "mean_token_accuracy": 0.8011846661567688, "num_tokens": 662979494.0, "step": 63910 }, { "entropy": 0.7197643935680389, "epoch": 0.51136, "grad_norm": 3.057988405227661, "learning_rate": 2.44421768707483e-05, "loss": 0.7197, "mean_token_accuracy": 0.7959057748317718, "num_tokens": 663068648.0, "step": 63920 }, { "entropy": 0.6760698735713959, "epoch": 0.51144, "grad_norm": 1.5496826171875, "learning_rate": 2.4438175270108044e-05, "loss": 0.6723, "mean_token_accuracy": 0.8046635270118714, "num_tokens": 663163113.0, "step": 63930 }, { "entropy": 0.6508675873279571, "epoch": 0.51152, "grad_norm": 3.3834452629089355, "learning_rate": 2.4434173669467787e-05, "loss": 0.6431, "mean_token_accuracy": 0.8043905615806579, "num_tokens": 663303333.0, "step": 63940 }, { "entropy": 0.6421311855316162, "epoch": 0.5116, "grad_norm": 4.852074146270752, "learning_rate": 2.443017206882753e-05, "loss": 0.6354, "mean_token_accuracy": 0.8299787700176239, "num_tokens": 663341201.0, "step": 63950 }, { "entropy": 0.6650594413280487, "epoch": 0.51168, "grad_norm": 2.185488224029541, "learning_rate": 2.4426170468187275e-05, "loss": 0.6631, "mean_token_accuracy": 0.7909623920917511, "num_tokens": 663505041.0, "step": 63960 }, { "entropy": 0.5863789051771164, "epoch": 0.51176, "grad_norm": 4.442680835723877, "learning_rate": 2.4422168867547022e-05, "loss": 0.5748, "mean_token_accuracy": 0.8296134412288666, "num_tokens": 663587049.0, "step": 63970 }, { "entropy": 0.6518589675426483, "epoch": 0.51184, "grad_norm": 1.3862022161483765, "learning_rate": 2.4418167266906762e-05, "loss": 0.6376, "mean_token_accuracy": 0.8115325331687927, "num_tokens": 663680673.0, "step": 63980 }, { "entropy": 0.7019701719284057, "epoch": 0.51192, "grad_norm": 2.1357836723327637, "learning_rate": 2.441416566626651e-05, "loss": 0.6983, "mean_token_accuracy": 0.7869156122207641, "num_tokens": 663824344.0, "step": 63990 }, { "entropy": 0.6449472188949585, "epoch": 0.512, "grad_norm": 4.60781192779541, "learning_rate": 2.441016406562625e-05, "loss": 0.6346, "mean_token_accuracy": 0.8274320006370545, "num_tokens": 663866018.0, "step": 64000 }, { "entropy": 0.6099534273147583, "epoch": 0.51208, "grad_norm": 1.6379880905151367, "learning_rate": 2.4406162464985997e-05, "loss": 0.6075, "mean_token_accuracy": 0.80266854763031, "num_tokens": 664029858.0, "step": 64010 }, { "entropy": 0.6476939946413041, "epoch": 0.51216, "grad_norm": 3.3231709003448486, "learning_rate": 2.4402160864345737e-05, "loss": 0.642, "mean_token_accuracy": 0.8102090120315552, "num_tokens": 664116657.0, "step": 64020 }, { "entropy": 0.6970906913280487, "epoch": 0.51224, "grad_norm": 2.149637460708618, "learning_rate": 2.4398159263705484e-05, "loss": 0.696, "mean_token_accuracy": 0.8030352830886841, "num_tokens": 664209330.0, "step": 64030 }, { "entropy": 0.698067843914032, "epoch": 0.51232, "grad_norm": 3.1830732822418213, "learning_rate": 2.4394157663065228e-05, "loss": 0.6903, "mean_token_accuracy": 0.7869598031044006, "num_tokens": 664357678.0, "step": 64040 }, { "entropy": 0.6038524806499481, "epoch": 0.5124, "grad_norm": 5.068709373474121, "learning_rate": 2.439015606242497e-05, "loss": 0.5972, "mean_token_accuracy": 0.832920390367508, "num_tokens": 664406374.0, "step": 64050 }, { "entropy": 0.6290126860141754, "epoch": 0.51248, "grad_norm": 2.341055393218994, "learning_rate": 2.4386154461784715e-05, "loss": 0.6286, "mean_token_accuracy": 0.7991450905799866, "num_tokens": 664570214.0, "step": 64060 }, { "entropy": 0.629901447892189, "epoch": 0.51256, "grad_norm": 3.1578450202941895, "learning_rate": 2.438215286114446e-05, "loss": 0.6271, "mean_token_accuracy": 0.8156846940517426, "num_tokens": 664660797.0, "step": 64070 }, { "entropy": 0.6682724595069885, "epoch": 0.51264, "grad_norm": 2.9526312351226807, "learning_rate": 2.4378151260504203e-05, "loss": 0.6833, "mean_token_accuracy": 0.805251395702362, "num_tokens": 664754320.0, "step": 64080 }, { "entropy": 0.7112736105918884, "epoch": 0.51272, "grad_norm": 3.5886335372924805, "learning_rate": 2.4374149659863946e-05, "loss": 0.706, "mean_token_accuracy": 0.7870584070682526, "num_tokens": 664886896.0, "step": 64090 }, { "entropy": 0.6503311812877655, "epoch": 0.5128, "grad_norm": 5.22286319732666, "learning_rate": 2.437014805922369e-05, "loss": 0.647, "mean_token_accuracy": 0.8322580099105835, "num_tokens": 664925513.0, "step": 64100 }, { "entropy": 0.6446394264698029, "epoch": 0.51288, "grad_norm": 1.6564404964447021, "learning_rate": 2.4366146458583437e-05, "loss": 0.6467, "mean_token_accuracy": 0.7948400199413299, "num_tokens": 665089353.0, "step": 64110 }, { "entropy": 0.5885488808155059, "epoch": 0.51296, "grad_norm": 3.2175180912017822, "learning_rate": 2.4362144857943178e-05, "loss": 0.5687, "mean_token_accuracy": 0.8284402251243591, "num_tokens": 665175023.0, "step": 64120 }, { "entropy": 0.6516918361186981, "epoch": 0.51304, "grad_norm": 2.6542868614196777, "learning_rate": 2.4358143257302925e-05, "loss": 0.6491, "mean_token_accuracy": 0.8096800923347474, "num_tokens": 665268323.0, "step": 64130 }, { "entropy": 0.6724983334541321, "epoch": 0.51312, "grad_norm": 2.078165054321289, "learning_rate": 2.4354141656662665e-05, "loss": 0.6626, "mean_token_accuracy": 0.797054386138916, "num_tokens": 665400992.0, "step": 64140 }, { "entropy": 0.6883745282888413, "epoch": 0.5132, "grad_norm": 5.3936591148376465, "learning_rate": 2.4350140056022412e-05, "loss": 0.6733, "mean_token_accuracy": 0.8194060325622559, "num_tokens": 665438897.0, "step": 64150 }, { "entropy": 0.6800861418247223, "epoch": 0.51328, "grad_norm": 1.9321166276931763, "learning_rate": 2.4346138455382152e-05, "loss": 0.6785, "mean_token_accuracy": 0.787744265794754, "num_tokens": 665602737.0, "step": 64160 }, { "entropy": 0.7327366381883621, "epoch": 0.51336, "grad_norm": 3.8035521507263184, "learning_rate": 2.43421368547419e-05, "loss": 0.7277, "mean_token_accuracy": 0.7991726875305176, "num_tokens": 665690221.0, "step": 64170 }, { "entropy": 0.6407071411609649, "epoch": 0.51344, "grad_norm": 1.7732765674591064, "learning_rate": 2.433813525410164e-05, "loss": 0.6439, "mean_token_accuracy": 0.8116746723651886, "num_tokens": 665783718.0, "step": 64180 }, { "entropy": 0.7086034297943116, "epoch": 0.51352, "grad_norm": 2.9153811931610107, "learning_rate": 2.4334133653461387e-05, "loss": 0.7, "mean_token_accuracy": 0.7872468173503876, "num_tokens": 665918150.0, "step": 64190 }, { "entropy": 0.635346257686615, "epoch": 0.5136, "grad_norm": 5.781280517578125, "learning_rate": 2.433013205282113e-05, "loss": 0.6319, "mean_token_accuracy": 0.8281588315963745, "num_tokens": 665956735.0, "step": 64200 }, { "entropy": 0.6466519117355347, "epoch": 0.51368, "grad_norm": 2.5407028198242188, "learning_rate": 2.4326130452180874e-05, "loss": 0.6448, "mean_token_accuracy": 0.7990657091140747, "num_tokens": 666120575.0, "step": 64210 }, { "entropy": 0.6830021917819977, "epoch": 0.51376, "grad_norm": 3.1470820903778076, "learning_rate": 2.4322128851540618e-05, "loss": 0.6699, "mean_token_accuracy": 0.8100481390953064, "num_tokens": 666205859.0, "step": 64220 }, { "entropy": 0.6933509707450867, "epoch": 0.51384, "grad_norm": 2.517439365386963, "learning_rate": 2.4318127250900362e-05, "loss": 0.6885, "mean_token_accuracy": 0.8060845136642456, "num_tokens": 666299525.0, "step": 64230 }, { "entropy": 0.6680233508348465, "epoch": 0.51392, "grad_norm": 2.35642147064209, "learning_rate": 2.4314125650260106e-05, "loss": 0.6646, "mean_token_accuracy": 0.7980986952781677, "num_tokens": 666421903.0, "step": 64240 }, { "entropy": 0.6729497194290162, "epoch": 0.514, "grad_norm": 5.51289176940918, "learning_rate": 2.431012404961985e-05, "loss": 0.6707, "mean_token_accuracy": 0.822916442155838, "num_tokens": 666455352.0, "step": 64250 }, { "entropy": 0.6528966069221497, "epoch": 0.51408, "grad_norm": 2.558259963989258, "learning_rate": 2.4306122448979593e-05, "loss": 0.6525, "mean_token_accuracy": 0.792678314447403, "num_tokens": 666619192.0, "step": 64260 }, { "entropy": 0.6711708962917328, "epoch": 0.51416, "grad_norm": 3.845756769180298, "learning_rate": 2.4302120848339337e-05, "loss": 0.6721, "mean_token_accuracy": 0.8074910581111908, "num_tokens": 666697982.0, "step": 64270 }, { "entropy": 0.6893834233283996, "epoch": 0.51424, "grad_norm": 1.386942982673645, "learning_rate": 2.429811924769908e-05, "loss": 0.6746, "mean_token_accuracy": 0.8044866383075714, "num_tokens": 666790951.0, "step": 64280 }, { "entropy": 0.6983682513237, "epoch": 0.51432, "grad_norm": 2.580915689468384, "learning_rate": 2.4294117647058824e-05, "loss": 0.6947, "mean_token_accuracy": 0.785330843925476, "num_tokens": 666937951.0, "step": 64290 }, { "entropy": 0.7439997285604477, "epoch": 0.5144, "grad_norm": 5.193769454956055, "learning_rate": 2.4290116046418568e-05, "loss": 0.7333, "mean_token_accuracy": 0.8027320027351379, "num_tokens": 666985772.0, "step": 64300 }, { "entropy": 0.6426544725894928, "epoch": 0.51448, "grad_norm": 1.3314367532730103, "learning_rate": 2.428611444577831e-05, "loss": 0.6438, "mean_token_accuracy": 0.7932401120662689, "num_tokens": 667149612.0, "step": 64310 }, { "entropy": 0.6576062262058258, "epoch": 0.51456, "grad_norm": 3.25557541847229, "learning_rate": 2.4282112845138055e-05, "loss": 0.6386, "mean_token_accuracy": 0.8145770788192749, "num_tokens": 667233278.0, "step": 64320 }, { "entropy": 0.7070986568927765, "epoch": 0.51464, "grad_norm": 2.115678071975708, "learning_rate": 2.42781112444978e-05, "loss": 0.7126, "mean_token_accuracy": 0.7978430032730103, "num_tokens": 667325279.0, "step": 64330 }, { "entropy": 0.6625198245048523, "epoch": 0.51472, "grad_norm": 3.1365108489990234, "learning_rate": 2.4274109643857546e-05, "loss": 0.6585, "mean_token_accuracy": 0.8000525414943696, "num_tokens": 667446849.0, "step": 64340 }, { "entropy": 0.7544160127639771, "epoch": 0.5148, "grad_norm": 4.86362361907959, "learning_rate": 2.4270108043217286e-05, "loss": 0.7538, "mean_token_accuracy": 0.8065437793731689, "num_tokens": 667478258.0, "step": 64350 }, { "entropy": 0.6639073967933655, "epoch": 0.51488, "grad_norm": 1.4666510820388794, "learning_rate": 2.4266106442577033e-05, "loss": 0.6551, "mean_token_accuracy": 0.797557407617569, "num_tokens": 667642098.0, "step": 64360 }, { "entropy": 0.6369635432958602, "epoch": 0.51496, "grad_norm": 3.262282609939575, "learning_rate": 2.4262104841936774e-05, "loss": 0.632, "mean_token_accuracy": 0.8169142723083496, "num_tokens": 667736838.0, "step": 64370 }, { "entropy": 0.7004769027233124, "epoch": 0.51504, "grad_norm": 1.7981175184249878, "learning_rate": 2.425810324129652e-05, "loss": 0.6967, "mean_token_accuracy": 0.802134382724762, "num_tokens": 667831926.0, "step": 64380 }, { "entropy": 0.7270337045192719, "epoch": 0.51512, "grad_norm": 2.562774419784546, "learning_rate": 2.425410164065626e-05, "loss": 0.7156, "mean_token_accuracy": 0.792282509803772, "num_tokens": 667956310.0, "step": 64390 }, { "entropy": 0.6850905656814575, "epoch": 0.5152, "grad_norm": 4.788293361663818, "learning_rate": 2.425010004001601e-05, "loss": 0.6827, "mean_token_accuracy": 0.8199685633182525, "num_tokens": 667993047.0, "step": 64400 }, { "entropy": 0.6520132541656494, "epoch": 0.51528, "grad_norm": 2.5786848068237305, "learning_rate": 2.424609843937575e-05, "loss": 0.6529, "mean_token_accuracy": 0.7945195496082306, "num_tokens": 668155925.0, "step": 64410 }, { "entropy": 0.6324536323547363, "epoch": 0.51536, "grad_norm": 3.1042938232421875, "learning_rate": 2.4242096838735496e-05, "loss": 0.6144, "mean_token_accuracy": 0.822059154510498, "num_tokens": 668233254.0, "step": 64420 }, { "entropy": 0.740658962726593, "epoch": 0.51544, "grad_norm": 1.3187530040740967, "learning_rate": 2.423809523809524e-05, "loss": 0.7537, "mean_token_accuracy": 0.7876644730567932, "num_tokens": 668327104.0, "step": 64430 }, { "entropy": 0.6982633173465729, "epoch": 0.51552, "grad_norm": 2.290046453475952, "learning_rate": 2.4234093637454983e-05, "loss": 0.6952, "mean_token_accuracy": 0.78746297955513, "num_tokens": 668468300.0, "step": 64440 }, { "entropy": 0.6161954820156097, "epoch": 0.5156, "grad_norm": 5.064590930938721, "learning_rate": 2.4230092036814727e-05, "loss": 0.6115, "mean_token_accuracy": 0.833160799741745, "num_tokens": 668508370.0, "step": 64450 }, { "entropy": 0.6766734838485717, "epoch": 0.51568, "grad_norm": 1.4371670484542847, "learning_rate": 2.422609043617447e-05, "loss": 0.6716, "mean_token_accuracy": 0.7881472945213318, "num_tokens": 668672210.0, "step": 64460 }, { "entropy": 0.5881156623363495, "epoch": 0.51576, "grad_norm": 3.081134796142578, "learning_rate": 2.4222088835534214e-05, "loss": 0.5812, "mean_token_accuracy": 0.8294372916221618, "num_tokens": 668754378.0, "step": 64470 }, { "entropy": 0.6534909904003143, "epoch": 0.51584, "grad_norm": 1.4062292575836182, "learning_rate": 2.4218087234893958e-05, "loss": 0.6668, "mean_token_accuracy": 0.8083953440189362, "num_tokens": 668848134.0, "step": 64480 }, { "entropy": 0.6686118841171265, "epoch": 0.51592, "grad_norm": 2.3363864421844482, "learning_rate": 2.4214085634253702e-05, "loss": 0.6584, "mean_token_accuracy": 0.7948773920536041, "num_tokens": 668999598.0, "step": 64490 }, { "entropy": 0.7131963729858398, "epoch": 0.516, "grad_norm": 4.835975170135498, "learning_rate": 2.421008403361345e-05, "loss": 0.7107, "mean_token_accuracy": 0.8111380219459534, "num_tokens": 669045932.0, "step": 64500 }, { "entropy": 0.600173419713974, "epoch": 0.51608, "grad_norm": 2.0032265186309814, "learning_rate": 2.420608243297319e-05, "loss": 0.5975, "mean_token_accuracy": 0.8075170993804932, "num_tokens": 669209772.0, "step": 64510 }, { "entropy": 0.713911771774292, "epoch": 0.51616, "grad_norm": 3.1270675659179688, "learning_rate": 2.4202080832332936e-05, "loss": 0.7126, "mean_token_accuracy": 0.7956171095371246, "num_tokens": 669312771.0, "step": 64520 }, { "entropy": 0.7061522722244262, "epoch": 0.51624, "grad_norm": 1.9491854906082153, "learning_rate": 2.4198079231692677e-05, "loss": 0.6927, "mean_token_accuracy": 0.8065496981143951, "num_tokens": 669406268.0, "step": 64530 }, { "entropy": 0.7058396756649017, "epoch": 0.51632, "grad_norm": 3.244415283203125, "learning_rate": 2.4194077631052424e-05, "loss": 0.7081, "mean_token_accuracy": 0.7858916401863099, "num_tokens": 669551333.0, "step": 64540 }, { "entropy": 0.6870602697134018, "epoch": 0.5164, "grad_norm": 5.024630546569824, "learning_rate": 2.4190076030412164e-05, "loss": 0.6739, "mean_token_accuracy": 0.817306923866272, "num_tokens": 669595380.0, "step": 64550 }, { "entropy": 0.6653835624456406, "epoch": 0.51648, "grad_norm": 1.7167714834213257, "learning_rate": 2.418607442977191e-05, "loss": 0.667, "mean_token_accuracy": 0.7936126112937927, "num_tokens": 669759220.0, "step": 64560 }, { "entropy": 0.6282320499420166, "epoch": 0.51656, "grad_norm": 3.6845815181732178, "learning_rate": 2.4182072829131655e-05, "loss": 0.6254, "mean_token_accuracy": 0.8196670949459076, "num_tokens": 669844190.0, "step": 64570 }, { "entropy": 0.7112458944320679, "epoch": 0.51664, "grad_norm": 2.4685606956481934, "learning_rate": 2.41780712284914e-05, "loss": 0.7013, "mean_token_accuracy": 0.7977445542812347, "num_tokens": 669936899.0, "step": 64580 }, { "entropy": 0.6808644533157349, "epoch": 0.51672, "grad_norm": 3.3402655124664307, "learning_rate": 2.4174069627851142e-05, "loss": 0.6811, "mean_token_accuracy": 0.7910045266151429, "num_tokens": 670083296.0, "step": 64590 }, { "entropy": 0.6626866400241852, "epoch": 0.5168, "grad_norm": 4.209677696228027, "learning_rate": 2.4170068027210886e-05, "loss": 0.662, "mean_token_accuracy": 0.8200192511081695, "num_tokens": 670125637.0, "step": 64600 }, { "entropy": 0.6124362945556641, "epoch": 0.51688, "grad_norm": 1.6998670101165771, "learning_rate": 2.416606642657063e-05, "loss": 0.6137, "mean_token_accuracy": 0.8039086043834687, "num_tokens": 670287295.0, "step": 64610 }, { "entropy": 0.5994830548763275, "epoch": 0.51696, "grad_norm": 2.663165330886841, "learning_rate": 2.4162064825930373e-05, "loss": 0.5796, "mean_token_accuracy": 0.8338227689266204, "num_tokens": 670352827.0, "step": 64620 }, { "entropy": 0.6695707440376282, "epoch": 0.51704, "grad_norm": 2.137995958328247, "learning_rate": 2.4158063225290117e-05, "loss": 0.6627, "mean_token_accuracy": 0.8071292519569397, "num_tokens": 670445019.0, "step": 64630 }, { "entropy": 0.6789388418197632, "epoch": 0.51712, "grad_norm": 2.9043445587158203, "learning_rate": 2.415406162464986e-05, "loss": 0.6818, "mean_token_accuracy": 0.7883882999420166, "num_tokens": 670601428.0, "step": 64640 }, { "entropy": 0.7129093408584595, "epoch": 0.5172, "grad_norm": 5.477908134460449, "learning_rate": 2.4150060024009605e-05, "loss": 0.6967, "mean_token_accuracy": 0.8140482127666473, "num_tokens": 670646943.0, "step": 64650 }, { "entropy": 0.6374875485897065, "epoch": 0.51728, "grad_norm": 1.9957083463668823, "learning_rate": 2.4146058423369348e-05, "loss": 0.6284, "mean_token_accuracy": 0.8046409368515015, "num_tokens": 670810783.0, "step": 64660 }, { "entropy": 0.6565580308437348, "epoch": 0.51736, "grad_norm": 2.702634572982788, "learning_rate": 2.4142056822729092e-05, "loss": 0.6612, "mean_token_accuracy": 0.8085413873195648, "num_tokens": 670905525.0, "step": 64670 }, { "entropy": 0.6828884363174439, "epoch": 0.51744, "grad_norm": 1.8668259382247925, "learning_rate": 2.4138055222088836e-05, "loss": 0.6791, "mean_token_accuracy": 0.8034141182899475, "num_tokens": 671001073.0, "step": 64680 }, { "entropy": 0.700045382976532, "epoch": 0.51752, "grad_norm": 2.850921154022217, "learning_rate": 2.413405362144858e-05, "loss": 0.6919, "mean_token_accuracy": 0.7848471820354461, "num_tokens": 671153360.0, "step": 64690 }, { "entropy": 0.6387740999460221, "epoch": 0.5176, "grad_norm": 4.678131103515625, "learning_rate": 2.4130052020808323e-05, "loss": 0.6457, "mean_token_accuracy": 0.8224786043167114, "num_tokens": 671196120.0, "step": 64700 }, { "entropy": 0.6430775880813598, "epoch": 0.51768, "grad_norm": 1.759594202041626, "learning_rate": 2.4126050420168067e-05, "loss": 0.6449, "mean_token_accuracy": 0.7950578927993774, "num_tokens": 671358176.0, "step": 64710 }, { "entropy": 0.6277038365602493, "epoch": 0.51776, "grad_norm": 3.275721549987793, "learning_rate": 2.412204881952781e-05, "loss": 0.6082, "mean_token_accuracy": 0.8203036248683929, "num_tokens": 671434984.0, "step": 64720 }, { "entropy": 0.6539662778377533, "epoch": 0.51784, "grad_norm": 2.0275678634643555, "learning_rate": 2.4118047218887558e-05, "loss": 0.6721, "mean_token_accuracy": 0.806368374824524, "num_tokens": 671528278.0, "step": 64730 }, { "entropy": 0.7629374682903289, "epoch": 0.51792, "grad_norm": 3.000828742980957, "learning_rate": 2.4114045618247298e-05, "loss": 0.7511, "mean_token_accuracy": 0.7810292184352875, "num_tokens": 671669136.0, "step": 64740 }, { "entropy": 0.6474066257476807, "epoch": 0.518, "grad_norm": 5.9796342849731445, "learning_rate": 2.4110044017607045e-05, "loss": 0.6334, "mean_token_accuracy": 0.8262201726436615, "num_tokens": 671709488.0, "step": 64750 }, { "entropy": 0.6300790488719941, "epoch": 0.51808, "grad_norm": 3.8866827487945557, "learning_rate": 2.4106042416966785e-05, "loss": 0.6352, "mean_token_accuracy": 0.7978693842887878, "num_tokens": 671872197.0, "step": 64760 }, { "entropy": 0.715697905421257, "epoch": 0.51816, "grad_norm": 3.003305673599243, "learning_rate": 2.4102040816326533e-05, "loss": 0.6987, "mean_token_accuracy": 0.8017689168453217, "num_tokens": 671948371.0, "step": 64770 }, { "entropy": 0.6824292302131653, "epoch": 0.51824, "grad_norm": 1.3740020990371704, "learning_rate": 2.4098039215686273e-05, "loss": 0.682, "mean_token_accuracy": 0.8017405033111572, "num_tokens": 672042685.0, "step": 64780 }, { "entropy": 0.7077652186155319, "epoch": 0.51832, "grad_norm": 2.4403927326202393, "learning_rate": 2.409403761504602e-05, "loss": 0.7101, "mean_token_accuracy": 0.7883624494075775, "num_tokens": 672179976.0, "step": 64790 }, { "entropy": 0.6589897066354752, "epoch": 0.5184, "grad_norm": 3.735926866531372, "learning_rate": 2.4090036014405764e-05, "loss": 0.658, "mean_token_accuracy": 0.8222115278244019, "num_tokens": 672220425.0, "step": 64800 }, { "entropy": 0.6666192829608917, "epoch": 0.51848, "grad_norm": 1.7058467864990234, "learning_rate": 2.4086034413765507e-05, "loss": 0.6627, "mean_token_accuracy": 0.7937883853912353, "num_tokens": 672380596.0, "step": 64810 }, { "entropy": 0.6024827659130096, "epoch": 0.51856, "grad_norm": 3.7022926807403564, "learning_rate": 2.408203281312525e-05, "loss": 0.5906, "mean_token_accuracy": 0.8346064865589142, "num_tokens": 672446931.0, "step": 64820 }, { "entropy": 0.6362218141555787, "epoch": 0.51864, "grad_norm": 1.7250111103057861, "learning_rate": 2.4078031212484995e-05, "loss": 0.6294, "mean_token_accuracy": 0.8151276886463166, "num_tokens": 672539435.0, "step": 64830 }, { "entropy": 0.6439220130443573, "epoch": 0.51872, "grad_norm": 2.0962417125701904, "learning_rate": 2.407402961184474e-05, "loss": 0.643, "mean_token_accuracy": 0.8005901873111725, "num_tokens": 672673963.0, "step": 64840 }, { "entropy": 0.6681776016950607, "epoch": 0.5188, "grad_norm": 5.202529430389404, "learning_rate": 2.4070028011204482e-05, "loss": 0.6679, "mean_token_accuracy": 0.8162686228752136, "num_tokens": 672714253.0, "step": 64850 }, { "entropy": 0.7028339564800262, "epoch": 0.51888, "grad_norm": 2.1576547622680664, "learning_rate": 2.4066026410564226e-05, "loss": 0.6982, "mean_token_accuracy": 0.7877252519130706, "num_tokens": 672877621.0, "step": 64860 }, { "entropy": 0.6543850004673004, "epoch": 0.51896, "grad_norm": 3.070394515991211, "learning_rate": 2.4062024809923973e-05, "loss": 0.6451, "mean_token_accuracy": 0.8137753069400787, "num_tokens": 672953348.0, "step": 64870 }, { "entropy": 0.7018808156251908, "epoch": 0.51904, "grad_norm": 1.4877634048461914, "learning_rate": 2.4058023209283713e-05, "loss": 0.7144, "mean_token_accuracy": 0.7988519370555878, "num_tokens": 673046062.0, "step": 64880 }, { "entropy": 0.714112913608551, "epoch": 0.51912, "grad_norm": 2.583163261413574, "learning_rate": 2.405402160864346e-05, "loss": 0.7106, "mean_token_accuracy": 0.7853566706180573, "num_tokens": 673192828.0, "step": 64890 }, { "entropy": 0.731869387626648, "epoch": 0.5192, "grad_norm": 5.142854690551758, "learning_rate": 2.40500200080032e-05, "loss": 0.7235, "mean_token_accuracy": 0.8093137741088867, "num_tokens": 673230859.0, "step": 64900 }, { "entropy": 0.5974458128213882, "epoch": 0.51928, "grad_norm": 1.5274577140808105, "learning_rate": 2.4046018407362948e-05, "loss": 0.6036, "mean_token_accuracy": 0.8069736242294312, "num_tokens": 673394699.0, "step": 64910 }, { "entropy": 0.665349292755127, "epoch": 0.51936, "grad_norm": 3.134185314178467, "learning_rate": 2.4042016806722688e-05, "loss": 0.6481, "mean_token_accuracy": 0.8116039097309112, "num_tokens": 673493516.0, "step": 64920 }, { "entropy": 0.6983683466911316, "epoch": 0.51944, "grad_norm": 1.8357443809509277, "learning_rate": 2.4038015206082435e-05, "loss": 0.6936, "mean_token_accuracy": 0.7990276277065277, "num_tokens": 673587454.0, "step": 64930 }, { "entropy": 0.6458372354507447, "epoch": 0.51952, "grad_norm": 2.7974324226379395, "learning_rate": 2.403401360544218e-05, "loss": 0.647, "mean_token_accuracy": 0.7999643981456757, "num_tokens": 673721569.0, "step": 64940 }, { "entropy": 0.6326035410165787, "epoch": 0.5196, "grad_norm": 5.358120441436768, "learning_rate": 2.4030012004801923e-05, "loss": 0.6306, "mean_token_accuracy": 0.8320146441459656, "num_tokens": 673757794.0, "step": 64950 }, { "entropy": 0.7156812012195587, "epoch": 0.51968, "grad_norm": 1.6809734106063843, "learning_rate": 2.4026010404161666e-05, "loss": 0.7128, "mean_token_accuracy": 0.787524425983429, "num_tokens": 673921634.0, "step": 64960 }, { "entropy": 0.7076406091451645, "epoch": 0.51976, "grad_norm": 5.435229301452637, "learning_rate": 2.402200880352141e-05, "loss": 0.7079, "mean_token_accuracy": 0.7991056561470031, "num_tokens": 674010759.0, "step": 64970 }, { "entropy": 0.6815585196018219, "epoch": 0.51984, "grad_norm": 1.7116032838821411, "learning_rate": 2.4018007202881154e-05, "loss": 0.6718, "mean_token_accuracy": 0.8025704383850097, "num_tokens": 674106059.0, "step": 64980 }, { "entropy": 0.7070675432682038, "epoch": 0.51992, "grad_norm": 2.0817668437957764, "learning_rate": 2.4014005602240898e-05, "loss": 0.7057, "mean_token_accuracy": 0.7873043358325958, "num_tokens": 674259429.0, "step": 64990 }, { "entropy": 0.7038462877273559, "epoch": 0.52, "grad_norm": 5.274851322174072, "learning_rate": 2.401000400160064e-05, "loss": 0.7034, "mean_token_accuracy": 0.8079121828079223, "num_tokens": 674307533.0, "step": 65000 }, { "entropy": 0.710547399520874, "epoch": 0.52008, "grad_norm": 1.816295862197876, "learning_rate": 2.4006002400960385e-05, "loss": 0.7067, "mean_token_accuracy": 0.7810863673686981, "num_tokens": 674470260.0, "step": 65010 }, { "entropy": 0.6295238167047501, "epoch": 0.52016, "grad_norm": 3.6697497367858887, "learning_rate": 2.400200080032013e-05, "loss": 0.6258, "mean_token_accuracy": 0.8256213963031769, "num_tokens": 674541490.0, "step": 65020 }, { "entropy": 0.7026835262775422, "epoch": 0.52024, "grad_norm": 2.7626805305480957, "learning_rate": 2.3997999199679876e-05, "loss": 0.6881, "mean_token_accuracy": 0.8045144259929657, "num_tokens": 674634392.0, "step": 65030 }, { "entropy": 0.7342460632324219, "epoch": 0.52032, "grad_norm": 2.903449773788452, "learning_rate": 2.3993997599039616e-05, "loss": 0.7332, "mean_token_accuracy": 0.7815425395965576, "num_tokens": 674772253.0, "step": 65040 }, { "entropy": 0.7093579709529877, "epoch": 0.5204, "grad_norm": 5.78926420211792, "learning_rate": 2.3989995998399363e-05, "loss": 0.7021, "mean_token_accuracy": 0.8139022290706635, "num_tokens": 674806106.0, "step": 65050 }, { "entropy": 0.6449681401252747, "epoch": 0.52048, "grad_norm": 1.9556034803390503, "learning_rate": 2.3985994397759104e-05, "loss": 0.6412, "mean_token_accuracy": 0.7953407466411591, "num_tokens": 674969946.0, "step": 65060 }, { "entropy": 0.6595765262842178, "epoch": 0.52056, "grad_norm": 2.953615188598633, "learning_rate": 2.398199279711885e-05, "loss": 0.6528, "mean_token_accuracy": 0.8163413405418396, "num_tokens": 675047082.0, "step": 65070 }, { "entropy": 0.693769884109497, "epoch": 0.52064, "grad_norm": 1.5678088665008545, "learning_rate": 2.397799119647859e-05, "loss": 0.6954, "mean_token_accuracy": 0.799546217918396, "num_tokens": 675139617.0, "step": 65080 }, { "entropy": 0.6709149837493896, "epoch": 0.52072, "grad_norm": 2.317344903945923, "learning_rate": 2.3973989595838338e-05, "loss": 0.6632, "mean_token_accuracy": 0.7972354292869568, "num_tokens": 675272783.0, "step": 65090 }, { "entropy": 0.7160750389099121, "epoch": 0.5208, "grad_norm": 4.908519268035889, "learning_rate": 2.3969987995198082e-05, "loss": 0.7079, "mean_token_accuracy": 0.8105519652366638, "num_tokens": 675309682.0, "step": 65100 }, { "entropy": 0.6782248973846435, "epoch": 0.52088, "grad_norm": 1.8863966464996338, "learning_rate": 2.3965986394557826e-05, "loss": 0.6786, "mean_token_accuracy": 0.7879409074783326, "num_tokens": 675472967.0, "step": 65110 }, { "entropy": 0.6686437755823136, "epoch": 0.52096, "grad_norm": 3.6677405834198, "learning_rate": 2.396198479391757e-05, "loss": 0.6531, "mean_token_accuracy": 0.8150980532169342, "num_tokens": 675552826.0, "step": 65120 }, { "entropy": 0.7005393624305725, "epoch": 0.52104, "grad_norm": 1.9385651350021362, "learning_rate": 2.3957983193277313e-05, "loss": 0.7083, "mean_token_accuracy": 0.7975947320461273, "num_tokens": 675647322.0, "step": 65130 }, { "entropy": 0.7151827156543732, "epoch": 0.52112, "grad_norm": 2.665977716445923, "learning_rate": 2.3953981592637057e-05, "loss": 0.7082, "mean_token_accuracy": 0.7837055742740631, "num_tokens": 675787450.0, "step": 65140 }, { "entropy": 0.7008076190948487, "epoch": 0.5212, "grad_norm": 4.584321022033691, "learning_rate": 2.39499799919968e-05, "loss": 0.6849, "mean_token_accuracy": 0.816451370716095, "num_tokens": 675824986.0, "step": 65150 }, { "entropy": 0.637781971693039, "epoch": 0.52128, "grad_norm": 1.9685643911361694, "learning_rate": 2.3945978391356544e-05, "loss": 0.6421, "mean_token_accuracy": 0.7959880352020263, "num_tokens": 675988826.0, "step": 65160 }, { "entropy": 0.6309371799230575, "epoch": 0.52136, "grad_norm": 3.2344603538513184, "learning_rate": 2.3941976790716288e-05, "loss": 0.6135, "mean_token_accuracy": 0.822870796918869, "num_tokens": 676074547.0, "step": 65170 }, { "entropy": 0.6145367622375488, "epoch": 0.52144, "grad_norm": 1.4932174682617188, "learning_rate": 2.393797519007603e-05, "loss": 0.6163, "mean_token_accuracy": 0.8250398635864258, "num_tokens": 676168608.0, "step": 65180 }, { "entropy": 0.6801047265529633, "epoch": 0.52152, "grad_norm": 3.5798606872558594, "learning_rate": 2.3933973589435775e-05, "loss": 0.6779, "mean_token_accuracy": 0.7940997302532196, "num_tokens": 676309994.0, "step": 65190 }, { "entropy": 0.6385393679141999, "epoch": 0.5216, "grad_norm": 4.6274495124816895, "learning_rate": 2.392997198879552e-05, "loss": 0.6319, "mean_token_accuracy": 0.8302349627017975, "num_tokens": 676349486.0, "step": 65200 }, { "entropy": 0.6251818805932998, "epoch": 0.52168, "grad_norm": 2.0894603729248047, "learning_rate": 2.3925970388155263e-05, "loss": 0.6253, "mean_token_accuracy": 0.8015876948833466, "num_tokens": 676513326.0, "step": 65210 }, { "entropy": 0.585845735669136, "epoch": 0.52176, "grad_norm": 4.304083347320557, "learning_rate": 2.3921968787515006e-05, "loss": 0.5798, "mean_token_accuracy": 0.8272666931152344, "num_tokens": 676609605.0, "step": 65220 }, { "entropy": 0.6911777138710022, "epoch": 0.52184, "grad_norm": 2.700685739517212, "learning_rate": 2.391796718687475e-05, "loss": 0.6793, "mean_token_accuracy": 0.802818775177002, "num_tokens": 676705208.0, "step": 65230 }, { "entropy": 0.7036262333393097, "epoch": 0.52192, "grad_norm": 3.8385989665985107, "learning_rate": 2.3913965586234494e-05, "loss": 0.7025, "mean_token_accuracy": 0.7878259897232056, "num_tokens": 676838188.0, "step": 65240 }, { "entropy": 0.6930771768093109, "epoch": 0.522, "grad_norm": 4.8169989585876465, "learning_rate": 2.3909963985594238e-05, "loss": 0.6778, "mean_token_accuracy": 0.8199513256549835, "num_tokens": 676873182.0, "step": 65250 }, { "entropy": 0.6087692469358444, "epoch": 0.52208, "grad_norm": 1.6410940885543823, "learning_rate": 2.3905962384953985e-05, "loss": 0.6029, "mean_token_accuracy": 0.806283587217331, "num_tokens": 677037022.0, "step": 65260 }, { "entropy": 0.69021235704422, "epoch": 0.52216, "grad_norm": 3.0988831520080566, "learning_rate": 2.3901960784313725e-05, "loss": 0.6901, "mean_token_accuracy": 0.8010561227798462, "num_tokens": 677132783.0, "step": 65270 }, { "entropy": 0.8048532664775848, "epoch": 0.52224, "grad_norm": 1.361112117767334, "learning_rate": 2.3897959183673472e-05, "loss": 0.7951, "mean_token_accuracy": 0.7761330783367157, "num_tokens": 677228548.0, "step": 65280 }, { "entropy": 0.7117878258228302, "epoch": 0.52232, "grad_norm": 2.6891586780548096, "learning_rate": 2.3893957583033212e-05, "loss": 0.7173, "mean_token_accuracy": 0.784841787815094, "num_tokens": 677372657.0, "step": 65290 }, { "entropy": 0.6349273532629013, "epoch": 0.5224, "grad_norm": 5.733823299407959, "learning_rate": 2.388995598239296e-05, "loss": 0.6173, "mean_token_accuracy": 0.8275227248668671, "num_tokens": 677409669.0, "step": 65300 }, { "entropy": 0.6555743038654327, "epoch": 0.52248, "grad_norm": 2.260267972946167, "learning_rate": 2.38859543817527e-05, "loss": 0.6546, "mean_token_accuracy": 0.7946751415729523, "num_tokens": 677573509.0, "step": 65310 }, { "entropy": 0.6409658819437027, "epoch": 0.52256, "grad_norm": 4.079573154449463, "learning_rate": 2.3881952781112447e-05, "loss": 0.6272, "mean_token_accuracy": 0.8118343472480773, "num_tokens": 677663991.0, "step": 65320 }, { "entropy": 0.6703810274600983, "epoch": 0.52264, "grad_norm": 2.4087436199188232, "learning_rate": 2.387795118047219e-05, "loss": 0.6793, "mean_token_accuracy": 0.8051952362060547, "num_tokens": 677757877.0, "step": 65330 }, { "entropy": 0.6501958817243576, "epoch": 0.52272, "grad_norm": 2.167692184448242, "learning_rate": 2.3873949579831934e-05, "loss": 0.6465, "mean_token_accuracy": 0.802798068523407, "num_tokens": 677892408.0, "step": 65340 }, { "entropy": 0.7023620307445526, "epoch": 0.5228, "grad_norm": 3.8918423652648926, "learning_rate": 2.3869947979191678e-05, "loss": 0.6973, "mean_token_accuracy": 0.8140408158302307, "num_tokens": 677934288.0, "step": 65350 }, { "entropy": 0.7435391664505004, "epoch": 0.52288, "grad_norm": 2.051103115081787, "learning_rate": 2.3865946378551422e-05, "loss": 0.7412, "mean_token_accuracy": 0.7783158242702484, "num_tokens": 678098128.0, "step": 65360 }, { "entropy": 0.7516630858182907, "epoch": 0.52296, "grad_norm": 2.889616012573242, "learning_rate": 2.3861944777911166e-05, "loss": 0.7501, "mean_token_accuracy": 0.7889079749584198, "num_tokens": 678197671.0, "step": 65370 }, { "entropy": 0.6993005275726318, "epoch": 0.52304, "grad_norm": 1.732487678527832, "learning_rate": 2.385794317727091e-05, "loss": 0.703, "mean_token_accuracy": 0.8013499677181244, "num_tokens": 678291321.0, "step": 65380 }, { "entropy": 0.7257639586925506, "epoch": 0.52312, "grad_norm": 2.425619602203369, "learning_rate": 2.3853941576630653e-05, "loss": 0.7171, "mean_token_accuracy": 0.7811049163341522, "num_tokens": 678442434.0, "step": 65390 }, { "entropy": 0.6817639470100403, "epoch": 0.5232, "grad_norm": 4.581266403198242, "learning_rate": 2.38499399759904e-05, "loss": 0.6776, "mean_token_accuracy": 0.8157903909683227, "num_tokens": 678489770.0, "step": 65400 }, { "entropy": 0.6468994617462158, "epoch": 0.52328, "grad_norm": 1.5960174798965454, "learning_rate": 2.384593837535014e-05, "loss": 0.6401, "mean_token_accuracy": 0.7972136378288269, "num_tokens": 678652505.0, "step": 65410 }, { "entropy": 0.6364691317081451, "epoch": 0.52336, "grad_norm": 3.868459701538086, "learning_rate": 2.3841936774709887e-05, "loss": 0.6259, "mean_token_accuracy": 0.8212664604187012, "num_tokens": 678726331.0, "step": 65420 }, { "entropy": 0.6916824460029602, "epoch": 0.52344, "grad_norm": 2.597658157348633, "learning_rate": 2.3837935174069628e-05, "loss": 0.6912, "mean_token_accuracy": 0.8032073020935059, "num_tokens": 678820821.0, "step": 65430 }, { "entropy": 0.6751315474510193, "epoch": 0.52352, "grad_norm": 3.1078736782073975, "learning_rate": 2.3833933573429375e-05, "loss": 0.6692, "mean_token_accuracy": 0.7980189561843872, "num_tokens": 678959682.0, "step": 65440 }, { "entropy": 0.6942387521266937, "epoch": 0.5236, "grad_norm": 4.511228561401367, "learning_rate": 2.3829931972789115e-05, "loss": 0.6772, "mean_token_accuracy": 0.8161547660827637, "num_tokens": 678997569.0, "step": 65450 }, { "entropy": 0.6142378151416779, "epoch": 0.52368, "grad_norm": 1.5703332424163818, "learning_rate": 2.3825930372148862e-05, "loss": 0.614, "mean_token_accuracy": 0.8020599246025085, "num_tokens": 679161174.0, "step": 65460 }, { "entropy": 0.7168028354644775, "epoch": 0.52376, "grad_norm": 2.7637734413146973, "learning_rate": 2.3821928771508606e-05, "loss": 0.713, "mean_token_accuracy": 0.7968554317951202, "num_tokens": 679246299.0, "step": 65470 }, { "entropy": 0.7172234714031219, "epoch": 0.52384, "grad_norm": 1.6420955657958984, "learning_rate": 2.381792717086835e-05, "loss": 0.7202, "mean_token_accuracy": 0.7933364152908325, "num_tokens": 679339290.0, "step": 65480 }, { "entropy": 0.6503628253936767, "epoch": 0.52392, "grad_norm": 3.180959701538086, "learning_rate": 2.3813925570228093e-05, "loss": 0.6454, "mean_token_accuracy": 0.8007101774215698, "num_tokens": 679479440.0, "step": 65490 }, { "entropy": 0.7621937036514282, "epoch": 0.524, "grad_norm": 4.350693225860596, "learning_rate": 2.3809923969587837e-05, "loss": 0.7728, "mean_token_accuracy": 0.7940534710884094, "num_tokens": 679517199.0, "step": 65500 }, { "entropy": 0.6174089431762695, "epoch": 0.52408, "grad_norm": 2.1370961666107178, "learning_rate": 2.380592236894758e-05, "loss": 0.6112, "mean_token_accuracy": 0.8062347292900085, "num_tokens": 679681039.0, "step": 65510 }, { "entropy": 0.7428431928157806, "epoch": 0.52416, "grad_norm": 4.178177833557129, "learning_rate": 2.3801920768307325e-05, "loss": 0.7357, "mean_token_accuracy": 0.7896374046802521, "num_tokens": 679781007.0, "step": 65520 }, { "entropy": 0.701077914237976, "epoch": 0.52424, "grad_norm": 1.9640649557113647, "learning_rate": 2.3797919167667068e-05, "loss": 0.6888, "mean_token_accuracy": 0.8000056564807891, "num_tokens": 679876221.0, "step": 65530 }, { "entropy": 0.6991970002651214, "epoch": 0.52432, "grad_norm": 3.3345839977264404, "learning_rate": 2.3793917567026812e-05, "loss": 0.6971, "mean_token_accuracy": 0.788553673028946, "num_tokens": 680010607.0, "step": 65540 }, { "entropy": 0.6897876679897308, "epoch": 0.5244, "grad_norm": 6.003995418548584, "learning_rate": 2.3789915966386556e-05, "loss": 0.6871, "mean_token_accuracy": 0.8152447938919067, "num_tokens": 680051546.0, "step": 65550 }, { "entropy": 0.6905583024024964, "epoch": 0.52448, "grad_norm": 2.1522817611694336, "learning_rate": 2.37859143657463e-05, "loss": 0.6934, "mean_token_accuracy": 0.7856436312198639, "num_tokens": 680215386.0, "step": 65560 }, { "entropy": 0.5912861138582229, "epoch": 0.52456, "grad_norm": 2.31430721282959, "learning_rate": 2.3781912765106043e-05, "loss": 0.5769, "mean_token_accuracy": 0.8310504674911499, "num_tokens": 680302434.0, "step": 65570 }, { "entropy": 0.7668782353401185, "epoch": 0.52464, "grad_norm": 2.0404787063598633, "learning_rate": 2.3777911164465787e-05, "loss": 0.7661, "mean_token_accuracy": 0.7873212695121765, "num_tokens": 680395360.0, "step": 65580 }, { "entropy": 0.6975838005542755, "epoch": 0.52472, "grad_norm": 2.7526707649230957, "learning_rate": 2.377390956382553e-05, "loss": 0.6957, "mean_token_accuracy": 0.786374169588089, "num_tokens": 680535473.0, "step": 65590 }, { "entropy": 0.6721028804779052, "epoch": 0.5248, "grad_norm": 5.132789611816406, "learning_rate": 2.3769907963185274e-05, "loss": 0.6612, "mean_token_accuracy": 0.825426709651947, "num_tokens": 680567935.0, "step": 65600 }, { "entropy": 0.6950624883174896, "epoch": 0.52488, "grad_norm": 2.412874460220337, "learning_rate": 2.3765906362545018e-05, "loss": 0.6975, "mean_token_accuracy": 0.785250848531723, "num_tokens": 680731754.0, "step": 65610 }, { "entropy": 0.6390143156051635, "epoch": 0.52496, "grad_norm": 3.044438600540161, "learning_rate": 2.3761904761904762e-05, "loss": 0.6191, "mean_token_accuracy": 0.8202166974544525, "num_tokens": 680817578.0, "step": 65620 }, { "entropy": 0.6278048634529114, "epoch": 0.52504, "grad_norm": 2.2751071453094482, "learning_rate": 2.375790316126451e-05, "loss": 0.6285, "mean_token_accuracy": 0.8146472752094269, "num_tokens": 680911873.0, "step": 65630 }, { "entropy": 0.6970721065998078, "epoch": 0.52512, "grad_norm": 1.8783220052719116, "learning_rate": 2.375390156062425e-05, "loss": 0.6877, "mean_token_accuracy": 0.7876082956790924, "num_tokens": 681062487.0, "step": 65640 }, { "entropy": 0.6978322088718414, "epoch": 0.5252, "grad_norm": 6.088903903961182, "learning_rate": 2.3749899959983996e-05, "loss": 0.7154, "mean_token_accuracy": 0.8122473955154419, "num_tokens": 681106454.0, "step": 65650 }, { "entropy": 0.6697133779525757, "epoch": 0.52528, "grad_norm": 1.4807037115097046, "learning_rate": 2.3745898359343737e-05, "loss": 0.6596, "mean_token_accuracy": 0.7977729260921478, "num_tokens": 681269006.0, "step": 65660 }, { "entropy": 0.6504054993391037, "epoch": 0.52536, "grad_norm": 3.2920098304748535, "learning_rate": 2.3741896758703484e-05, "loss": 0.6399, "mean_token_accuracy": 0.8159576714038849, "num_tokens": 681344332.0, "step": 65670 }, { "entropy": 0.6774742901325226, "epoch": 0.52544, "grad_norm": 2.072305202484131, "learning_rate": 2.3737895158063224e-05, "loss": 0.6921, "mean_token_accuracy": 0.8001887917518615, "num_tokens": 681436476.0, "step": 65680 }, { "entropy": 0.6418882846832276, "epoch": 0.52552, "grad_norm": 2.494154691696167, "learning_rate": 2.373389355742297e-05, "loss": 0.6361, "mean_token_accuracy": 0.8048501551151276, "num_tokens": 681569515.0, "step": 65690 }, { "entropy": 0.6528503477573395, "epoch": 0.5256, "grad_norm": 4.342129707336426, "learning_rate": 2.3729891956782715e-05, "loss": 0.6549, "mean_token_accuracy": 0.8222752153873444, "num_tokens": 681606893.0, "step": 65700 }, { "entropy": 0.6330434739589691, "epoch": 0.52568, "grad_norm": 2.201890707015991, "learning_rate": 2.372589035614246e-05, "loss": 0.6275, "mean_token_accuracy": 0.7996091842651367, "num_tokens": 681770733.0, "step": 65710 }, { "entropy": 0.6507109761238098, "epoch": 0.52576, "grad_norm": 2.7978806495666504, "learning_rate": 2.3721888755502202e-05, "loss": 0.6371, "mean_token_accuracy": 0.8113677024841308, "num_tokens": 681865020.0, "step": 65720 }, { "entropy": 0.6497059643268586, "epoch": 0.52584, "grad_norm": 1.4418641328811646, "learning_rate": 2.3717887154861946e-05, "loss": 0.6494, "mean_token_accuracy": 0.8120947897434234, "num_tokens": 681959105.0, "step": 65730 }, { "entropy": 0.6766288578510284, "epoch": 0.52592, "grad_norm": 1.9596197605133057, "learning_rate": 2.371388555422169e-05, "loss": 0.6744, "mean_token_accuracy": 0.7926072835922241, "num_tokens": 682108387.0, "step": 65740 }, { "entropy": 0.6597445189952851, "epoch": 0.526, "grad_norm": 5.461314678192139, "learning_rate": 2.3709883953581433e-05, "loss": 0.6606, "mean_token_accuracy": 0.818920087814331, "num_tokens": 682156034.0, "step": 65750 }, { "entropy": 0.6711259663105011, "epoch": 0.52608, "grad_norm": 1.9574185609817505, "learning_rate": 2.3705882352941177e-05, "loss": 0.6635, "mean_token_accuracy": 0.789618194103241, "num_tokens": 682319307.0, "step": 65760 }, { "entropy": 0.6944647818803787, "epoch": 0.52616, "grad_norm": 3.3848979473114014, "learning_rate": 2.3701880752300924e-05, "loss": 0.6959, "mean_token_accuracy": 0.8015403509140014, "num_tokens": 682391004.0, "step": 65770 }, { "entropy": 0.6826310217380523, "epoch": 0.52624, "grad_norm": 1.9513094425201416, "learning_rate": 2.3697879151660665e-05, "loss": 0.6769, "mean_token_accuracy": 0.8079633116722107, "num_tokens": 682483786.0, "step": 65780 }, { "entropy": 0.6656541258096695, "epoch": 0.52632, "grad_norm": 2.3825955390930176, "learning_rate": 2.369387755102041e-05, "loss": 0.6639, "mean_token_accuracy": 0.8030539512634277, "num_tokens": 682617982.0, "step": 65790 }, { "entropy": 0.7057676672935486, "epoch": 0.5264, "grad_norm": 5.02653694152832, "learning_rate": 2.3689875950380152e-05, "loss": 0.692, "mean_token_accuracy": 0.8085903644561767, "num_tokens": 682655387.0, "step": 65800 }, { "entropy": 0.5904103994369507, "epoch": 0.52648, "grad_norm": 2.099822759628296, "learning_rate": 2.36858743497399e-05, "loss": 0.5961, "mean_token_accuracy": 0.8067365169525147, "num_tokens": 682818667.0, "step": 65810 }, { "entropy": 0.7118953585624694, "epoch": 0.52656, "grad_norm": 4.140548229217529, "learning_rate": 2.368187274909964e-05, "loss": 0.6946, "mean_token_accuracy": 0.8064860939979553, "num_tokens": 682889311.0, "step": 65820 }, { "entropy": 0.7103381156921387, "epoch": 0.52664, "grad_norm": 2.163295269012451, "learning_rate": 2.3677871148459386e-05, "loss": 0.6927, "mean_token_accuracy": 0.8025705754756928, "num_tokens": 682981171.0, "step": 65830 }, { "entropy": 0.6231846570968628, "epoch": 0.52672, "grad_norm": 3.2406327724456787, "learning_rate": 2.3673869547819127e-05, "loss": 0.6282, "mean_token_accuracy": 0.8074575960636139, "num_tokens": 683117827.0, "step": 65840 }, { "entropy": 0.7010738968849182, "epoch": 0.5268, "grad_norm": 3.8791592121124268, "learning_rate": 2.3669867947178874e-05, "loss": 0.7111, "mean_token_accuracy": 0.8107200860977173, "num_tokens": 683154772.0, "step": 65850 }, { "entropy": 0.6439801394939423, "epoch": 0.52688, "grad_norm": 1.62369966506958, "learning_rate": 2.3665866346538618e-05, "loss": 0.6413, "mean_token_accuracy": 0.7962506055831909, "num_tokens": 683318612.0, "step": 65860 }, { "entropy": 0.6313695073127746, "epoch": 0.52696, "grad_norm": 3.4778873920440674, "learning_rate": 2.366186474589836e-05, "loss": 0.6193, "mean_token_accuracy": 0.8224515736103057, "num_tokens": 683394269.0, "step": 65870 }, { "entropy": 0.6673673152923584, "epoch": 0.52704, "grad_norm": 1.6141316890716553, "learning_rate": 2.3657863145258105e-05, "loss": 0.6673, "mean_token_accuracy": 0.8058831751346588, "num_tokens": 683489053.0, "step": 65880 }, { "entropy": 0.7250244915485382, "epoch": 0.52712, "grad_norm": 3.993194103240967, "learning_rate": 2.365386154461785e-05, "loss": 0.7111, "mean_token_accuracy": 0.7860437035560608, "num_tokens": 683630992.0, "step": 65890 }, { "entropy": 0.6732983022928238, "epoch": 0.5272, "grad_norm": 6.626317501068115, "learning_rate": 2.3649859943977592e-05, "loss": 0.6594, "mean_token_accuracy": 0.8178734481334686, "num_tokens": 683670214.0, "step": 65900 }, { "entropy": 0.6837319672107697, "epoch": 0.52728, "grad_norm": 2.1125688552856445, "learning_rate": 2.3645858343337336e-05, "loss": 0.6984, "mean_token_accuracy": 0.7810576558113098, "num_tokens": 683834054.0, "step": 65910 }, { "entropy": 0.7156048059463501, "epoch": 0.52736, "grad_norm": 2.920168399810791, "learning_rate": 2.364185674269708e-05, "loss": 0.6971, "mean_token_accuracy": 0.8026753962039948, "num_tokens": 683925209.0, "step": 65920 }, { "entropy": 0.6496852159500122, "epoch": 0.52744, "grad_norm": 1.8274290561676025, "learning_rate": 2.3637855142056824e-05, "loss": 0.6299, "mean_token_accuracy": 0.815303772687912, "num_tokens": 684020295.0, "step": 65930 }, { "entropy": 0.7490340888500213, "epoch": 0.52752, "grad_norm": 2.652888536453247, "learning_rate": 2.3633853541416567e-05, "loss": 0.7684, "mean_token_accuracy": 0.7734799087047577, "num_tokens": 684165624.0, "step": 65940 }, { "entropy": 0.6928147614002228, "epoch": 0.5276, "grad_norm": 4.8072333335876465, "learning_rate": 2.362985194077631e-05, "loss": 0.6868, "mean_token_accuracy": 0.8164450526237488, "num_tokens": 684205393.0, "step": 65950 }, { "entropy": 0.6088992416858673, "epoch": 0.52768, "grad_norm": 1.630725622177124, "learning_rate": 2.3625850340136055e-05, "loss": 0.6037, "mean_token_accuracy": 0.8082926094532012, "num_tokens": 684365911.0, "step": 65960 }, { "entropy": 0.6571737080812454, "epoch": 0.52776, "grad_norm": 3.5295968055725098, "learning_rate": 2.36218487394958e-05, "loss": 0.6608, "mean_token_accuracy": 0.8168087065219879, "num_tokens": 684430796.0, "step": 65970 }, { "entropy": 0.6989542245864868, "epoch": 0.52784, "grad_norm": 2.1194584369659424, "learning_rate": 2.3617847138855542e-05, "loss": 0.6767, "mean_token_accuracy": 0.8083300948143005, "num_tokens": 684522573.0, "step": 65980 }, { "entropy": 0.7156071364879608, "epoch": 0.52792, "grad_norm": 3.6176235675811768, "learning_rate": 2.3613845538215286e-05, "loss": 0.7083, "mean_token_accuracy": 0.7854281485080719, "num_tokens": 684652458.0, "step": 65990 }, { "entropy": 0.6158035635948181, "epoch": 0.528, "grad_norm": 4.713787078857422, "learning_rate": 2.3609843937575033e-05, "loss": 0.6049, "mean_token_accuracy": 0.8328906238079071, "num_tokens": 684691600.0, "step": 66000 } ], "logging_steps": 10, "max_steps": 125000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1746909280081543e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }