diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,31901 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 4553, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 7.812648680225853, + "learning_rate": 2.9197080291970804e-07, + "loss": 2.184, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 8.477363513420425, + "learning_rate": 5.839416058394161e-07, + "loss": 2.3547, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 8.667187992946625, + "learning_rate": 8.759124087591242e-07, + "loss": 2.3479, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 8.975762686028327, + "learning_rate": 1.1678832116788322e-06, + "loss": 2.5346, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 9.085661236932879, + "learning_rate": 1.4598540145985402e-06, + "loss": 2.4729, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 7.885329238897419, + "learning_rate": 1.7518248175182485e-06, + "loss": 2.2296, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 8.312140664960022, + "learning_rate": 2.0437956204379563e-06, + "loss": 2.3955, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 7.9862669851230255, + "learning_rate": 2.3357664233576643e-06, + "loss": 2.3144, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 6.660452935107435, + "learning_rate": 2.627737226277373e-06, + "loss": 2.2619, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 5.839865306189313, + "learning_rate": 2.9197080291970804e-06, + "loss": 2.0876, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 6.218399728082995, + "learning_rate": 3.2116788321167884e-06, + "loss": 2.1238, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 4.2177468696706075, + "learning_rate": 3.503649635036497e-06, + "loss": 1.9576, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 4.546161818531478, + "learning_rate": 3.7956204379562045e-06, + "loss": 1.8759, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 3.6162250465985224, + "learning_rate": 4.0875912408759126e-06, + "loss": 1.7203, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 3.134376192710703, + "learning_rate": 4.379562043795621e-06, + "loss": 1.6647, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 2.625610462231106, + "learning_rate": 4.671532846715329e-06, + "loss": 1.6871, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 3.90084477103016, + "learning_rate": 4.963503649635037e-06, + "loss": 1.5972, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 3.074734287101781, + "learning_rate": 5.255474452554746e-06, + "loss": 1.5691, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 2.3580179922502973, + "learning_rate": 5.547445255474453e-06, + "loss": 1.4562, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 2.1643910072609365, + "learning_rate": 5.839416058394161e-06, + "loss": 1.4711, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 1.9961717444782574, + "learning_rate": 6.13138686131387e-06, + "loss": 1.5118, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 1.9070903614201673, + "learning_rate": 6.423357664233577e-06, + "loss": 1.5549, + "step": 22 + }, + { + "epoch": 0.01, + "grad_norm": 2.1352623364264467, + "learning_rate": 6.715328467153285e-06, + "loss": 1.5622, + "step": 23 + }, + { + "epoch": 0.01, + "grad_norm": 2.1105178562720255, + "learning_rate": 7.007299270072994e-06, + "loss": 1.526, + "step": 24 + }, + { + "epoch": 0.01, + "grad_norm": 1.943169696695018, + "learning_rate": 7.299270072992701e-06, + "loss": 1.308, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 1.9445026964211596, + "learning_rate": 7.591240875912409e-06, + "loss": 1.4118, + "step": 26 + }, + { + "epoch": 0.01, + "grad_norm": 1.6960437094301213, + "learning_rate": 7.883211678832117e-06, + "loss": 1.3774, + "step": 27 + }, + { + "epoch": 0.01, + "grad_norm": 1.806297937832825, + "learning_rate": 8.175182481751825e-06, + "loss": 1.3251, + "step": 28 + }, + { + "epoch": 0.01, + "grad_norm": 1.925963480221896, + "learning_rate": 8.467153284671533e-06, + "loss": 1.2795, + "step": 29 + }, + { + "epoch": 0.01, + "grad_norm": 1.761128734910925, + "learning_rate": 8.759124087591241e-06, + "loss": 1.3643, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 1.6845485873825556, + "learning_rate": 9.05109489051095e-06, + "loss": 1.1851, + "step": 31 + }, + { + "epoch": 0.01, + "grad_norm": 1.6619942577047109, + "learning_rate": 9.343065693430657e-06, + "loss": 1.3317, + "step": 32 + }, + { + "epoch": 0.01, + "grad_norm": 1.684565641977221, + "learning_rate": 9.635036496350367e-06, + "loss": 1.4255, + "step": 33 + }, + { + "epoch": 0.01, + "grad_norm": 1.7809624690835513, + "learning_rate": 9.927007299270073e-06, + "loss": 1.24, + "step": 34 + }, + { + "epoch": 0.01, + "grad_norm": 1.5927230667953016, + "learning_rate": 1.0218978102189783e-05, + "loss": 1.1678, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 1.708251796110156, + "learning_rate": 1.0510948905109491e-05, + "loss": 1.2783, + "step": 36 + }, + { + "epoch": 0.01, + "grad_norm": 1.61533474408533, + "learning_rate": 1.0802919708029198e-05, + "loss": 1.2122, + "step": 37 + }, + { + "epoch": 0.01, + "grad_norm": 1.534946029285862, + "learning_rate": 1.1094890510948906e-05, + "loss": 1.086, + "step": 38 + }, + { + "epoch": 0.01, + "grad_norm": 1.7288084999717537, + "learning_rate": 1.1386861313868614e-05, + "loss": 1.2476, + "step": 39 + }, + { + "epoch": 0.01, + "grad_norm": 1.6147373823283309, + "learning_rate": 1.1678832116788322e-05, + "loss": 1.2814, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 1.6600660591309946, + "learning_rate": 1.1970802919708031e-05, + "loss": 1.3103, + "step": 41 + }, + { + "epoch": 0.01, + "grad_norm": 1.5377864291720114, + "learning_rate": 1.226277372262774e-05, + "loss": 1.2692, + "step": 42 + }, + { + "epoch": 0.01, + "grad_norm": 1.5523788716951157, + "learning_rate": 1.2554744525547446e-05, + "loss": 1.1435, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 1.5435394390617185, + "learning_rate": 1.2846715328467154e-05, + "loss": 1.1664, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 1.6764266187454582, + "learning_rate": 1.3138686131386862e-05, + "loss": 1.1319, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 1.5873994149609632, + "learning_rate": 1.343065693430657e-05, + "loss": 1.2192, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 1.8001990049344387, + "learning_rate": 1.372262773722628e-05, + "loss": 1.1245, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 1.8115756867886097, + "learning_rate": 1.4014598540145988e-05, + "loss": 1.2252, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 1.577246544210627, + "learning_rate": 1.4306569343065696e-05, + "loss": 1.1797, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 1.5385036325197972, + "learning_rate": 1.4598540145985402e-05, + "loss": 1.0706, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 1.5936588934494904, + "learning_rate": 1.489051094890511e-05, + "loss": 1.1354, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 1.5968142951430568, + "learning_rate": 1.5182481751824818e-05, + "loss": 1.1386, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 1.5492790883712821, + "learning_rate": 1.5474452554744528e-05, + "loss": 1.1177, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 1.6938133819094698, + "learning_rate": 1.5766423357664234e-05, + "loss": 1.178, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 1.4934152874568871, + "learning_rate": 1.6058394160583944e-05, + "loss": 1.1386, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 1.5257065527660285, + "learning_rate": 1.635036496350365e-05, + "loss": 0.9883, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 1.3479382206148167, + "learning_rate": 1.664233576642336e-05, + "loss": 0.9845, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 1.4843433577527922, + "learning_rate": 1.6934306569343066e-05, + "loss": 1.034, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 1.5578311593823568, + "learning_rate": 1.7226277372262773e-05, + "loss": 1.0024, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 1.6841397207836093, + "learning_rate": 1.7518248175182482e-05, + "loss": 1.1098, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 1.5967023842041272, + "learning_rate": 1.7810218978102192e-05, + "loss": 1.0532, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 1.458201329751063, + "learning_rate": 1.81021897810219e-05, + "loss": 1.0601, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 1.4896668719717416, + "learning_rate": 1.8394160583941608e-05, + "loss": 0.9523, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 1.594477543540855, + "learning_rate": 1.8686131386861315e-05, + "loss": 1.0128, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 1.5053984138120584, + "learning_rate": 1.897810218978102e-05, + "loss": 1.0969, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 1.4337321118516833, + "learning_rate": 1.9270072992700734e-05, + "loss": 0.978, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 1.5514572553484482, + "learning_rate": 1.956204379562044e-05, + "loss": 1.0519, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 1.5646238007711162, + "learning_rate": 1.9854014598540147e-05, + "loss": 1.0664, + "step": 68 + }, + { + "epoch": 0.02, + "grad_norm": 1.5916594463537463, + "learning_rate": 2.0145985401459857e-05, + "loss": 1.0785, + "step": 69 + }, + { + "epoch": 0.02, + "grad_norm": 1.3923766430469164, + "learning_rate": 2.0437956204379566e-05, + "loss": 0.9919, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 1.6016619395577614, + "learning_rate": 2.0729927007299273e-05, + "loss": 1.0618, + "step": 71 + }, + { + "epoch": 0.02, + "grad_norm": 1.5406397758843127, + "learning_rate": 2.1021897810218982e-05, + "loss": 1.0485, + "step": 72 + }, + { + "epoch": 0.02, + "grad_norm": 1.6912492146176519, + "learning_rate": 2.131386861313869e-05, + "loss": 1.1137, + "step": 73 + }, + { + "epoch": 0.02, + "grad_norm": 1.4428980274558936, + "learning_rate": 2.1605839416058395e-05, + "loss": 1.0067, + "step": 74 + }, + { + "epoch": 0.02, + "grad_norm": 1.4853785284781014, + "learning_rate": 2.1897810218978105e-05, + "loss": 1.0771, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 1.5161668261227093, + "learning_rate": 2.218978102189781e-05, + "loss": 1.0348, + "step": 76 + }, + { + "epoch": 0.02, + "grad_norm": 1.4922789700419026, + "learning_rate": 2.248175182481752e-05, + "loss": 1.0129, + "step": 77 + }, + { + "epoch": 0.02, + "grad_norm": 1.3967521784164405, + "learning_rate": 2.2773722627737227e-05, + "loss": 0.9375, + "step": 78 + }, + { + "epoch": 0.02, + "grad_norm": 1.4611514337742868, + "learning_rate": 2.3065693430656934e-05, + "loss": 1.0035, + "step": 79 + }, + { + "epoch": 0.02, + "grad_norm": 1.4726560071545307, + "learning_rate": 2.3357664233576643e-05, + "loss": 0.9118, + "step": 80 + }, + { + "epoch": 0.02, + "grad_norm": 1.509851608690166, + "learning_rate": 2.3649635036496353e-05, + "loss": 0.9736, + "step": 81 + }, + { + "epoch": 0.02, + "grad_norm": 1.4854110314617655, + "learning_rate": 2.3941605839416063e-05, + "loss": 1.0382, + "step": 82 + }, + { + "epoch": 0.02, + "grad_norm": 1.4438095939193825, + "learning_rate": 2.423357664233577e-05, + "loss": 0.9426, + "step": 83 + }, + { + "epoch": 0.02, + "grad_norm": 1.588853832486513, + "learning_rate": 2.452554744525548e-05, + "loss": 1.0912, + "step": 84 + }, + { + "epoch": 0.02, + "grad_norm": 1.4629616000754389, + "learning_rate": 2.4817518248175185e-05, + "loss": 0.9363, + "step": 85 + }, + { + "epoch": 0.02, + "grad_norm": 1.5126486586979293, + "learning_rate": 2.510948905109489e-05, + "loss": 0.8875, + "step": 86 + }, + { + "epoch": 0.02, + "grad_norm": 1.6442965886013419, + "learning_rate": 2.54014598540146e-05, + "loss": 1.0676, + "step": 87 + }, + { + "epoch": 0.02, + "grad_norm": 1.5786077686311497, + "learning_rate": 2.5693430656934308e-05, + "loss": 1.0544, + "step": 88 + }, + { + "epoch": 0.02, + "grad_norm": 1.5424329382663844, + "learning_rate": 2.5985401459854017e-05, + "loss": 1.0603, + "step": 89 + }, + { + "epoch": 0.02, + "grad_norm": 1.3763163074731595, + "learning_rate": 2.6277372262773724e-05, + "loss": 0.9341, + "step": 90 + }, + { + "epoch": 0.02, + "grad_norm": 1.36645217739804, + "learning_rate": 2.656934306569343e-05, + "loss": 0.9277, + "step": 91 + }, + { + "epoch": 0.02, + "grad_norm": 1.476560239436927, + "learning_rate": 2.686131386861314e-05, + "loss": 0.9549, + "step": 92 + }, + { + "epoch": 0.02, + "grad_norm": 1.651790661601902, + "learning_rate": 2.7153284671532846e-05, + "loss": 1.0434, + "step": 93 + }, + { + "epoch": 0.02, + "grad_norm": 1.564711569632748, + "learning_rate": 2.744525547445256e-05, + "loss": 0.9528, + "step": 94 + }, + { + "epoch": 0.02, + "grad_norm": 1.3892575516439725, + "learning_rate": 2.7737226277372266e-05, + "loss": 0.9055, + "step": 95 + }, + { + "epoch": 0.02, + "grad_norm": 1.47338517258931, + "learning_rate": 2.8029197080291975e-05, + "loss": 0.9045, + "step": 96 + }, + { + "epoch": 0.02, + "grad_norm": 1.4283014468117288, + "learning_rate": 2.832116788321168e-05, + "loss": 0.9418, + "step": 97 + }, + { + "epoch": 0.02, + "grad_norm": 1.4984556035926158, + "learning_rate": 2.861313868613139e-05, + "loss": 1.0147, + "step": 98 + }, + { + "epoch": 0.02, + "grad_norm": 1.5189781457158873, + "learning_rate": 2.8905109489051098e-05, + "loss": 0.9159, + "step": 99 + }, + { + "epoch": 0.02, + "grad_norm": 1.4628925807313362, + "learning_rate": 2.9197080291970804e-05, + "loss": 0.9209, + "step": 100 + }, + { + "epoch": 0.02, + "grad_norm": 1.5929870275147635, + "learning_rate": 2.9489051094890514e-05, + "loss": 0.9557, + "step": 101 + }, + { + "epoch": 0.02, + "grad_norm": 1.4503689132203692, + "learning_rate": 2.978102189781022e-05, + "loss": 0.8777, + "step": 102 + }, + { + "epoch": 0.02, + "grad_norm": 1.387439894877902, + "learning_rate": 3.007299270072993e-05, + "loss": 0.8537, + "step": 103 + }, + { + "epoch": 0.02, + "grad_norm": 1.4809151733801909, + "learning_rate": 3.0364963503649636e-05, + "loss": 0.8067, + "step": 104 + }, + { + "epoch": 0.02, + "grad_norm": 1.4872114687064437, + "learning_rate": 3.0656934306569346e-05, + "loss": 0.9561, + "step": 105 + }, + { + "epoch": 0.02, + "grad_norm": 1.3704334201331871, + "learning_rate": 3.0948905109489056e-05, + "loss": 0.832, + "step": 106 + }, + { + "epoch": 0.02, + "grad_norm": 1.2970664101409684, + "learning_rate": 3.1240875912408765e-05, + "loss": 0.8222, + "step": 107 + }, + { + "epoch": 0.02, + "grad_norm": 1.4920176464295187, + "learning_rate": 3.153284671532847e-05, + "loss": 0.9368, + "step": 108 + }, + { + "epoch": 0.02, + "grad_norm": 1.3703280318629414, + "learning_rate": 3.182481751824818e-05, + "loss": 0.9345, + "step": 109 + }, + { + "epoch": 0.02, + "grad_norm": 1.3550291640320893, + "learning_rate": 3.211678832116789e-05, + "loss": 0.8403, + "step": 110 + }, + { + "epoch": 0.02, + "grad_norm": 1.4486058010850487, + "learning_rate": 3.24087591240876e-05, + "loss": 0.8456, + "step": 111 + }, + { + "epoch": 0.02, + "grad_norm": 1.5455373509810715, + "learning_rate": 3.27007299270073e-05, + "loss": 0.9237, + "step": 112 + }, + { + "epoch": 0.02, + "grad_norm": 1.562125351812911, + "learning_rate": 3.299270072992701e-05, + "loss": 0.8496, + "step": 113 + }, + { + "epoch": 0.03, + "grad_norm": 1.409157185100568, + "learning_rate": 3.328467153284672e-05, + "loss": 0.8066, + "step": 114 + }, + { + "epoch": 0.03, + "grad_norm": 1.5594940262435777, + "learning_rate": 3.357664233576642e-05, + "loss": 0.8461, + "step": 115 + }, + { + "epoch": 0.03, + "grad_norm": 1.587613727971374, + "learning_rate": 3.386861313868613e-05, + "loss": 0.8593, + "step": 116 + }, + { + "epoch": 0.03, + "grad_norm": 1.4364812807164014, + "learning_rate": 3.416058394160584e-05, + "loss": 0.9084, + "step": 117 + }, + { + "epoch": 0.03, + "grad_norm": 1.4354830980730897, + "learning_rate": 3.4452554744525545e-05, + "loss": 0.8356, + "step": 118 + }, + { + "epoch": 0.03, + "grad_norm": 1.370928545179788, + "learning_rate": 3.474452554744526e-05, + "loss": 0.8, + "step": 119 + }, + { + "epoch": 0.03, + "grad_norm": 1.4805453237896125, + "learning_rate": 3.5036496350364965e-05, + "loss": 0.8707, + "step": 120 + }, + { + "epoch": 0.03, + "grad_norm": 1.3529579147537623, + "learning_rate": 3.5328467153284675e-05, + "loss": 0.7764, + "step": 121 + }, + { + "epoch": 0.03, + "grad_norm": 1.547681039749328, + "learning_rate": 3.5620437956204384e-05, + "loss": 0.8775, + "step": 122 + }, + { + "epoch": 0.03, + "grad_norm": 1.5332301823846584, + "learning_rate": 3.5912408759124094e-05, + "loss": 0.9876, + "step": 123 + }, + { + "epoch": 0.03, + "grad_norm": 1.512126463485808, + "learning_rate": 3.62043795620438e-05, + "loss": 0.9528, + "step": 124 + }, + { + "epoch": 0.03, + "grad_norm": 1.4554233218365884, + "learning_rate": 3.649635036496351e-05, + "loss": 0.8889, + "step": 125 + }, + { + "epoch": 0.03, + "grad_norm": 1.3808554901543213, + "learning_rate": 3.6788321167883217e-05, + "loss": 0.7677, + "step": 126 + }, + { + "epoch": 0.03, + "grad_norm": 1.4277159223916567, + "learning_rate": 3.708029197080292e-05, + "loss": 0.7742, + "step": 127 + }, + { + "epoch": 0.03, + "grad_norm": 1.49818015966614, + "learning_rate": 3.737226277372263e-05, + "loss": 0.7219, + "step": 128 + }, + { + "epoch": 0.03, + "grad_norm": 1.6176138053372855, + "learning_rate": 3.766423357664234e-05, + "loss": 0.9805, + "step": 129 + }, + { + "epoch": 0.03, + "grad_norm": 1.6421738352692505, + "learning_rate": 3.795620437956204e-05, + "loss": 0.9051, + "step": 130 + }, + { + "epoch": 0.03, + "grad_norm": 1.3299414386620134, + "learning_rate": 3.824817518248176e-05, + "loss": 0.7942, + "step": 131 + }, + { + "epoch": 0.03, + "grad_norm": 1.390975222259264, + "learning_rate": 3.854014598540147e-05, + "loss": 0.9009, + "step": 132 + }, + { + "epoch": 0.03, + "grad_norm": 1.33912799723986, + "learning_rate": 3.883211678832117e-05, + "loss": 0.8367, + "step": 133 + }, + { + "epoch": 0.03, + "grad_norm": 1.302621175946519, + "learning_rate": 3.912408759124088e-05, + "loss": 0.7549, + "step": 134 + }, + { + "epoch": 0.03, + "grad_norm": 1.405998800941922, + "learning_rate": 3.941605839416059e-05, + "loss": 0.7584, + "step": 135 + }, + { + "epoch": 0.03, + "grad_norm": 1.5325238903977623, + "learning_rate": 3.9708029197080294e-05, + "loss": 0.8071, + "step": 136 + }, + { + "epoch": 0.03, + "grad_norm": 1.4485039194756526, + "learning_rate": 4e-05, + "loss": 0.7941, + "step": 137 + }, + { + "epoch": 0.03, + "grad_norm": 1.4259171930589734, + "learning_rate": 3.99999949389387e-05, + "loss": 0.8034, + "step": 138 + }, + { + "epoch": 0.03, + "grad_norm": 1.3034281791915299, + "learning_rate": 3.999997975575736e-05, + "loss": 0.7876, + "step": 139 + }, + { + "epoch": 0.03, + "grad_norm": 1.5026318191108297, + "learning_rate": 3.9999954450463665e-05, + "loss": 0.8762, + "step": 140 + }, + { + "epoch": 0.03, + "grad_norm": 1.3526883590322558, + "learning_rate": 3.9999919023070414e-05, + "loss": 0.8062, + "step": 141 + }, + { + "epoch": 0.03, + "grad_norm": 1.385679946587705, + "learning_rate": 3.999987347359555e-05, + "loss": 0.7728, + "step": 142 + }, + { + "epoch": 0.03, + "grad_norm": 1.4683155472393012, + "learning_rate": 3.999981780206212e-05, + "loss": 0.7596, + "step": 143 + }, + { + "epoch": 0.03, + "grad_norm": 1.3424429191942802, + "learning_rate": 3.99997520084983e-05, + "loss": 0.7176, + "step": 144 + }, + { + "epoch": 0.03, + "grad_norm": 1.4314773279087492, + "learning_rate": 3.999967609293739e-05, + "loss": 0.9188, + "step": 145 + }, + { + "epoch": 0.03, + "grad_norm": 1.3056482672181737, + "learning_rate": 3.99995900554178e-05, + "loss": 0.7416, + "step": 146 + }, + { + "epoch": 0.03, + "grad_norm": 1.4585949890098764, + "learning_rate": 3.999949389598309e-05, + "loss": 0.824, + "step": 147 + }, + { + "epoch": 0.03, + "grad_norm": 1.337799603783892, + "learning_rate": 3.999938761468192e-05, + "loss": 0.6902, + "step": 148 + }, + { + "epoch": 0.03, + "grad_norm": 1.5249693382807843, + "learning_rate": 3.9999271211568084e-05, + "loss": 0.7957, + "step": 149 + }, + { + "epoch": 0.03, + "grad_norm": 1.5249336134421996, + "learning_rate": 3.999914468670048e-05, + "loss": 0.7885, + "step": 150 + }, + { + "epoch": 0.03, + "grad_norm": 1.3788826483849044, + "learning_rate": 3.999900804014317e-05, + "loss": 0.807, + "step": 151 + }, + { + "epoch": 0.03, + "grad_norm": 1.408791428424769, + "learning_rate": 3.9998861271965285e-05, + "loss": 0.7954, + "step": 152 + }, + { + "epoch": 0.03, + "grad_norm": 1.288136513782112, + "learning_rate": 3.999870438224111e-05, + "loss": 0.7176, + "step": 153 + }, + { + "epoch": 0.03, + "grad_norm": 1.3975412917477223, + "learning_rate": 3.999853737105007e-05, + "loss": 0.8284, + "step": 154 + }, + { + "epoch": 0.03, + "grad_norm": 1.5646770569437787, + "learning_rate": 3.9998360238476655e-05, + "loss": 0.8673, + "step": 155 + }, + { + "epoch": 0.03, + "grad_norm": 1.4448607692497883, + "learning_rate": 3.999817298461054e-05, + "loss": 0.8099, + "step": 156 + }, + { + "epoch": 0.03, + "grad_norm": 1.4221062105434186, + "learning_rate": 3.999797560954649e-05, + "loss": 0.8186, + "step": 157 + }, + { + "epoch": 0.03, + "grad_norm": 1.3541790105795066, + "learning_rate": 3.999776811338439e-05, + "loss": 0.7527, + "step": 158 + }, + { + "epoch": 0.03, + "grad_norm": 1.310569615173891, + "learning_rate": 3.999755049622926e-05, + "loss": 0.6947, + "step": 159 + }, + { + "epoch": 0.04, + "grad_norm": 1.2855053459421322, + "learning_rate": 3.9997322758191244e-05, + "loss": 0.6425, + "step": 160 + }, + { + "epoch": 0.04, + "grad_norm": 1.456957613232235, + "learning_rate": 3.999708489938559e-05, + "loss": 0.7138, + "step": 161 + }, + { + "epoch": 0.04, + "grad_norm": 1.3154442325459081, + "learning_rate": 3.999683691993268e-05, + "loss": 0.6217, + "step": 162 + }, + { + "epoch": 0.04, + "grad_norm": 1.3707120967385478, + "learning_rate": 3.999657881995802e-05, + "loss": 0.734, + "step": 163 + }, + { + "epoch": 0.04, + "grad_norm": 1.3464468131540774, + "learning_rate": 3.9996310599592244e-05, + "loss": 0.8018, + "step": 164 + }, + { + "epoch": 0.04, + "grad_norm": 1.3131685370844959, + "learning_rate": 3.9996032258971097e-05, + "loss": 0.7634, + "step": 165 + }, + { + "epoch": 0.04, + "grad_norm": 1.3014680349591794, + "learning_rate": 3.9995743798235445e-05, + "loss": 0.7904, + "step": 166 + }, + { + "epoch": 0.04, + "grad_norm": 1.3306836781749012, + "learning_rate": 3.999544521753128e-05, + "loss": 0.7516, + "step": 167 + }, + { + "epoch": 0.04, + "grad_norm": 1.3280616745156995, + "learning_rate": 3.999513651700971e-05, + "loss": 0.7404, + "step": 168 + }, + { + "epoch": 0.04, + "grad_norm": 1.382294035015746, + "learning_rate": 3.999481769682699e-05, + "loss": 0.7423, + "step": 169 + }, + { + "epoch": 0.04, + "grad_norm": 1.289816572248431, + "learning_rate": 3.9994488757144454e-05, + "loss": 0.6838, + "step": 170 + }, + { + "epoch": 0.04, + "grad_norm": 1.351114849421247, + "learning_rate": 3.999414969812859e-05, + "loss": 0.7064, + "step": 171 + }, + { + "epoch": 0.04, + "grad_norm": 1.112602374906335, + "learning_rate": 3.9993800519951e-05, + "loss": 0.5968, + "step": 172 + }, + { + "epoch": 0.04, + "grad_norm": 1.3237891836733933, + "learning_rate": 3.99934412227884e-05, + "loss": 0.7149, + "step": 173 + }, + { + "epoch": 0.04, + "grad_norm": 1.3127998282064572, + "learning_rate": 3.999307180682264e-05, + "loss": 0.6785, + "step": 174 + }, + { + "epoch": 0.04, + "grad_norm": 1.3832342075926638, + "learning_rate": 3.9992692272240684e-05, + "loss": 0.681, + "step": 175 + }, + { + "epoch": 0.04, + "grad_norm": 1.2908340765054303, + "learning_rate": 3.99923026192346e-05, + "loss": 0.6676, + "step": 176 + }, + { + "epoch": 0.04, + "grad_norm": 1.3544599313339964, + "learning_rate": 3.999190284800162e-05, + "loss": 0.7121, + "step": 177 + }, + { + "epoch": 0.04, + "grad_norm": 1.2556712245455064, + "learning_rate": 3.9991492958744046e-05, + "loss": 0.7016, + "step": 178 + }, + { + "epoch": 0.04, + "grad_norm": 1.3426939784728575, + "learning_rate": 3.9991072951669334e-05, + "loss": 0.7133, + "step": 179 + }, + { + "epoch": 0.04, + "grad_norm": 1.2358437986095847, + "learning_rate": 3.999064282699006e-05, + "loss": 0.6931, + "step": 180 + }, + { + "epoch": 0.04, + "grad_norm": 1.4329296933567661, + "learning_rate": 3.999020258492391e-05, + "loss": 0.7873, + "step": 181 + }, + { + "epoch": 0.04, + "grad_norm": 1.3413303680374504, + "learning_rate": 3.998975222569368e-05, + "loss": 0.668, + "step": 182 + }, + { + "epoch": 0.04, + "grad_norm": 1.3699727488682398, + "learning_rate": 3.9989291749527314e-05, + "loss": 0.744, + "step": 183 + }, + { + "epoch": 0.04, + "grad_norm": 1.2672579556409398, + "learning_rate": 3.998882115665786e-05, + "loss": 0.7541, + "step": 184 + }, + { + "epoch": 0.04, + "grad_norm": 1.1930919291337678, + "learning_rate": 3.998834044732348e-05, + "loss": 0.6611, + "step": 185 + }, + { + "epoch": 0.04, + "grad_norm": 1.3237952171145917, + "learning_rate": 3.9987849621767473e-05, + "loss": 0.6954, + "step": 186 + }, + { + "epoch": 0.04, + "grad_norm": 1.3491031122269155, + "learning_rate": 3.998734868023825e-05, + "loss": 0.6532, + "step": 187 + }, + { + "epoch": 0.04, + "grad_norm": 1.2474273911255802, + "learning_rate": 3.998683762298933e-05, + "loss": 0.614, + "step": 188 + }, + { + "epoch": 0.04, + "grad_norm": 1.4339242497046318, + "learning_rate": 3.9986316450279365e-05, + "loss": 0.7592, + "step": 189 + }, + { + "epoch": 0.04, + "grad_norm": 1.3155313182105586, + "learning_rate": 3.9985785162372135e-05, + "loss": 0.6463, + "step": 190 + }, + { + "epoch": 0.04, + "grad_norm": 1.2720545649011348, + "learning_rate": 3.998524375953651e-05, + "loss": 0.6925, + "step": 191 + }, + { + "epoch": 0.04, + "grad_norm": 1.3250032424887084, + "learning_rate": 3.998469224204652e-05, + "loss": 0.6736, + "step": 192 + }, + { + "epoch": 0.04, + "grad_norm": 1.3797040149914255, + "learning_rate": 3.998413061018126e-05, + "loss": 0.7955, + "step": 193 + }, + { + "epoch": 0.04, + "grad_norm": 1.2805602845669852, + "learning_rate": 3.9983558864225005e-05, + "loss": 0.7063, + "step": 194 + }, + { + "epoch": 0.04, + "grad_norm": 1.4422115567764362, + "learning_rate": 3.9982977004467106e-05, + "loss": 0.6954, + "step": 195 + }, + { + "epoch": 0.04, + "grad_norm": 1.207992023189052, + "learning_rate": 3.998238503120205e-05, + "loss": 0.6594, + "step": 196 + }, + { + "epoch": 0.04, + "grad_norm": 1.363506484348838, + "learning_rate": 3.998178294472944e-05, + "loss": 0.6448, + "step": 197 + }, + { + "epoch": 0.04, + "grad_norm": 1.322121278442904, + "learning_rate": 3.998117074535398e-05, + "loss": 0.6708, + "step": 198 + }, + { + "epoch": 0.04, + "grad_norm": 1.349313308768587, + "learning_rate": 3.9980548433385525e-05, + "loss": 0.6601, + "step": 199 + }, + { + "epoch": 0.04, + "grad_norm": 1.3259708662327934, + "learning_rate": 3.997991600913903e-05, + "loss": 0.6067, + "step": 200 + }, + { + "epoch": 0.04, + "grad_norm": 1.2868747989456146, + "learning_rate": 3.9979273472934556e-05, + "loss": 0.6959, + "step": 201 + }, + { + "epoch": 0.04, + "grad_norm": 1.2581457798279778, + "learning_rate": 3.9978620825097306e-05, + "loss": 0.6414, + "step": 202 + }, + { + "epoch": 0.04, + "grad_norm": 1.2659091865878211, + "learning_rate": 3.997795806595758e-05, + "loss": 0.6121, + "step": 203 + }, + { + "epoch": 0.04, + "grad_norm": 1.2788722607588214, + "learning_rate": 3.9977285195850816e-05, + "loss": 0.7051, + "step": 204 + }, + { + "epoch": 0.05, + "grad_norm": 1.268616331979891, + "learning_rate": 3.9976602215117554e-05, + "loss": 0.5972, + "step": 205 + }, + { + "epoch": 0.05, + "grad_norm": 1.2646466465057298, + "learning_rate": 3.997590912410345e-05, + "loss": 0.5288, + "step": 206 + }, + { + "epoch": 0.05, + "grad_norm": 1.4566461703664726, + "learning_rate": 3.997520592315929e-05, + "loss": 0.706, + "step": 207 + }, + { + "epoch": 0.05, + "grad_norm": 1.298420685844027, + "learning_rate": 3.997449261264095e-05, + "loss": 0.673, + "step": 208 + }, + { + "epoch": 0.05, + "grad_norm": 1.2174582849832838, + "learning_rate": 3.997376919290946e-05, + "loss": 0.5606, + "step": 209 + }, + { + "epoch": 0.05, + "grad_norm": 1.164802930233094, + "learning_rate": 3.997303566433094e-05, + "loss": 0.5624, + "step": 210 + }, + { + "epoch": 0.05, + "grad_norm": 1.1716197435217768, + "learning_rate": 3.997229202727663e-05, + "loss": 0.5921, + "step": 211 + }, + { + "epoch": 0.05, + "grad_norm": 1.2121105296122996, + "learning_rate": 3.99715382821229e-05, + "loss": 0.6432, + "step": 212 + }, + { + "epoch": 0.05, + "grad_norm": 1.2524098531752716, + "learning_rate": 3.997077442925122e-05, + "loss": 0.6192, + "step": 213 + }, + { + "epoch": 0.05, + "grad_norm": 1.1942955627853422, + "learning_rate": 3.997000046904817e-05, + "loss": 0.6344, + "step": 214 + }, + { + "epoch": 0.05, + "grad_norm": 1.2520875665187947, + "learning_rate": 3.996921640190547e-05, + "loss": 0.6857, + "step": 215 + }, + { + "epoch": 0.05, + "grad_norm": 1.177316721281303, + "learning_rate": 3.996842222821994e-05, + "loss": 0.5736, + "step": 216 + }, + { + "epoch": 0.05, + "grad_norm": 1.224012311462119, + "learning_rate": 3.9967617948393504e-05, + "loss": 0.6102, + "step": 217 + }, + { + "epoch": 0.05, + "grad_norm": 1.2894347607016468, + "learning_rate": 3.996680356283322e-05, + "loss": 0.6487, + "step": 218 + }, + { + "epoch": 0.05, + "grad_norm": 1.3895495252866978, + "learning_rate": 3.996597907195126e-05, + "loss": 0.661, + "step": 219 + }, + { + "epoch": 0.05, + "grad_norm": 1.32050710293565, + "learning_rate": 3.996514447616489e-05, + "loss": 0.6459, + "step": 220 + }, + { + "epoch": 0.05, + "grad_norm": 1.180191235644313, + "learning_rate": 3.996429977589653e-05, + "loss": 0.6245, + "step": 221 + }, + { + "epoch": 0.05, + "grad_norm": 1.249228191518979, + "learning_rate": 3.9963444971573656e-05, + "loss": 0.6258, + "step": 222 + }, + { + "epoch": 0.05, + "grad_norm": 1.2439700598103596, + "learning_rate": 3.996258006362891e-05, + "loss": 0.6282, + "step": 223 + }, + { + "epoch": 0.05, + "grad_norm": 1.262995307376024, + "learning_rate": 3.996170505250002e-05, + "loss": 0.5968, + "step": 224 + }, + { + "epoch": 0.05, + "grad_norm": 1.2049038110227466, + "learning_rate": 3.9960819938629834e-05, + "loss": 0.6372, + "step": 225 + }, + { + "epoch": 0.05, + "grad_norm": 1.1953981655564385, + "learning_rate": 3.995992472246632e-05, + "loss": 0.5788, + "step": 226 + }, + { + "epoch": 0.05, + "grad_norm": 1.1900866094695106, + "learning_rate": 3.995901940446254e-05, + "loss": 0.571, + "step": 227 + }, + { + "epoch": 0.05, + "grad_norm": 1.2301740989906733, + "learning_rate": 3.995810398507669e-05, + "loss": 0.5917, + "step": 228 + }, + { + "epoch": 0.05, + "grad_norm": 1.2523850099236096, + "learning_rate": 3.995717846477207e-05, + "loss": 0.5853, + "step": 229 + }, + { + "epoch": 0.05, + "grad_norm": 1.2378971695538876, + "learning_rate": 3.9956242844017094e-05, + "loss": 0.5738, + "step": 230 + }, + { + "epoch": 0.05, + "grad_norm": 1.1288816455944937, + "learning_rate": 3.995529712328528e-05, + "loss": 0.5542, + "step": 231 + }, + { + "epoch": 0.05, + "grad_norm": 1.1148924829622244, + "learning_rate": 3.995434130305526e-05, + "loss": 0.5093, + "step": 232 + }, + { + "epoch": 0.05, + "grad_norm": 1.2799793548707197, + "learning_rate": 3.995337538381079e-05, + "loss": 0.5565, + "step": 233 + }, + { + "epoch": 0.05, + "grad_norm": 1.231420239912531, + "learning_rate": 3.995239936604072e-05, + "loss": 0.5714, + "step": 234 + }, + { + "epoch": 0.05, + "grad_norm": 1.3992004870442785, + "learning_rate": 3.995141325023902e-05, + "loss": 0.6154, + "step": 235 + }, + { + "epoch": 0.05, + "grad_norm": 1.1949779565924818, + "learning_rate": 3.995041703690477e-05, + "loss": 0.5143, + "step": 236 + }, + { + "epoch": 0.05, + "grad_norm": 1.198730767088812, + "learning_rate": 3.994941072654215e-05, + "loss": 0.4691, + "step": 237 + }, + { + "epoch": 0.05, + "grad_norm": 1.2731486267833902, + "learning_rate": 3.9948394319660485e-05, + "loss": 0.5808, + "step": 238 + }, + { + "epoch": 0.05, + "grad_norm": 1.3653911887989123, + "learning_rate": 3.994736781677416e-05, + "loss": 0.65, + "step": 239 + }, + { + "epoch": 0.05, + "grad_norm": 1.3105154929120355, + "learning_rate": 3.994633121840271e-05, + "loss": 0.6434, + "step": 240 + }, + { + "epoch": 0.05, + "grad_norm": 1.2381977811292837, + "learning_rate": 3.994528452507076e-05, + "loss": 0.5829, + "step": 241 + }, + { + "epoch": 0.05, + "grad_norm": 1.1402137746645646, + "learning_rate": 3.994422773730803e-05, + "loss": 0.509, + "step": 242 + }, + { + "epoch": 0.05, + "grad_norm": 1.208429458784759, + "learning_rate": 3.99431608556494e-05, + "loss": 0.6454, + "step": 243 + }, + { + "epoch": 0.05, + "grad_norm": 1.1542773043398307, + "learning_rate": 3.99420838806348e-05, + "loss": 0.5745, + "step": 244 + }, + { + "epoch": 0.05, + "grad_norm": 1.0919978139422268, + "learning_rate": 3.99409968128093e-05, + "loss": 0.4986, + "step": 245 + }, + { + "epoch": 0.05, + "grad_norm": 1.113345977507724, + "learning_rate": 3.993989965272308e-05, + "loss": 0.5712, + "step": 246 + }, + { + "epoch": 0.05, + "grad_norm": 1.3593819278233579, + "learning_rate": 3.99387924009314e-05, + "loss": 0.6304, + "step": 247 + }, + { + "epoch": 0.05, + "grad_norm": 1.1946094913942888, + "learning_rate": 3.9937675057994666e-05, + "loss": 0.5803, + "step": 248 + }, + { + "epoch": 0.05, + "grad_norm": 1.1889538648909175, + "learning_rate": 3.993654762447837e-05, + "loss": 0.5862, + "step": 249 + }, + { + "epoch": 0.05, + "grad_norm": 1.0987200486198738, + "learning_rate": 3.9935410100953105e-05, + "loss": 0.4049, + "step": 250 + }, + { + "epoch": 0.06, + "grad_norm": 1.2101428962694776, + "learning_rate": 3.993426248799458e-05, + "loss": 0.57, + "step": 251 + }, + { + "epoch": 0.06, + "grad_norm": 1.234140639450306, + "learning_rate": 3.993310478618361e-05, + "loss": 0.5653, + "step": 252 + }, + { + "epoch": 0.06, + "grad_norm": 1.2708140773043917, + "learning_rate": 3.993193699610612e-05, + "loss": 0.588, + "step": 253 + }, + { + "epoch": 0.06, + "grad_norm": 1.2785513299746012, + "learning_rate": 3.9930759118353124e-05, + "loss": 0.6005, + "step": 254 + }, + { + "epoch": 0.06, + "grad_norm": 1.4274452845681305, + "learning_rate": 3.992957115352077e-05, + "loss": 0.6807, + "step": 255 + }, + { + "epoch": 0.06, + "grad_norm": 1.2004749788586693, + "learning_rate": 3.992837310221028e-05, + "loss": 0.587, + "step": 256 + }, + { + "epoch": 0.06, + "grad_norm": 1.1666004752501002, + "learning_rate": 3.9927164965028006e-05, + "loss": 0.4766, + "step": 257 + }, + { + "epoch": 0.06, + "grad_norm": 1.18853834833233, + "learning_rate": 3.9925946742585385e-05, + "loss": 0.5422, + "step": 258 + }, + { + "epoch": 0.06, + "grad_norm": 1.2094667424537495, + "learning_rate": 3.9924718435498964e-05, + "loss": 0.559, + "step": 259 + }, + { + "epoch": 0.06, + "grad_norm": 1.2747637155785871, + "learning_rate": 3.9923480044390405e-05, + "loss": 0.5448, + "step": 260 + }, + { + "epoch": 0.06, + "grad_norm": 1.1455729564410346, + "learning_rate": 3.9922231569886464e-05, + "loss": 0.4595, + "step": 261 + }, + { + "epoch": 0.06, + "grad_norm": 1.2280623868943858, + "learning_rate": 3.9920973012619e-05, + "loss": 0.4862, + "step": 262 + }, + { + "epoch": 0.06, + "grad_norm": 1.0514020822946684, + "learning_rate": 3.9919704373224984e-05, + "loss": 0.4702, + "step": 263 + }, + { + "epoch": 0.06, + "grad_norm": 1.1276534365607926, + "learning_rate": 3.991842565234647e-05, + "loss": 0.5056, + "step": 264 + }, + { + "epoch": 0.06, + "grad_norm": 1.1465460843532795, + "learning_rate": 3.991713685063063e-05, + "loss": 0.4813, + "step": 265 + }, + { + "epoch": 0.06, + "grad_norm": 1.0890863993360724, + "learning_rate": 3.991583796872974e-05, + "loss": 0.5131, + "step": 266 + }, + { + "epoch": 0.06, + "grad_norm": 1.291958832256156, + "learning_rate": 3.991452900730116e-05, + "loss": 0.5547, + "step": 267 + }, + { + "epoch": 0.06, + "grad_norm": 1.1784316698414166, + "learning_rate": 3.991320996700737e-05, + "loss": 0.4902, + "step": 268 + }, + { + "epoch": 0.06, + "grad_norm": 1.1724553006564273, + "learning_rate": 3.991188084851596e-05, + "loss": 0.4878, + "step": 269 + }, + { + "epoch": 0.06, + "grad_norm": 1.2264566558071297, + "learning_rate": 3.991054165249958e-05, + "loss": 0.5361, + "step": 270 + }, + { + "epoch": 0.06, + "grad_norm": 1.204194868916881, + "learning_rate": 3.990919237963602e-05, + "loss": 0.4693, + "step": 271 + }, + { + "epoch": 0.06, + "grad_norm": 1.165114880738373, + "learning_rate": 3.9907833030608153e-05, + "loss": 0.4728, + "step": 272 + }, + { + "epoch": 0.06, + "grad_norm": 1.2662323801183593, + "learning_rate": 3.990646360610395e-05, + "loss": 0.5497, + "step": 273 + }, + { + "epoch": 0.06, + "grad_norm": 1.3041430039539232, + "learning_rate": 3.9905084106816494e-05, + "loss": 0.5175, + "step": 274 + }, + { + "epoch": 0.06, + "grad_norm": 1.2265639578452223, + "learning_rate": 3.990369453344394e-05, + "loss": 0.5045, + "step": 275 + }, + { + "epoch": 0.06, + "grad_norm": 1.100467868486642, + "learning_rate": 3.9902294886689576e-05, + "loss": 0.5279, + "step": 276 + }, + { + "epoch": 0.06, + "grad_norm": 1.2000185011391165, + "learning_rate": 3.990088516726177e-05, + "loss": 0.508, + "step": 277 + }, + { + "epoch": 0.06, + "grad_norm": 1.2701732714039602, + "learning_rate": 3.9899465375873985e-05, + "loss": 0.5277, + "step": 278 + }, + { + "epoch": 0.06, + "grad_norm": 1.3908324783365231, + "learning_rate": 3.989803551324479e-05, + "loss": 0.6504, + "step": 279 + }, + { + "epoch": 0.06, + "grad_norm": 1.2523779661533918, + "learning_rate": 3.989659558009784e-05, + "loss": 0.5366, + "step": 280 + }, + { + "epoch": 0.06, + "grad_norm": 1.122515159453593, + "learning_rate": 3.98951455771619e-05, + "loss": 0.4976, + "step": 281 + }, + { + "epoch": 0.06, + "grad_norm": 1.2132206711933338, + "learning_rate": 3.989368550517083e-05, + "loss": 0.5127, + "step": 282 + }, + { + "epoch": 0.06, + "grad_norm": 1.0510171391498828, + "learning_rate": 3.989221536486357e-05, + "loss": 0.3852, + "step": 283 + }, + { + "epoch": 0.06, + "grad_norm": 1.084079497691504, + "learning_rate": 3.989073515698417e-05, + "loss": 0.4238, + "step": 284 + }, + { + "epoch": 0.06, + "grad_norm": 1.2574323944666181, + "learning_rate": 3.988924488228178e-05, + "loss": 0.4225, + "step": 285 + }, + { + "epoch": 0.06, + "grad_norm": 1.2767442702237093, + "learning_rate": 3.988774454151063e-05, + "loss": 0.4731, + "step": 286 + }, + { + "epoch": 0.06, + "grad_norm": 1.1875667553258227, + "learning_rate": 3.988623413543006e-05, + "loss": 0.4941, + "step": 287 + }, + { + "epoch": 0.06, + "grad_norm": 1.1722931179545868, + "learning_rate": 3.9884713664804485e-05, + "loss": 0.5455, + "step": 288 + }, + { + "epoch": 0.06, + "grad_norm": 1.1571276658690817, + "learning_rate": 3.9883183130403424e-05, + "loss": 0.5155, + "step": 289 + }, + { + "epoch": 0.06, + "grad_norm": 1.273868698414468, + "learning_rate": 3.98816425330015e-05, + "loss": 0.6212, + "step": 290 + }, + { + "epoch": 0.06, + "grad_norm": 1.1349200407927071, + "learning_rate": 3.9880091873378416e-05, + "loss": 0.5138, + "step": 291 + }, + { + "epoch": 0.06, + "grad_norm": 1.2036075862726578, + "learning_rate": 3.9878531152318966e-05, + "loss": 0.4458, + "step": 292 + }, + { + "epoch": 0.06, + "grad_norm": 1.2523328946048258, + "learning_rate": 3.987696037061304e-05, + "loss": 0.5277, + "step": 293 + }, + { + "epoch": 0.06, + "grad_norm": 1.0786026711932888, + "learning_rate": 3.9875379529055624e-05, + "loss": 0.4084, + "step": 294 + }, + { + "epoch": 0.06, + "grad_norm": 1.3213455402401926, + "learning_rate": 3.987378862844679e-05, + "loss": 0.5018, + "step": 295 + }, + { + "epoch": 0.07, + "grad_norm": 1.3459430807302186, + "learning_rate": 3.987218766959171e-05, + "loss": 0.5742, + "step": 296 + }, + { + "epoch": 0.07, + "grad_norm": 1.16850411383607, + "learning_rate": 3.987057665330063e-05, + "loss": 0.5138, + "step": 297 + }, + { + "epoch": 0.07, + "grad_norm": 1.1359638447179328, + "learning_rate": 3.986895558038889e-05, + "loss": 0.4713, + "step": 298 + }, + { + "epoch": 0.07, + "grad_norm": 1.2118807164844807, + "learning_rate": 3.986732445167694e-05, + "loss": 0.4895, + "step": 299 + }, + { + "epoch": 0.07, + "grad_norm": 1.1783109806499985, + "learning_rate": 3.9865683267990295e-05, + "loss": 0.4614, + "step": 300 + }, + { + "epoch": 0.07, + "grad_norm": 1.2716055957300278, + "learning_rate": 3.986403203015957e-05, + "loss": 0.4847, + "step": 301 + }, + { + "epoch": 0.07, + "grad_norm": 1.293195952307599, + "learning_rate": 3.9862370739020455e-05, + "loss": 0.5353, + "step": 302 + }, + { + "epoch": 0.07, + "grad_norm": 1.2657770607105185, + "learning_rate": 3.9860699395413764e-05, + "loss": 0.499, + "step": 303 + }, + { + "epoch": 0.07, + "grad_norm": 1.0391015640602386, + "learning_rate": 3.985901800018535e-05, + "loss": 0.4169, + "step": 304 + }, + { + "epoch": 0.07, + "grad_norm": 1.215289852748404, + "learning_rate": 3.98573265541862e-05, + "loss": 0.5027, + "step": 305 + }, + { + "epoch": 0.07, + "grad_norm": 1.1711716130426661, + "learning_rate": 3.985562505827235e-05, + "loss": 0.4303, + "step": 306 + }, + { + "epoch": 0.07, + "grad_norm": 1.1602282774687112, + "learning_rate": 3.985391351330494e-05, + "loss": 0.4306, + "step": 307 + }, + { + "epoch": 0.07, + "grad_norm": 1.0702587587550187, + "learning_rate": 3.985219192015019e-05, + "loss": 0.4604, + "step": 308 + }, + { + "epoch": 0.07, + "grad_norm": 1.163744991959594, + "learning_rate": 3.985046027967943e-05, + "loss": 0.5091, + "step": 309 + }, + { + "epoch": 0.07, + "grad_norm": 1.0533315348509231, + "learning_rate": 3.984871859276902e-05, + "loss": 0.4756, + "step": 310 + }, + { + "epoch": 0.07, + "grad_norm": 1.164256649339924, + "learning_rate": 3.984696686030046e-05, + "loss": 0.4769, + "step": 311 + }, + { + "epoch": 0.07, + "grad_norm": 1.099305458146492, + "learning_rate": 3.9845205083160315e-05, + "loss": 0.441, + "step": 312 + }, + { + "epoch": 0.07, + "grad_norm": 1.0560989542749786, + "learning_rate": 3.984343326224022e-05, + "loss": 0.4744, + "step": 313 + }, + { + "epoch": 0.07, + "grad_norm": 1.2728584236684755, + "learning_rate": 3.9841651398436907e-05, + "loss": 0.5036, + "step": 314 + }, + { + "epoch": 0.07, + "grad_norm": 1.092996828428869, + "learning_rate": 3.983985949265219e-05, + "loss": 0.4692, + "step": 315 + }, + { + "epoch": 0.07, + "grad_norm": 1.0802089067026577, + "learning_rate": 3.983805754579297e-05, + "loss": 0.3904, + "step": 316 + }, + { + "epoch": 0.07, + "grad_norm": 1.1006761055323477, + "learning_rate": 3.98362455587712e-05, + "loss": 0.4057, + "step": 317 + }, + { + "epoch": 0.07, + "grad_norm": 1.109374892543734, + "learning_rate": 3.9834423532503975e-05, + "loss": 0.4865, + "step": 318 + }, + { + "epoch": 0.07, + "grad_norm": 1.0895982144411165, + "learning_rate": 3.9832591467913405e-05, + "loss": 0.4018, + "step": 319 + }, + { + "epoch": 0.07, + "grad_norm": 1.1037257711790474, + "learning_rate": 3.9830749365926716e-05, + "loss": 0.4063, + "step": 320 + }, + { + "epoch": 0.07, + "grad_norm": 1.137745763617826, + "learning_rate": 3.982889722747621e-05, + "loss": 0.4296, + "step": 321 + }, + { + "epoch": 0.07, + "grad_norm": 1.0638032101547767, + "learning_rate": 3.9827035053499264e-05, + "loss": 0.4478, + "step": 322 + }, + { + "epoch": 0.07, + "grad_norm": 1.1201766246933287, + "learning_rate": 3.982516284493834e-05, + "loss": 0.4129, + "step": 323 + }, + { + "epoch": 0.07, + "grad_norm": 1.0943915529159343, + "learning_rate": 3.982328060274097e-05, + "loss": 0.4141, + "step": 324 + }, + { + "epoch": 0.07, + "grad_norm": 1.0233495543454487, + "learning_rate": 3.982138832785976e-05, + "loss": 0.4061, + "step": 325 + }, + { + "epoch": 0.07, + "grad_norm": 1.2608735646201592, + "learning_rate": 3.981948602125242e-05, + "loss": 0.4925, + "step": 326 + }, + { + "epoch": 0.07, + "grad_norm": 1.1493183831878495, + "learning_rate": 3.98175736838817e-05, + "loss": 0.465, + "step": 327 + }, + { + "epoch": 0.07, + "grad_norm": 1.1636103321511055, + "learning_rate": 3.981565131671546e-05, + "loss": 0.5018, + "step": 328 + }, + { + "epoch": 0.07, + "grad_norm": 1.0613981592394197, + "learning_rate": 3.981371892072661e-05, + "loss": 0.4161, + "step": 329 + }, + { + "epoch": 0.07, + "grad_norm": 1.1859586148399008, + "learning_rate": 3.981177649689317e-05, + "loss": 0.4475, + "step": 330 + }, + { + "epoch": 0.07, + "grad_norm": 1.0116317888392918, + "learning_rate": 3.980982404619819e-05, + "loss": 0.42, + "step": 331 + }, + { + "epoch": 0.07, + "grad_norm": 1.1123049018464455, + "learning_rate": 3.9807861569629815e-05, + "loss": 0.4625, + "step": 332 + }, + { + "epoch": 0.07, + "grad_norm": 1.0650799226994028, + "learning_rate": 3.980588906818129e-05, + "loss": 0.3434, + "step": 333 + }, + { + "epoch": 0.07, + "grad_norm": 1.1856989001065437, + "learning_rate": 3.980390654285088e-05, + "loss": 0.4546, + "step": 334 + }, + { + "epoch": 0.07, + "grad_norm": 1.1165114271140357, + "learning_rate": 3.980191399464198e-05, + "loss": 0.4023, + "step": 335 + }, + { + "epoch": 0.07, + "grad_norm": 1.1062497456868197, + "learning_rate": 3.979991142456302e-05, + "loss": 0.3757, + "step": 336 + }, + { + "epoch": 0.07, + "grad_norm": 1.1251817132637552, + "learning_rate": 3.9797898833627514e-05, + "loss": 0.4204, + "step": 337 + }, + { + "epoch": 0.07, + "grad_norm": 1.076857499009745, + "learning_rate": 3.979587622285404e-05, + "loss": 0.3577, + "step": 338 + }, + { + "epoch": 0.07, + "grad_norm": 1.149526689402929, + "learning_rate": 3.979384359326626e-05, + "loss": 0.5064, + "step": 339 + }, + { + "epoch": 0.07, + "grad_norm": 1.0409359785175913, + "learning_rate": 3.97918009458929e-05, + "loss": 0.3905, + "step": 340 + }, + { + "epoch": 0.07, + "grad_norm": 1.0025633502264812, + "learning_rate": 3.9789748281767754e-05, + "loss": 0.3792, + "step": 341 + }, + { + "epoch": 0.08, + "grad_norm": 1.1626214445933571, + "learning_rate": 3.978768560192969e-05, + "loss": 0.4412, + "step": 342 + }, + { + "epoch": 0.08, + "grad_norm": 1.2902301008688999, + "learning_rate": 3.978561290742265e-05, + "loss": 0.4021, + "step": 343 + }, + { + "epoch": 0.08, + "grad_norm": 1.1143634555494666, + "learning_rate": 3.978353019929562e-05, + "loss": 0.4048, + "step": 344 + }, + { + "epoch": 0.08, + "grad_norm": 1.195219148468686, + "learning_rate": 3.978143747860269e-05, + "loss": 0.4526, + "step": 345 + }, + { + "epoch": 0.08, + "grad_norm": 1.0725898738809163, + "learning_rate": 3.977933474640298e-05, + "loss": 0.4468, + "step": 346 + }, + { + "epoch": 0.08, + "grad_norm": 1.1025137615426255, + "learning_rate": 3.9777222003760714e-05, + "loss": 0.4239, + "step": 347 + }, + { + "epoch": 0.08, + "grad_norm": 1.0611781143806014, + "learning_rate": 3.977509925174515e-05, + "loss": 0.4622, + "step": 348 + }, + { + "epoch": 0.08, + "grad_norm": 1.1455237866366754, + "learning_rate": 3.977296649143064e-05, + "loss": 0.4485, + "step": 349 + }, + { + "epoch": 0.08, + "grad_norm": 1.0029408603777608, + "learning_rate": 3.9770823723896574e-05, + "loss": 0.3871, + "step": 350 + }, + { + "epoch": 0.08, + "grad_norm": 1.0213749026395467, + "learning_rate": 3.976867095022742e-05, + "loss": 0.4471, + "step": 351 + }, + { + "epoch": 0.08, + "grad_norm": 1.148935430626447, + "learning_rate": 3.9766508171512715e-05, + "loss": 0.4755, + "step": 352 + }, + { + "epoch": 0.08, + "grad_norm": 1.0381439853059833, + "learning_rate": 3.976433538884706e-05, + "loss": 0.376, + "step": 353 + }, + { + "epoch": 0.08, + "grad_norm": 1.2431017791896115, + "learning_rate": 3.97621526033301e-05, + "loss": 0.478, + "step": 354 + }, + { + "epoch": 0.08, + "grad_norm": 1.1850241399471089, + "learning_rate": 3.9759959816066575e-05, + "loss": 0.4424, + "step": 355 + }, + { + "epoch": 0.08, + "grad_norm": 1.0482157192652117, + "learning_rate": 3.975775702816625e-05, + "loss": 0.3953, + "step": 356 + }, + { + "epoch": 0.08, + "grad_norm": 1.173974001130304, + "learning_rate": 3.975554424074397e-05, + "loss": 0.4347, + "step": 357 + }, + { + "epoch": 0.08, + "grad_norm": 1.1813716361579474, + "learning_rate": 3.975332145491965e-05, + "loss": 0.4359, + "step": 358 + }, + { + "epoch": 0.08, + "grad_norm": 1.1235249173635753, + "learning_rate": 3.975108867181826e-05, + "loss": 0.3873, + "step": 359 + }, + { + "epoch": 0.08, + "grad_norm": 1.1060145348426775, + "learning_rate": 3.974884589256981e-05, + "loss": 0.3915, + "step": 360 + }, + { + "epoch": 0.08, + "grad_norm": 1.2229112353142, + "learning_rate": 3.97465931183094e-05, + "loss": 0.4774, + "step": 361 + }, + { + "epoch": 0.08, + "grad_norm": 1.0023051158568568, + "learning_rate": 3.9744330350177156e-05, + "loss": 0.3639, + "step": 362 + }, + { + "epoch": 0.08, + "grad_norm": 1.1390226422572052, + "learning_rate": 3.974205758931828e-05, + "loss": 0.3841, + "step": 363 + }, + { + "epoch": 0.08, + "grad_norm": 1.079691315794632, + "learning_rate": 3.973977483688305e-05, + "loss": 0.3615, + "step": 364 + }, + { + "epoch": 0.08, + "grad_norm": 1.1729070569328504, + "learning_rate": 3.9737482094026764e-05, + "loss": 0.3758, + "step": 365 + }, + { + "epoch": 0.08, + "grad_norm": 1.250347184603744, + "learning_rate": 3.9735179361909803e-05, + "loss": 0.4381, + "step": 366 + }, + { + "epoch": 0.08, + "grad_norm": 1.0533736911619336, + "learning_rate": 3.9732866641697586e-05, + "loss": 0.3279, + "step": 367 + }, + { + "epoch": 0.08, + "grad_norm": 1.454656798533515, + "learning_rate": 3.9730543934560595e-05, + "loss": 0.3942, + "step": 368 + }, + { + "epoch": 0.08, + "grad_norm": 1.079306576961687, + "learning_rate": 3.9728211241674363e-05, + "loss": 0.3655, + "step": 369 + }, + { + "epoch": 0.08, + "grad_norm": 1.1195722616367776, + "learning_rate": 3.972586856421949e-05, + "loss": 0.4542, + "step": 370 + }, + { + "epoch": 0.08, + "grad_norm": 1.1305671274736946, + "learning_rate": 3.9723515903381625e-05, + "loss": 0.4273, + "step": 371 + }, + { + "epoch": 0.08, + "grad_norm": 1.050735189002656, + "learning_rate": 3.9721153260351446e-05, + "loss": 0.3854, + "step": 372 + }, + { + "epoch": 0.08, + "grad_norm": 1.0818517653131625, + "learning_rate": 3.971878063632471e-05, + "loss": 0.395, + "step": 373 + }, + { + "epoch": 0.08, + "grad_norm": 1.0001708123235014, + "learning_rate": 3.971639803250221e-05, + "loss": 0.4188, + "step": 374 + }, + { + "epoch": 0.08, + "grad_norm": 1.0316954142109276, + "learning_rate": 3.9714005450089815e-05, + "loss": 0.3553, + "step": 375 + }, + { + "epoch": 0.08, + "grad_norm": 1.0061576919391308, + "learning_rate": 3.971160289029841e-05, + "loss": 0.3474, + "step": 376 + }, + { + "epoch": 0.08, + "grad_norm": 1.07023146942143, + "learning_rate": 3.9709190354343936e-05, + "loss": 0.382, + "step": 377 + }, + { + "epoch": 0.08, + "grad_norm": 0.9957014321050548, + "learning_rate": 3.9706767843447417e-05, + "loss": 0.3575, + "step": 378 + }, + { + "epoch": 0.08, + "grad_norm": 0.9866634592012732, + "learning_rate": 3.970433535883489e-05, + "loss": 0.3528, + "step": 379 + }, + { + "epoch": 0.08, + "grad_norm": 1.2674516282642205, + "learning_rate": 3.970189290173744e-05, + "loss": 0.4578, + "step": 380 + }, + { + "epoch": 0.08, + "grad_norm": 1.0813481346976104, + "learning_rate": 3.969944047339122e-05, + "loss": 0.3588, + "step": 381 + }, + { + "epoch": 0.08, + "grad_norm": 1.035604185679093, + "learning_rate": 3.969697807503742e-05, + "loss": 0.3632, + "step": 382 + }, + { + "epoch": 0.08, + "grad_norm": 1.0593263688079588, + "learning_rate": 3.969450570792227e-05, + "loss": 0.3982, + "step": 383 + }, + { + "epoch": 0.08, + "grad_norm": 1.1305488858558645, + "learning_rate": 3.969202337329705e-05, + "loss": 0.4326, + "step": 384 + }, + { + "epoch": 0.08, + "grad_norm": 0.9800345078543079, + "learning_rate": 3.968953107241809e-05, + "loss": 0.3461, + "step": 385 + }, + { + "epoch": 0.08, + "grad_norm": 1.1507518570325785, + "learning_rate": 3.9687028806546756e-05, + "loss": 0.4266, + "step": 386 + }, + { + "epoch": 0.08, + "grad_norm": 1.160973771402753, + "learning_rate": 3.968451657694946e-05, + "loss": 0.3961, + "step": 387 + }, + { + "epoch": 0.09, + "grad_norm": 1.0522505120629397, + "learning_rate": 3.9681994384897654e-05, + "loss": 0.3832, + "step": 388 + }, + { + "epoch": 0.09, + "grad_norm": 1.0262066600350739, + "learning_rate": 3.967946223166784e-05, + "loss": 0.3564, + "step": 389 + }, + { + "epoch": 0.09, + "grad_norm": 1.1021390447735178, + "learning_rate": 3.967692011854155e-05, + "loss": 0.3057, + "step": 390 + }, + { + "epoch": 0.09, + "grad_norm": 1.1459370595048486, + "learning_rate": 3.967436804680537e-05, + "loss": 0.406, + "step": 391 + }, + { + "epoch": 0.09, + "grad_norm": 1.1801973971437896, + "learning_rate": 3.9671806017750915e-05, + "loss": 0.4478, + "step": 392 + }, + { + "epoch": 0.09, + "grad_norm": 1.142012235916646, + "learning_rate": 3.966923403267485e-05, + "loss": 0.3831, + "step": 393 + }, + { + "epoch": 0.09, + "grad_norm": 1.0224250852753491, + "learning_rate": 3.9666652092878856e-05, + "loss": 0.3376, + "step": 394 + }, + { + "epoch": 0.09, + "grad_norm": 1.0248021935123073, + "learning_rate": 3.966406019966968e-05, + "loss": 0.3892, + "step": 395 + }, + { + "epoch": 0.09, + "grad_norm": 1.0170305356568081, + "learning_rate": 3.9661458354359105e-05, + "loss": 0.3874, + "step": 396 + }, + { + "epoch": 0.09, + "grad_norm": 1.1045133957989504, + "learning_rate": 3.9658846558263925e-05, + "loss": 0.4759, + "step": 397 + }, + { + "epoch": 0.09, + "grad_norm": 1.0138047559369607, + "learning_rate": 3.965622481270599e-05, + "loss": 0.3802, + "step": 398 + }, + { + "epoch": 0.09, + "grad_norm": 0.9892519680502024, + "learning_rate": 3.9653593119012185e-05, + "loss": 0.3557, + "step": 399 + }, + { + "epoch": 0.09, + "grad_norm": 1.0388442075496735, + "learning_rate": 3.965095147851442e-05, + "loss": 0.4092, + "step": 400 + }, + { + "epoch": 0.09, + "grad_norm": 1.0368870498665264, + "learning_rate": 3.9648299892549654e-05, + "loss": 0.3768, + "step": 401 + }, + { + "epoch": 0.09, + "grad_norm": 1.0832108220495158, + "learning_rate": 3.964563836245987e-05, + "loss": 0.359, + "step": 402 + }, + { + "epoch": 0.09, + "grad_norm": 1.03824313562385, + "learning_rate": 3.964296688959208e-05, + "loss": 0.4537, + "step": 403 + }, + { + "epoch": 0.09, + "grad_norm": 1.054298611752923, + "learning_rate": 3.964028547529832e-05, + "loss": 0.3528, + "step": 404 + }, + { + "epoch": 0.09, + "grad_norm": 0.9577731934987448, + "learning_rate": 3.9637594120935697e-05, + "loss": 0.366, + "step": 405 + }, + { + "epoch": 0.09, + "grad_norm": 0.9435210790316217, + "learning_rate": 3.9634892827866306e-05, + "loss": 0.3064, + "step": 406 + }, + { + "epoch": 0.09, + "grad_norm": 0.9685154907886202, + "learning_rate": 3.9632181597457296e-05, + "loss": 0.3803, + "step": 407 + }, + { + "epoch": 0.09, + "grad_norm": 1.1840539421603884, + "learning_rate": 3.9629460431080825e-05, + "loss": 0.4352, + "step": 408 + }, + { + "epoch": 0.09, + "grad_norm": 1.0350869209537155, + "learning_rate": 3.96267293301141e-05, + "loss": 0.3756, + "step": 409 + }, + { + "epoch": 0.09, + "grad_norm": 0.9668692821784871, + "learning_rate": 3.962398829593935e-05, + "loss": 0.3315, + "step": 410 + }, + { + "epoch": 0.09, + "grad_norm": 1.0181116034631583, + "learning_rate": 3.962123732994383e-05, + "loss": 0.3729, + "step": 411 + }, + { + "epoch": 0.09, + "grad_norm": 1.0769434549688277, + "learning_rate": 3.961847643351981e-05, + "loss": 0.344, + "step": 412 + }, + { + "epoch": 0.09, + "grad_norm": 1.0925582933041047, + "learning_rate": 3.961570560806461e-05, + "loss": 0.4162, + "step": 413 + }, + { + "epoch": 0.09, + "grad_norm": 1.102557118811125, + "learning_rate": 3.9612924854980556e-05, + "loss": 0.4166, + "step": 414 + }, + { + "epoch": 0.09, + "grad_norm": 1.1286347270584014, + "learning_rate": 3.9610134175675e-05, + "loss": 0.3693, + "step": 415 + }, + { + "epoch": 0.09, + "grad_norm": 0.9415423662860831, + "learning_rate": 3.960733357156033e-05, + "loss": 0.3004, + "step": 416 + }, + { + "epoch": 0.09, + "grad_norm": 1.0366728990271383, + "learning_rate": 3.960452304405394e-05, + "loss": 0.3846, + "step": 417 + }, + { + "epoch": 0.09, + "grad_norm": 0.9635313119417835, + "learning_rate": 3.960170259457826e-05, + "loss": 0.2909, + "step": 418 + }, + { + "epoch": 0.09, + "grad_norm": 0.9654734792005484, + "learning_rate": 3.959887222456075e-05, + "loss": 0.3031, + "step": 419 + }, + { + "epoch": 0.09, + "grad_norm": 0.95233152536584, + "learning_rate": 3.959603193543385e-05, + "loss": 0.3166, + "step": 420 + }, + { + "epoch": 0.09, + "grad_norm": 1.0463975415402151, + "learning_rate": 3.959318172863506e-05, + "loss": 0.3488, + "step": 421 + }, + { + "epoch": 0.09, + "grad_norm": 1.1291429212168487, + "learning_rate": 3.95903216056069e-05, + "loss": 0.4022, + "step": 422 + }, + { + "epoch": 0.09, + "grad_norm": 1.0365884913215422, + "learning_rate": 3.958745156779688e-05, + "loss": 0.347, + "step": 423 + }, + { + "epoch": 0.09, + "grad_norm": 1.0666907074325134, + "learning_rate": 3.9584571616657544e-05, + "loss": 0.3873, + "step": 424 + }, + { + "epoch": 0.09, + "grad_norm": 0.9862395898344498, + "learning_rate": 3.958168175364646e-05, + "loss": 0.3259, + "step": 425 + }, + { + "epoch": 0.09, + "grad_norm": 0.8618518508668179, + "learning_rate": 3.957878198022621e-05, + "loss": 0.2429, + "step": 426 + }, + { + "epoch": 0.09, + "grad_norm": 0.9302577665948445, + "learning_rate": 3.957587229786437e-05, + "loss": 0.3339, + "step": 427 + }, + { + "epoch": 0.09, + "grad_norm": 0.9680432848986547, + "learning_rate": 3.9572952708033564e-05, + "loss": 0.3006, + "step": 428 + }, + { + "epoch": 0.09, + "grad_norm": 1.0056531379893139, + "learning_rate": 3.9570023212211405e-05, + "loss": 0.3533, + "step": 429 + }, + { + "epoch": 0.09, + "grad_norm": 1.042314188290271, + "learning_rate": 3.956708381188054e-05, + "loss": 0.3704, + "step": 430 + }, + { + "epoch": 0.09, + "grad_norm": 0.9955369477688994, + "learning_rate": 3.95641345085286e-05, + "loss": 0.3224, + "step": 431 + }, + { + "epoch": 0.09, + "grad_norm": 1.1437350162712814, + "learning_rate": 3.956117530364826e-05, + "loss": 0.3429, + "step": 432 + }, + { + "epoch": 0.1, + "grad_norm": 1.1434482489093942, + "learning_rate": 3.955820619873719e-05, + "loss": 0.3979, + "step": 433 + }, + { + "epoch": 0.1, + "grad_norm": 1.0885754934405945, + "learning_rate": 3.955522719529807e-05, + "loss": 0.3778, + "step": 434 + }, + { + "epoch": 0.1, + "grad_norm": 1.006883414746575, + "learning_rate": 3.9552238294838584e-05, + "loss": 0.3259, + "step": 435 + }, + { + "epoch": 0.1, + "grad_norm": 0.9424097245084614, + "learning_rate": 3.954923949887144e-05, + "loss": 0.2967, + "step": 436 + }, + { + "epoch": 0.1, + "grad_norm": 0.879260419905861, + "learning_rate": 3.954623080891435e-05, + "loss": 0.2797, + "step": 437 + }, + { + "epoch": 0.1, + "grad_norm": 1.0029412763867633, + "learning_rate": 3.954321222649003e-05, + "loss": 0.3128, + "step": 438 + }, + { + "epoch": 0.1, + "grad_norm": 0.9197166521725636, + "learning_rate": 3.95401837531262e-05, + "loss": 0.281, + "step": 439 + }, + { + "epoch": 0.1, + "grad_norm": 1.1461508686625692, + "learning_rate": 3.953714539035558e-05, + "loss": 0.3273, + "step": 440 + }, + { + "epoch": 0.1, + "grad_norm": 1.0014668673416294, + "learning_rate": 3.9534097139715926e-05, + "loss": 0.3021, + "step": 441 + }, + { + "epoch": 0.1, + "grad_norm": 1.0224160491504546, + "learning_rate": 3.9531039002749955e-05, + "loss": 0.331, + "step": 442 + }, + { + "epoch": 0.1, + "grad_norm": 1.0680427709794582, + "learning_rate": 3.952797098100543e-05, + "loss": 0.3789, + "step": 443 + }, + { + "epoch": 0.1, + "grad_norm": 0.9893873455653348, + "learning_rate": 3.952489307603507e-05, + "loss": 0.309, + "step": 444 + }, + { + "epoch": 0.1, + "grad_norm": 1.0181761172525154, + "learning_rate": 3.9521805289396645e-05, + "loss": 0.3104, + "step": 445 + }, + { + "epoch": 0.1, + "grad_norm": 1.1361743895130338, + "learning_rate": 3.951870762265288e-05, + "loss": 0.3481, + "step": 446 + }, + { + "epoch": 0.1, + "grad_norm": 0.9401021448397183, + "learning_rate": 3.9515600077371545e-05, + "loss": 0.2785, + "step": 447 + }, + { + "epoch": 0.1, + "grad_norm": 1.0281058266194, + "learning_rate": 3.951248265512538e-05, + "loss": 0.3919, + "step": 448 + }, + { + "epoch": 0.1, + "grad_norm": 1.0209799810421567, + "learning_rate": 3.950935535749213e-05, + "loss": 0.3407, + "step": 449 + }, + { + "epoch": 0.1, + "grad_norm": 1.0680804960868895, + "learning_rate": 3.950621818605453e-05, + "loss": 0.3009, + "step": 450 + }, + { + "epoch": 0.1, + "grad_norm": 0.9809830673051324, + "learning_rate": 3.950307114240034e-05, + "loss": 0.2788, + "step": 451 + }, + { + "epoch": 0.1, + "grad_norm": 1.091927999903504, + "learning_rate": 3.9499914228122286e-05, + "loss": 0.3584, + "step": 452 + }, + { + "epoch": 0.1, + "grad_norm": 1.0496342294388752, + "learning_rate": 3.9496747444818105e-05, + "loss": 0.3215, + "step": 453 + }, + { + "epoch": 0.1, + "grad_norm": 0.9472906994204717, + "learning_rate": 3.9493570794090524e-05, + "loss": 0.2863, + "step": 454 + }, + { + "epoch": 0.1, + "grad_norm": 1.026604796399034, + "learning_rate": 3.9490384277547266e-05, + "loss": 0.3253, + "step": 455 + }, + { + "epoch": 0.1, + "grad_norm": 1.0059919251445888, + "learning_rate": 3.9487187896801054e-05, + "loss": 0.3646, + "step": 456 + }, + { + "epoch": 0.1, + "grad_norm": 1.1146466364962124, + "learning_rate": 3.9483981653469586e-05, + "loss": 0.3671, + "step": 457 + }, + { + "epoch": 0.1, + "grad_norm": 0.8839794649332503, + "learning_rate": 3.948076554917556e-05, + "loss": 0.2903, + "step": 458 + }, + { + "epoch": 0.1, + "grad_norm": 1.0456336903595724, + "learning_rate": 3.9477539585546676e-05, + "loss": 0.335, + "step": 459 + }, + { + "epoch": 0.1, + "grad_norm": 1.052518634948309, + "learning_rate": 3.9474303764215606e-05, + "loss": 0.345, + "step": 460 + }, + { + "epoch": 0.1, + "grad_norm": 1.0089797719510034, + "learning_rate": 3.9471058086820024e-05, + "loss": 0.3258, + "step": 461 + }, + { + "epoch": 0.1, + "grad_norm": 0.9364948924689621, + "learning_rate": 3.9467802555002584e-05, + "loss": 0.2942, + "step": 462 + }, + { + "epoch": 0.1, + "grad_norm": 1.027574984559381, + "learning_rate": 3.946453717041093e-05, + "loss": 0.3173, + "step": 463 + }, + { + "epoch": 0.1, + "grad_norm": 1.019409640137024, + "learning_rate": 3.94612619346977e-05, + "loss": 0.3705, + "step": 464 + }, + { + "epoch": 0.1, + "grad_norm": 0.9900787652596897, + "learning_rate": 3.94579768495205e-05, + "loss": 0.338, + "step": 465 + }, + { + "epoch": 0.1, + "grad_norm": 0.9350070869462811, + "learning_rate": 3.9454681916541936e-05, + "loss": 0.2733, + "step": 466 + }, + { + "epoch": 0.1, + "grad_norm": 1.005931903483432, + "learning_rate": 3.94513771374296e-05, + "loss": 0.2998, + "step": 467 + }, + { + "epoch": 0.1, + "grad_norm": 0.9176156521492399, + "learning_rate": 3.9448062513856056e-05, + "loss": 0.277, + "step": 468 + }, + { + "epoch": 0.1, + "grad_norm": 1.0603063040051155, + "learning_rate": 3.944473804749885e-05, + "loss": 0.3182, + "step": 469 + }, + { + "epoch": 0.1, + "grad_norm": 0.878378035447139, + "learning_rate": 3.944140374004052e-05, + "loss": 0.2254, + "step": 470 + }, + { + "epoch": 0.1, + "grad_norm": 1.0312910360783618, + "learning_rate": 3.9438059593168586e-05, + "loss": 0.3281, + "step": 471 + }, + { + "epoch": 0.1, + "grad_norm": 0.9213199884441453, + "learning_rate": 3.943470560857553e-05, + "loss": 0.2669, + "step": 472 + }, + { + "epoch": 0.1, + "grad_norm": 1.0196847617381182, + "learning_rate": 3.943134178795883e-05, + "loss": 0.2907, + "step": 473 + }, + { + "epoch": 0.1, + "grad_norm": 1.0983891331319187, + "learning_rate": 3.942796813302094e-05, + "loss": 0.3776, + "step": 474 + }, + { + "epoch": 0.1, + "grad_norm": 0.9573627754065492, + "learning_rate": 3.942458464546928e-05, + "loss": 0.3094, + "step": 475 + }, + { + "epoch": 0.1, + "grad_norm": 1.002048777872047, + "learning_rate": 3.942119132701625e-05, + "loss": 0.3293, + "step": 476 + }, + { + "epoch": 0.1, + "grad_norm": 0.9371921987692436, + "learning_rate": 3.9417788179379245e-05, + "loss": 0.3096, + "step": 477 + }, + { + "epoch": 0.1, + "grad_norm": 0.9228511103876207, + "learning_rate": 3.941437520428061e-05, + "loss": 0.2665, + "step": 478 + }, + { + "epoch": 0.11, + "grad_norm": 0.7805922600482893, + "learning_rate": 3.941095240344766e-05, + "loss": 0.2244, + "step": 479 + }, + { + "epoch": 0.11, + "grad_norm": 0.9002954594099436, + "learning_rate": 3.940751977861272e-05, + "loss": 0.341, + "step": 480 + }, + { + "epoch": 0.11, + "grad_norm": 0.8502514747667408, + "learning_rate": 3.9404077331513044e-05, + "loss": 0.3013, + "step": 481 + }, + { + "epoch": 0.11, + "grad_norm": 0.9184259777508003, + "learning_rate": 3.940062506389089e-05, + "loss": 0.2894, + "step": 482 + }, + { + "epoch": 0.11, + "grad_norm": 0.973871506001312, + "learning_rate": 3.9397162977493455e-05, + "loss": 0.2658, + "step": 483 + }, + { + "epoch": 0.11, + "grad_norm": 0.9796821204711598, + "learning_rate": 3.939369107407293e-05, + "loss": 0.3231, + "step": 484 + }, + { + "epoch": 0.11, + "grad_norm": 0.9145901127233058, + "learning_rate": 3.939020935538647e-05, + "loss": 0.2403, + "step": 485 + }, + { + "epoch": 0.11, + "grad_norm": 0.9041537672139351, + "learning_rate": 3.938671782319619e-05, + "loss": 0.253, + "step": 486 + }, + { + "epoch": 0.11, + "grad_norm": 0.9090908183292864, + "learning_rate": 3.938321647926918e-05, + "loss": 0.3126, + "step": 487 + }, + { + "epoch": 0.11, + "grad_norm": 0.939022195366846, + "learning_rate": 3.937970532537749e-05, + "loss": 0.2991, + "step": 488 + }, + { + "epoch": 0.11, + "grad_norm": 0.9406854553058501, + "learning_rate": 3.937618436329813e-05, + "loss": 0.2799, + "step": 489 + }, + { + "epoch": 0.11, + "grad_norm": 1.028239160942009, + "learning_rate": 3.937265359481309e-05, + "loss": 0.2919, + "step": 490 + }, + { + "epoch": 0.11, + "grad_norm": 0.9503870238436994, + "learning_rate": 3.936911302170931e-05, + "loss": 0.2791, + "step": 491 + }, + { + "epoch": 0.11, + "grad_norm": 1.0751408174236268, + "learning_rate": 3.936556264577869e-05, + "loss": 0.331, + "step": 492 + }, + { + "epoch": 0.11, + "grad_norm": 0.8973227509717273, + "learning_rate": 3.9362002468818105e-05, + "loss": 0.2709, + "step": 493 + }, + { + "epoch": 0.11, + "grad_norm": 0.9372297533259436, + "learning_rate": 3.935843249262939e-05, + "loss": 0.2964, + "step": 494 + }, + { + "epoch": 0.11, + "grad_norm": 0.8989041982170147, + "learning_rate": 3.9354852719019306e-05, + "loss": 0.2787, + "step": 495 + }, + { + "epoch": 0.11, + "grad_norm": 0.8896321903424332, + "learning_rate": 3.935126314979962e-05, + "loss": 0.3095, + "step": 496 + }, + { + "epoch": 0.11, + "grad_norm": 0.8254472878283721, + "learning_rate": 3.934766378678704e-05, + "loss": 0.2711, + "step": 497 + }, + { + "epoch": 0.11, + "grad_norm": 0.9562348458080392, + "learning_rate": 3.93440546318032e-05, + "loss": 0.2691, + "step": 498 + }, + { + "epoch": 0.11, + "grad_norm": 0.9090852124946016, + "learning_rate": 3.934043568667473e-05, + "loss": 0.2988, + "step": 499 + }, + { + "epoch": 0.11, + "grad_norm": 1.056004246219132, + "learning_rate": 3.933680695323321e-05, + "loss": 0.3244, + "step": 500 + }, + { + "epoch": 0.11, + "grad_norm": 0.927229316161898, + "learning_rate": 3.9333168433315144e-05, + "loss": 0.2705, + "step": 501 + }, + { + "epoch": 0.11, + "grad_norm": 0.9840266504477935, + "learning_rate": 3.932952012876203e-05, + "loss": 0.2898, + "step": 502 + }, + { + "epoch": 0.11, + "grad_norm": 0.7816009496163775, + "learning_rate": 3.9325862041420275e-05, + "loss": 0.2449, + "step": 503 + }, + { + "epoch": 0.11, + "grad_norm": 0.8185233814439627, + "learning_rate": 3.9322194173141284e-05, + "loss": 0.2487, + "step": 504 + }, + { + "epoch": 0.11, + "grad_norm": 0.9029351150313263, + "learning_rate": 3.931851652578137e-05, + "loss": 0.2685, + "step": 505 + }, + { + "epoch": 0.11, + "grad_norm": 0.827850296234817, + "learning_rate": 3.9314829101201814e-05, + "loss": 0.2831, + "step": 506 + }, + { + "epoch": 0.11, + "grad_norm": 0.7696270568383994, + "learning_rate": 3.9311131901268855e-05, + "loss": 0.2205, + "step": 507 + }, + { + "epoch": 0.11, + "grad_norm": 1.0038396909743488, + "learning_rate": 3.930742492785366e-05, + "loss": 0.3061, + "step": 508 + }, + { + "epoch": 0.11, + "grad_norm": 0.9429509639247784, + "learning_rate": 3.930370818283235e-05, + "loss": 0.2612, + "step": 509 + }, + { + "epoch": 0.11, + "grad_norm": 0.938932373628078, + "learning_rate": 3.9299981668085997e-05, + "loss": 0.288, + "step": 510 + }, + { + "epoch": 0.11, + "grad_norm": 0.9599716170406715, + "learning_rate": 3.929624538550061e-05, + "loss": 0.3057, + "step": 511 + }, + { + "epoch": 0.11, + "grad_norm": 0.9938410999118946, + "learning_rate": 3.929249933696715e-05, + "loss": 0.3066, + "step": 512 + }, + { + "epoch": 0.11, + "grad_norm": 0.9474069706180214, + "learning_rate": 3.92887435243815e-05, + "loss": 0.2881, + "step": 513 + }, + { + "epoch": 0.11, + "grad_norm": 0.9390446335906655, + "learning_rate": 3.928497794964452e-05, + "loss": 0.3123, + "step": 514 + }, + { + "epoch": 0.11, + "grad_norm": 0.8349768564591767, + "learning_rate": 3.928120261466198e-05, + "loss": 0.2915, + "step": 515 + }, + { + "epoch": 0.11, + "grad_norm": 0.7805330038112448, + "learning_rate": 3.92774175213446e-05, + "loss": 0.2132, + "step": 516 + }, + { + "epoch": 0.11, + "grad_norm": 0.9165333015064896, + "learning_rate": 3.927362267160804e-05, + "loss": 0.2749, + "step": 517 + }, + { + "epoch": 0.11, + "grad_norm": 0.9711977889904639, + "learning_rate": 3.92698180673729e-05, + "loss": 0.2505, + "step": 518 + }, + { + "epoch": 0.11, + "grad_norm": 1.0065510862094755, + "learning_rate": 3.9266003710564706e-05, + "loss": 0.3477, + "step": 519 + }, + { + "epoch": 0.11, + "grad_norm": 0.876275699363774, + "learning_rate": 3.9262179603113934e-05, + "loss": 0.2555, + "step": 520 + }, + { + "epoch": 0.11, + "grad_norm": 0.8644224193306435, + "learning_rate": 3.925834574695599e-05, + "loss": 0.2725, + "step": 521 + }, + { + "epoch": 0.11, + "grad_norm": 1.0584898049535252, + "learning_rate": 3.9254502144031204e-05, + "loss": 0.3504, + "step": 522 + }, + { + "epoch": 0.11, + "grad_norm": 0.869473785667698, + "learning_rate": 3.925064879628485e-05, + "loss": 0.2564, + "step": 523 + }, + { + "epoch": 0.12, + "grad_norm": 0.9656636079150069, + "learning_rate": 3.924678570566714e-05, + "loss": 0.3155, + "step": 524 + }, + { + "epoch": 0.12, + "grad_norm": 0.8794120519948292, + "learning_rate": 3.9242912874133186e-05, + "loss": 0.2562, + "step": 525 + }, + { + "epoch": 0.12, + "grad_norm": 0.9395312239131293, + "learning_rate": 3.9239030303643074e-05, + "loss": 0.2911, + "step": 526 + }, + { + "epoch": 0.12, + "grad_norm": 0.9857349991843363, + "learning_rate": 3.9235137996161786e-05, + "loss": 0.307, + "step": 527 + }, + { + "epoch": 0.12, + "grad_norm": 0.8903683074995272, + "learning_rate": 3.9231235953659244e-05, + "loss": 0.2542, + "step": 528 + }, + { + "epoch": 0.12, + "grad_norm": 0.9085176641041314, + "learning_rate": 3.9227324178110295e-05, + "loss": 0.2314, + "step": 529 + }, + { + "epoch": 0.12, + "grad_norm": 0.7851733685045973, + "learning_rate": 3.922340267149472e-05, + "loss": 0.2062, + "step": 530 + }, + { + "epoch": 0.12, + "grad_norm": 1.0448656298749703, + "learning_rate": 3.9219471435797205e-05, + "loss": 0.3111, + "step": 531 + }, + { + "epoch": 0.12, + "grad_norm": 0.9052352485708491, + "learning_rate": 3.921553047300739e-05, + "loss": 0.2788, + "step": 532 + }, + { + "epoch": 0.12, + "grad_norm": 0.9514362508272822, + "learning_rate": 3.9211579785119804e-05, + "loss": 0.3458, + "step": 533 + }, + { + "epoch": 0.12, + "grad_norm": 0.9124283488112793, + "learning_rate": 3.9207619374133917e-05, + "loss": 0.2927, + "step": 534 + }, + { + "epoch": 0.12, + "grad_norm": 0.7971053071157986, + "learning_rate": 3.920364924205412e-05, + "loss": 0.2527, + "step": 535 + }, + { + "epoch": 0.12, + "grad_norm": 0.8210814368924492, + "learning_rate": 3.9199669390889725e-05, + "loss": 0.2371, + "step": 536 + }, + { + "epoch": 0.12, + "grad_norm": 0.8722992155557622, + "learning_rate": 3.919567982265495e-05, + "loss": 0.2327, + "step": 537 + }, + { + "epoch": 0.12, + "grad_norm": 1.0838935882765854, + "learning_rate": 3.9191680539368956e-05, + "loss": 0.2947, + "step": 538 + }, + { + "epoch": 0.12, + "grad_norm": 0.9371555967328974, + "learning_rate": 3.9187671543055785e-05, + "loss": 0.2702, + "step": 539 + }, + { + "epoch": 0.12, + "grad_norm": 1.017192862829085, + "learning_rate": 3.918365283574443e-05, + "loss": 0.3159, + "step": 540 + }, + { + "epoch": 0.12, + "grad_norm": 0.9847778827974156, + "learning_rate": 3.9179624419468766e-05, + "loss": 0.3008, + "step": 541 + }, + { + "epoch": 0.12, + "grad_norm": 0.9191660114354011, + "learning_rate": 3.917558629626762e-05, + "loss": 0.2984, + "step": 542 + }, + { + "epoch": 0.12, + "grad_norm": 0.8005048321274897, + "learning_rate": 3.917153846818471e-05, + "loss": 0.258, + "step": 543 + }, + { + "epoch": 0.12, + "grad_norm": 0.8634314427972728, + "learning_rate": 3.916748093726864e-05, + "loss": 0.2936, + "step": 544 + }, + { + "epoch": 0.12, + "grad_norm": 0.8338975347471274, + "learning_rate": 3.9163413705572984e-05, + "loss": 0.2798, + "step": 545 + }, + { + "epoch": 0.12, + "grad_norm": 0.8007292939985748, + "learning_rate": 3.9159336775156165e-05, + "loss": 0.2621, + "step": 546 + }, + { + "epoch": 0.12, + "grad_norm": 0.7741079025905953, + "learning_rate": 3.9155250148081564e-05, + "loss": 0.2351, + "step": 547 + }, + { + "epoch": 0.12, + "grad_norm": 0.7709212940307969, + "learning_rate": 3.9151153826417436e-05, + "loss": 0.2426, + "step": 548 + }, + { + "epoch": 0.12, + "grad_norm": 1.028936449677963, + "learning_rate": 3.914704781223696e-05, + "loss": 0.3413, + "step": 549 + }, + { + "epoch": 0.12, + "grad_norm": 0.8438898606413698, + "learning_rate": 3.9142932107618214e-05, + "loss": 0.28, + "step": 550 + }, + { + "epoch": 0.12, + "grad_norm": 0.8175362974968846, + "learning_rate": 3.913880671464418e-05, + "loss": 0.2164, + "step": 551 + }, + { + "epoch": 0.12, + "grad_norm": 0.9028316681419125, + "learning_rate": 3.9134671635402745e-05, + "loss": 0.2527, + "step": 552 + }, + { + "epoch": 0.12, + "grad_norm": 0.9539675506215372, + "learning_rate": 3.91305268719867e-05, + "loss": 0.3183, + "step": 553 + }, + { + "epoch": 0.12, + "grad_norm": 0.9654179150393969, + "learning_rate": 3.912637242649373e-05, + "loss": 0.2934, + "step": 554 + }, + { + "epoch": 0.12, + "grad_norm": 0.997502725449283, + "learning_rate": 3.912220830102643e-05, + "loss": 0.3352, + "step": 555 + }, + { + "epoch": 0.12, + "grad_norm": 0.903468665235631, + "learning_rate": 3.911803449769228e-05, + "loss": 0.2729, + "step": 556 + }, + { + "epoch": 0.12, + "grad_norm": 0.866565928350931, + "learning_rate": 3.911385101860369e-05, + "loss": 0.2233, + "step": 557 + }, + { + "epoch": 0.12, + "grad_norm": 0.940595000286064, + "learning_rate": 3.9109657865877924e-05, + "loss": 0.282, + "step": 558 + }, + { + "epoch": 0.12, + "grad_norm": 0.9123056270044905, + "learning_rate": 3.910545504163716e-05, + "loss": 0.2633, + "step": 559 + }, + { + "epoch": 0.12, + "grad_norm": 0.8137460104380139, + "learning_rate": 3.9101242548008496e-05, + "loss": 0.2456, + "step": 560 + }, + { + "epoch": 0.12, + "grad_norm": 0.9614642800546382, + "learning_rate": 3.9097020387123876e-05, + "loss": 0.2924, + "step": 561 + }, + { + "epoch": 0.12, + "grad_norm": 0.816181425687044, + "learning_rate": 3.9092788561120174e-05, + "loss": 0.2334, + "step": 562 + }, + { + "epoch": 0.12, + "grad_norm": 0.8107074624429237, + "learning_rate": 3.9088547072139145e-05, + "loss": 0.2132, + "step": 563 + }, + { + "epoch": 0.12, + "grad_norm": 1.000443360272963, + "learning_rate": 3.9084295922327414e-05, + "loss": 0.2928, + "step": 564 + }, + { + "epoch": 0.12, + "grad_norm": 0.8417879179369573, + "learning_rate": 3.908003511383654e-05, + "loss": 0.2363, + "step": 565 + }, + { + "epoch": 0.12, + "grad_norm": 0.8597390530824697, + "learning_rate": 3.907576464882294e-05, + "loss": 0.2598, + "step": 566 + }, + { + "epoch": 0.12, + "grad_norm": 0.8753681089747455, + "learning_rate": 3.90714845294479e-05, + "loss": 0.2594, + "step": 567 + }, + { + "epoch": 0.12, + "grad_norm": 0.8363784268898656, + "learning_rate": 3.9067194757877635e-05, + "loss": 0.2449, + "step": 568 + }, + { + "epoch": 0.12, + "grad_norm": 0.7827517000446115, + "learning_rate": 3.906289533628322e-05, + "loss": 0.2065, + "step": 569 + }, + { + "epoch": 0.13, + "grad_norm": 0.7850525441169758, + "learning_rate": 3.9058586266840614e-05, + "loss": 0.2414, + "step": 570 + }, + { + "epoch": 0.13, + "grad_norm": 0.7769308862816632, + "learning_rate": 3.905426755173068e-05, + "loss": 0.2141, + "step": 571 + }, + { + "epoch": 0.13, + "grad_norm": 0.8177360400387669, + "learning_rate": 3.904993919313912e-05, + "loss": 0.247, + "step": 572 + }, + { + "epoch": 0.13, + "grad_norm": 0.8041845064375056, + "learning_rate": 3.9045601193256564e-05, + "loss": 0.2307, + "step": 573 + }, + { + "epoch": 0.13, + "grad_norm": 0.9279095293640436, + "learning_rate": 3.9041253554278486e-05, + "loss": 0.2471, + "step": 574 + }, + { + "epoch": 0.13, + "grad_norm": 0.9726708652363452, + "learning_rate": 3.9036896278405264e-05, + "loss": 0.3427, + "step": 575 + }, + { + "epoch": 0.13, + "grad_norm": 0.9646956604271677, + "learning_rate": 3.9032529367842145e-05, + "loss": 0.2814, + "step": 576 + }, + { + "epoch": 0.13, + "grad_norm": 0.893512462181103, + "learning_rate": 3.902815282479923e-05, + "loss": 0.2492, + "step": 577 + }, + { + "epoch": 0.13, + "grad_norm": 0.7250958066974518, + "learning_rate": 3.902376665149153e-05, + "loss": 0.2237, + "step": 578 + }, + { + "epoch": 0.13, + "grad_norm": 0.9281262022068445, + "learning_rate": 3.9019370850138915e-05, + "loss": 0.3319, + "step": 579 + }, + { + "epoch": 0.13, + "grad_norm": 0.9334790785845333, + "learning_rate": 3.9014965422966115e-05, + "loss": 0.2829, + "step": 580 + }, + { + "epoch": 0.13, + "grad_norm": 0.718761526927567, + "learning_rate": 3.9010550372202756e-05, + "loss": 0.1868, + "step": 581 + }, + { + "epoch": 0.13, + "grad_norm": 0.860903057648702, + "learning_rate": 3.900612570008331e-05, + "loss": 0.2548, + "step": 582 + }, + { + "epoch": 0.13, + "grad_norm": 0.8865750234134181, + "learning_rate": 3.900169140884715e-05, + "loss": 0.2613, + "step": 583 + }, + { + "epoch": 0.13, + "grad_norm": 0.8633144251460134, + "learning_rate": 3.899724750073848e-05, + "loss": 0.2258, + "step": 584 + }, + { + "epoch": 0.13, + "grad_norm": 0.7977090846094947, + "learning_rate": 3.899279397800639e-05, + "loss": 0.2384, + "step": 585 + }, + { + "epoch": 0.13, + "grad_norm": 0.8879218656965611, + "learning_rate": 3.8988330842904844e-05, + "loss": 0.2288, + "step": 586 + }, + { + "epoch": 0.13, + "grad_norm": 0.9116832056120259, + "learning_rate": 3.8983858097692656e-05, + "loss": 0.2735, + "step": 587 + }, + { + "epoch": 0.13, + "grad_norm": 0.9106254187002378, + "learning_rate": 3.8979375744633515e-05, + "loss": 0.3284, + "step": 588 + }, + { + "epoch": 0.13, + "grad_norm": 0.7889815185084196, + "learning_rate": 3.897488378599596e-05, + "loss": 0.2806, + "step": 589 + }, + { + "epoch": 0.13, + "grad_norm": 0.7409425835211813, + "learning_rate": 3.8970382224053414e-05, + "loss": 0.2252, + "step": 590 + }, + { + "epoch": 0.13, + "grad_norm": 0.8416200881445274, + "learning_rate": 3.8965871061084126e-05, + "loss": 0.2315, + "step": 591 + }, + { + "epoch": 0.13, + "grad_norm": 0.8657765637959218, + "learning_rate": 3.896135029937123e-05, + "loss": 0.2482, + "step": 592 + }, + { + "epoch": 0.13, + "grad_norm": 0.9129124767347182, + "learning_rate": 3.895681994120272e-05, + "loss": 0.2974, + "step": 593 + }, + { + "epoch": 0.13, + "grad_norm": 0.8568144364938592, + "learning_rate": 3.8952279988871425e-05, + "loss": 0.2579, + "step": 594 + }, + { + "epoch": 0.13, + "grad_norm": 0.8096790861857919, + "learning_rate": 3.894773044467505e-05, + "loss": 0.2494, + "step": 595 + }, + { + "epoch": 0.13, + "grad_norm": 0.9138980293654226, + "learning_rate": 3.8943171310916146e-05, + "loss": 0.2804, + "step": 596 + }, + { + "epoch": 0.13, + "grad_norm": 0.7697566907591287, + "learning_rate": 3.893860258990212e-05, + "loss": 0.2106, + "step": 597 + }, + { + "epoch": 0.13, + "grad_norm": 0.8346549918050894, + "learning_rate": 3.893402428394522e-05, + "loss": 0.2223, + "step": 598 + }, + { + "epoch": 0.13, + "grad_norm": 0.923388192914631, + "learning_rate": 3.892943639536257e-05, + "loss": 0.2345, + "step": 599 + }, + { + "epoch": 0.13, + "grad_norm": 0.88024735599371, + "learning_rate": 3.8924838926476114e-05, + "loss": 0.2564, + "step": 600 + }, + { + "epoch": 0.13, + "grad_norm": 0.7871692719872424, + "learning_rate": 3.892023187961268e-05, + "loss": 0.2286, + "step": 601 + }, + { + "epoch": 0.13, + "grad_norm": 0.9327809454662354, + "learning_rate": 3.891561525710389e-05, + "loss": 0.2585, + "step": 602 + }, + { + "epoch": 0.13, + "grad_norm": 0.7632589174906094, + "learning_rate": 3.891098906128628e-05, + "loss": 0.211, + "step": 603 + }, + { + "epoch": 0.13, + "grad_norm": 1.020860879227997, + "learning_rate": 3.890635329450118e-05, + "loss": 0.2721, + "step": 604 + }, + { + "epoch": 0.13, + "grad_norm": 0.8419214497236249, + "learning_rate": 3.890170795909477e-05, + "loss": 0.2499, + "step": 605 + }, + { + "epoch": 0.13, + "grad_norm": 0.6970832009057898, + "learning_rate": 3.88970530574181e-05, + "loss": 0.1971, + "step": 606 + }, + { + "epoch": 0.13, + "grad_norm": 0.9467412696800499, + "learning_rate": 3.889238859182703e-05, + "loss": 0.2719, + "step": 607 + }, + { + "epoch": 0.13, + "grad_norm": 0.9102254685698031, + "learning_rate": 3.888771456468229e-05, + "loss": 0.2563, + "step": 608 + }, + { + "epoch": 0.13, + "grad_norm": 0.8363262947985433, + "learning_rate": 3.8883030978349416e-05, + "loss": 0.196, + "step": 609 + }, + { + "epoch": 0.13, + "grad_norm": 0.8584818620470888, + "learning_rate": 3.887833783519882e-05, + "loss": 0.2561, + "step": 610 + }, + { + "epoch": 0.13, + "grad_norm": 0.839118786563085, + "learning_rate": 3.887363513760571e-05, + "loss": 0.275, + "step": 611 + }, + { + "epoch": 0.13, + "grad_norm": 0.8060011279995126, + "learning_rate": 3.8868922887950165e-05, + "loss": 0.2507, + "step": 612 + }, + { + "epoch": 0.13, + "grad_norm": 0.8452849730912027, + "learning_rate": 3.886420108861708e-05, + "loss": 0.2726, + "step": 613 + }, + { + "epoch": 0.13, + "grad_norm": 0.7714129837419096, + "learning_rate": 3.885946974199618e-05, + "loss": 0.2003, + "step": 614 + }, + { + "epoch": 0.14, + "grad_norm": 0.6995611577801152, + "learning_rate": 3.8854728850482034e-05, + "loss": 0.1937, + "step": 615 + }, + { + "epoch": 0.14, + "grad_norm": 0.9346788556596127, + "learning_rate": 3.884997841647404e-05, + "loss": 0.2539, + "step": 616 + }, + { + "epoch": 0.14, + "grad_norm": 0.9487142595839332, + "learning_rate": 3.8845218442376416e-05, + "loss": 0.2369, + "step": 617 + }, + { + "epoch": 0.14, + "grad_norm": 0.7760025415649283, + "learning_rate": 3.8840448930598216e-05, + "loss": 0.2482, + "step": 618 + }, + { + "epoch": 0.14, + "grad_norm": 0.7882415731253646, + "learning_rate": 3.8835669883553315e-05, + "loss": 0.199, + "step": 619 + }, + { + "epoch": 0.14, + "grad_norm": 0.8341878959807444, + "learning_rate": 3.883088130366042e-05, + "loss": 0.2105, + "step": 620 + }, + { + "epoch": 0.14, + "grad_norm": 0.8429857961152651, + "learning_rate": 3.882608319334306e-05, + "loss": 0.2527, + "step": 621 + }, + { + "epoch": 0.14, + "grad_norm": 0.8876945403277204, + "learning_rate": 3.88212755550296e-05, + "loss": 0.29, + "step": 622 + }, + { + "epoch": 0.14, + "grad_norm": 0.8439356988078114, + "learning_rate": 3.88164583911532e-05, + "loss": 0.2386, + "step": 623 + }, + { + "epoch": 0.14, + "grad_norm": 0.7962827164352877, + "learning_rate": 3.881163170415186e-05, + "loss": 0.1941, + "step": 624 + }, + { + "epoch": 0.14, + "grad_norm": 0.8120727149134773, + "learning_rate": 3.88067954964684e-05, + "loss": 0.2458, + "step": 625 + }, + { + "epoch": 0.14, + "grad_norm": 0.8753465238779636, + "learning_rate": 3.880194977055045e-05, + "loss": 0.2773, + "step": 626 + }, + { + "epoch": 0.14, + "grad_norm": 0.8029545020424, + "learning_rate": 3.8797094528850474e-05, + "loss": 0.2832, + "step": 627 + }, + { + "epoch": 0.14, + "grad_norm": 0.7780470996760221, + "learning_rate": 3.8792229773825716e-05, + "loss": 0.2225, + "step": 628 + }, + { + "epoch": 0.14, + "grad_norm": 0.7320854126262574, + "learning_rate": 3.878735550793827e-05, + "loss": 0.1794, + "step": 629 + }, + { + "epoch": 0.14, + "grad_norm": 0.7694790430319758, + "learning_rate": 3.8782471733655044e-05, + "loss": 0.1954, + "step": 630 + }, + { + "epoch": 0.14, + "grad_norm": 0.8376801766691149, + "learning_rate": 3.877757845344773e-05, + "loss": 0.2288, + "step": 631 + }, + { + "epoch": 0.14, + "grad_norm": 0.8453942983822659, + "learning_rate": 3.8772675669792855e-05, + "loss": 0.2335, + "step": 632 + }, + { + "epoch": 0.14, + "grad_norm": 0.8760557277124035, + "learning_rate": 3.876776338517174e-05, + "loss": 0.2096, + "step": 633 + }, + { + "epoch": 0.14, + "grad_norm": 1.0169787261523278, + "learning_rate": 3.876284160207053e-05, + "loss": 0.2896, + "step": 634 + }, + { + "epoch": 0.14, + "grad_norm": 0.9192946577722143, + "learning_rate": 3.875791032298017e-05, + "loss": 0.2784, + "step": 635 + }, + { + "epoch": 0.14, + "grad_norm": 0.9142417405933866, + "learning_rate": 3.875296955039641e-05, + "loss": 0.302, + "step": 636 + }, + { + "epoch": 0.14, + "grad_norm": 0.8178038976593689, + "learning_rate": 3.874801928681979e-05, + "loss": 0.1832, + "step": 637 + }, + { + "epoch": 0.14, + "grad_norm": 0.760477864838044, + "learning_rate": 3.87430595347557e-05, + "loss": 0.2009, + "step": 638 + }, + { + "epoch": 0.14, + "grad_norm": 0.8154029704759584, + "learning_rate": 3.873809029671427e-05, + "loss": 0.2425, + "step": 639 + }, + { + "epoch": 0.14, + "grad_norm": 0.8286592451665006, + "learning_rate": 3.873311157521048e-05, + "loss": 0.2414, + "step": 640 + }, + { + "epoch": 0.14, + "grad_norm": 0.9469051028965093, + "learning_rate": 3.8728123372764085e-05, + "loss": 0.2503, + "step": 641 + }, + { + "epoch": 0.14, + "grad_norm": 0.727655593449102, + "learning_rate": 3.8723125691899646e-05, + "loss": 0.2048, + "step": 642 + }, + { + "epoch": 0.14, + "grad_norm": 1.1400898762459823, + "learning_rate": 3.871811853514652e-05, + "loss": 0.1988, + "step": 643 + }, + { + "epoch": 0.14, + "grad_norm": 0.8699188520141753, + "learning_rate": 3.871310190503886e-05, + "loss": 0.2335, + "step": 644 + }, + { + "epoch": 0.14, + "grad_norm": 0.9371101840525764, + "learning_rate": 3.870807580411561e-05, + "loss": 0.2717, + "step": 645 + }, + { + "epoch": 0.14, + "grad_norm": 0.9328906871545495, + "learning_rate": 3.870304023492051e-05, + "loss": 0.2805, + "step": 646 + }, + { + "epoch": 0.14, + "grad_norm": 0.7801770662800196, + "learning_rate": 3.8697995200002105e-05, + "loss": 0.236, + "step": 647 + }, + { + "epoch": 0.14, + "grad_norm": 1.1148783153797293, + "learning_rate": 3.8692940701913706e-05, + "loss": 0.2292, + "step": 648 + }, + { + "epoch": 0.14, + "grad_norm": 0.7670811410430906, + "learning_rate": 3.868787674321343e-05, + "loss": 0.2143, + "step": 649 + }, + { + "epoch": 0.14, + "grad_norm": 0.8094460238957172, + "learning_rate": 3.868280332646417e-05, + "loss": 0.2613, + "step": 650 + }, + { + "epoch": 0.14, + "grad_norm": 0.7476612820351332, + "learning_rate": 3.867772045423362e-05, + "loss": 0.2234, + "step": 651 + }, + { + "epoch": 0.14, + "grad_norm": 0.784818700094082, + "learning_rate": 3.8672628129094255e-05, + "loss": 0.2476, + "step": 652 + }, + { + "epoch": 0.14, + "grad_norm": 0.793072773541811, + "learning_rate": 3.8667526353623326e-05, + "loss": 0.194, + "step": 653 + }, + { + "epoch": 0.14, + "grad_norm": 0.892512980615161, + "learning_rate": 3.866241513040288e-05, + "loss": 0.2593, + "step": 654 + }, + { + "epoch": 0.14, + "grad_norm": 0.9285416794821778, + "learning_rate": 3.8657294462019735e-05, + "loss": 0.2425, + "step": 655 + }, + { + "epoch": 0.14, + "grad_norm": 0.8956017231761512, + "learning_rate": 3.865216435106549e-05, + "loss": 0.2275, + "step": 656 + }, + { + "epoch": 0.14, + "grad_norm": 0.7304122413360475, + "learning_rate": 3.8647024800136524e-05, + "loss": 0.1792, + "step": 657 + }, + { + "epoch": 0.14, + "grad_norm": 0.8574688318554834, + "learning_rate": 3.8641875811834004e-05, + "loss": 0.2151, + "step": 658 + }, + { + "epoch": 0.14, + "grad_norm": 0.7538143754231571, + "learning_rate": 3.863671738876385e-05, + "loss": 0.2138, + "step": 659 + }, + { + "epoch": 0.14, + "grad_norm": 0.8502167733846998, + "learning_rate": 3.863154953353679e-05, + "loss": 0.2274, + "step": 660 + }, + { + "epoch": 0.15, + "grad_norm": 0.8107420169367895, + "learning_rate": 3.8626372248768295e-05, + "loss": 0.2394, + "step": 661 + }, + { + "epoch": 0.15, + "grad_norm": 0.9014610828498489, + "learning_rate": 3.862118553707863e-05, + "loss": 0.3027, + "step": 662 + }, + { + "epoch": 0.15, + "grad_norm": 0.9112825423972205, + "learning_rate": 3.86159894010928e-05, + "loss": 0.2908, + "step": 663 + }, + { + "epoch": 0.15, + "grad_norm": 0.6628921068182834, + "learning_rate": 3.8610783843440626e-05, + "loss": 0.1389, + "step": 664 + }, + { + "epoch": 0.15, + "grad_norm": 0.7176790136393478, + "learning_rate": 3.8605568866756666e-05, + "loss": 0.1723, + "step": 665 + }, + { + "epoch": 0.15, + "grad_norm": 0.7682664032440877, + "learning_rate": 3.860034447368024e-05, + "loss": 0.2148, + "step": 666 + }, + { + "epoch": 0.15, + "grad_norm": 0.9302215965381995, + "learning_rate": 3.8595110666855466e-05, + "loss": 0.296, + "step": 667 + }, + { + "epoch": 0.15, + "grad_norm": 0.8521139476720233, + "learning_rate": 3.858986744893119e-05, + "loss": 0.23, + "step": 668 + }, + { + "epoch": 0.15, + "grad_norm": 0.7464405472134348, + "learning_rate": 3.858461482256103e-05, + "loss": 0.2147, + "step": 669 + }, + { + "epoch": 0.15, + "grad_norm": 0.8596453154852116, + "learning_rate": 3.8579352790403395e-05, + "loss": 0.2161, + "step": 670 + }, + { + "epoch": 0.15, + "grad_norm": 0.809061992905901, + "learning_rate": 3.857408135512142e-05, + "loss": 0.2014, + "step": 671 + }, + { + "epoch": 0.15, + "grad_norm": 0.8437509889950078, + "learning_rate": 3.8568800519383e-05, + "loss": 0.2255, + "step": 672 + }, + { + "epoch": 0.15, + "grad_norm": 0.7568962614568128, + "learning_rate": 3.856351028586082e-05, + "loss": 0.1787, + "step": 673 + }, + { + "epoch": 0.15, + "grad_norm": 0.8055013226380432, + "learning_rate": 3.855821065723228e-05, + "loss": 0.2017, + "step": 674 + }, + { + "epoch": 0.15, + "grad_norm": 0.8826729993400942, + "learning_rate": 3.855290163617956e-05, + "loss": 0.2608, + "step": 675 + }, + { + "epoch": 0.15, + "grad_norm": 0.7806975127749172, + "learning_rate": 3.8547583225389596e-05, + "loss": 0.2001, + "step": 676 + }, + { + "epoch": 0.15, + "grad_norm": 0.7618422970469434, + "learning_rate": 3.8542255427554065e-05, + "loss": 0.2289, + "step": 677 + }, + { + "epoch": 0.15, + "grad_norm": 0.7862885694034616, + "learning_rate": 3.85369182453694e-05, + "loss": 0.2798, + "step": 678 + }, + { + "epoch": 0.15, + "grad_norm": 0.6671069424883396, + "learning_rate": 3.853157168153677e-05, + "loss": 0.1671, + "step": 679 + }, + { + "epoch": 0.15, + "grad_norm": 0.739590608981885, + "learning_rate": 3.852621573876212e-05, + "loss": 0.2328, + "step": 680 + }, + { + "epoch": 0.15, + "grad_norm": 0.7928258081145453, + "learning_rate": 3.8520850419756104e-05, + "loss": 0.2329, + "step": 681 + }, + { + "epoch": 0.15, + "grad_norm": 0.8702197972343868, + "learning_rate": 3.851547572723416e-05, + "loss": 0.2651, + "step": 682 + }, + { + "epoch": 0.15, + "grad_norm": 0.8421114800562508, + "learning_rate": 3.851009166391646e-05, + "loss": 0.2441, + "step": 683 + }, + { + "epoch": 0.15, + "grad_norm": 0.8450856410310613, + "learning_rate": 3.850469823252789e-05, + "loss": 0.2525, + "step": 684 + }, + { + "epoch": 0.15, + "grad_norm": 0.7375467269448306, + "learning_rate": 3.849929543579812e-05, + "loss": 0.187, + "step": 685 + }, + { + "epoch": 0.15, + "grad_norm": 0.807894049241486, + "learning_rate": 3.849388327646152e-05, + "loss": 0.2275, + "step": 686 + }, + { + "epoch": 0.15, + "grad_norm": 0.8135449219336123, + "learning_rate": 3.848846175725722e-05, + "loss": 0.2376, + "step": 687 + }, + { + "epoch": 0.15, + "grad_norm": 0.8856364968360584, + "learning_rate": 3.84830308809291e-05, + "loss": 0.2332, + "step": 688 + }, + { + "epoch": 0.15, + "grad_norm": 0.6883587675575743, + "learning_rate": 3.8477590650225735e-05, + "loss": 0.164, + "step": 689 + }, + { + "epoch": 0.15, + "grad_norm": 0.7784342568726977, + "learning_rate": 3.8472141067900485e-05, + "loss": 0.1987, + "step": 690 + }, + { + "epoch": 0.15, + "grad_norm": 0.7494893719948371, + "learning_rate": 3.84666821367114e-05, + "loss": 0.2167, + "step": 691 + }, + { + "epoch": 0.15, + "grad_norm": 0.739221327572983, + "learning_rate": 3.846121385942128e-05, + "loss": 0.1996, + "step": 692 + }, + { + "epoch": 0.15, + "grad_norm": 0.8289543173872378, + "learning_rate": 3.845573623879766e-05, + "loss": 0.1907, + "step": 693 + }, + { + "epoch": 0.15, + "grad_norm": 0.8561945235943589, + "learning_rate": 3.845024927761279e-05, + "loss": 0.2405, + "step": 694 + }, + { + "epoch": 0.15, + "grad_norm": 0.7586867804796876, + "learning_rate": 3.844475297864366e-05, + "loss": 0.1756, + "step": 695 + }, + { + "epoch": 0.15, + "grad_norm": 0.820059891816306, + "learning_rate": 3.843924734467199e-05, + "loss": 0.2047, + "step": 696 + }, + { + "epoch": 0.15, + "grad_norm": 0.8113901187168117, + "learning_rate": 3.843373237848419e-05, + "loss": 0.1931, + "step": 697 + }, + { + "epoch": 0.15, + "grad_norm": 0.8109688269530724, + "learning_rate": 3.842820808287144e-05, + "loss": 0.2202, + "step": 698 + }, + { + "epoch": 0.15, + "grad_norm": 0.8227791892675015, + "learning_rate": 3.842267446062962e-05, + "loss": 0.2283, + "step": 699 + }, + { + "epoch": 0.15, + "grad_norm": 0.7573331516312104, + "learning_rate": 3.841713151455931e-05, + "loss": 0.1784, + "step": 700 + }, + { + "epoch": 0.15, + "grad_norm": 0.7528941421925391, + "learning_rate": 3.8411579247465845e-05, + "loss": 0.2109, + "step": 701 + }, + { + "epoch": 0.15, + "grad_norm": 0.7843007603501683, + "learning_rate": 3.840601766215926e-05, + "loss": 0.21, + "step": 702 + }, + { + "epoch": 0.15, + "grad_norm": 0.8236345145931844, + "learning_rate": 3.840044676145431e-05, + "loss": 0.2033, + "step": 703 + }, + { + "epoch": 0.15, + "grad_norm": 0.752640763991213, + "learning_rate": 3.839486654817045e-05, + "loss": 0.2086, + "step": 704 + }, + { + "epoch": 0.15, + "grad_norm": 0.8262639276251157, + "learning_rate": 3.838927702513187e-05, + "loss": 0.2025, + "step": 705 + }, + { + "epoch": 0.16, + "grad_norm": 0.7761474298353154, + "learning_rate": 3.838367819516746e-05, + "loss": 0.1692, + "step": 706 + }, + { + "epoch": 0.16, + "grad_norm": 0.8734125314600135, + "learning_rate": 3.837807006111082e-05, + "loss": 0.2258, + "step": 707 + }, + { + "epoch": 0.16, + "grad_norm": 0.7484943854071063, + "learning_rate": 3.837245262580027e-05, + "loss": 0.1844, + "step": 708 + }, + { + "epoch": 0.16, + "grad_norm": 0.8271628494538433, + "learning_rate": 3.836682589207882e-05, + "loss": 0.1746, + "step": 709 + }, + { + "epoch": 0.16, + "grad_norm": 0.8443352117983162, + "learning_rate": 3.836118986279419e-05, + "loss": 0.235, + "step": 710 + }, + { + "epoch": 0.16, + "grad_norm": 0.8112915295285071, + "learning_rate": 3.835554454079882e-05, + "loss": 0.2069, + "step": 711 + }, + { + "epoch": 0.16, + "grad_norm": 0.7067376891994592, + "learning_rate": 3.834988992894983e-05, + "loss": 0.1963, + "step": 712 + }, + { + "epoch": 0.16, + "grad_norm": 0.7762592744503582, + "learning_rate": 3.834422603010906e-05, + "loss": 0.2214, + "step": 713 + }, + { + "epoch": 0.16, + "grad_norm": 0.7241916326324417, + "learning_rate": 3.833855284714305e-05, + "loss": 0.2028, + "step": 714 + }, + { + "epoch": 0.16, + "grad_norm": 0.8083965411167431, + "learning_rate": 3.833287038292303e-05, + "loss": 0.2359, + "step": 715 + }, + { + "epoch": 0.16, + "grad_norm": 0.7589641747017328, + "learning_rate": 3.832717864032492e-05, + "loss": 0.1956, + "step": 716 + }, + { + "epoch": 0.16, + "grad_norm": 0.7667360613003436, + "learning_rate": 3.832147762222936e-05, + "loss": 0.1972, + "step": 717 + }, + { + "epoch": 0.16, + "grad_norm": 0.7712735817641144, + "learning_rate": 3.8315767331521655e-05, + "loss": 0.1818, + "step": 718 + }, + { + "epoch": 0.16, + "grad_norm": 0.9010674834916976, + "learning_rate": 3.831004777109183e-05, + "loss": 0.2536, + "step": 719 + }, + { + "epoch": 0.16, + "grad_norm": 0.8485400958054092, + "learning_rate": 3.8304318943834584e-05, + "loss": 0.2258, + "step": 720 + }, + { + "epoch": 0.16, + "grad_norm": 0.7302916613026862, + "learning_rate": 3.8298580852649316e-05, + "loss": 0.2157, + "step": 721 + }, + { + "epoch": 0.16, + "grad_norm": 0.7151270654118754, + "learning_rate": 3.82928335004401e-05, + "loss": 0.1962, + "step": 722 + }, + { + "epoch": 0.16, + "grad_norm": 0.7447894614304839, + "learning_rate": 3.828707689011572e-05, + "loss": 0.2116, + "step": 723 + }, + { + "epoch": 0.16, + "grad_norm": 0.7271553000291014, + "learning_rate": 3.828131102458962e-05, + "loss": 0.1966, + "step": 724 + }, + { + "epoch": 0.16, + "grad_norm": 0.7429093314076324, + "learning_rate": 3.827553590677996e-05, + "loss": 0.1906, + "step": 725 + }, + { + "epoch": 0.16, + "grad_norm": 0.716800949542447, + "learning_rate": 3.8269751539609525e-05, + "loss": 0.1946, + "step": 726 + }, + { + "epoch": 0.16, + "grad_norm": 0.7666347229218324, + "learning_rate": 3.8263957926005855e-05, + "loss": 0.2524, + "step": 727 + }, + { + "epoch": 0.16, + "grad_norm": 0.8161614155859114, + "learning_rate": 3.825815506890111e-05, + "loss": 0.2168, + "step": 728 + }, + { + "epoch": 0.16, + "grad_norm": 0.6637236010874694, + "learning_rate": 3.825234297123216e-05, + "loss": 0.1729, + "step": 729 + }, + { + "epoch": 0.16, + "grad_norm": 0.8388439526076809, + "learning_rate": 3.824652163594056e-05, + "loss": 0.2296, + "step": 730 + }, + { + "epoch": 0.16, + "grad_norm": 0.8218573419682884, + "learning_rate": 3.8240691065972486e-05, + "loss": 0.2196, + "step": 731 + }, + { + "epoch": 0.16, + "grad_norm": 0.8293467452318114, + "learning_rate": 3.823485126427886e-05, + "loss": 0.2345, + "step": 732 + }, + { + "epoch": 0.16, + "grad_norm": 0.6922598919352861, + "learning_rate": 3.822900223381522e-05, + "loss": 0.1656, + "step": 733 + }, + { + "epoch": 0.16, + "grad_norm": 0.8068375981982576, + "learning_rate": 3.8223143977541806e-05, + "loss": 0.2273, + "step": 734 + }, + { + "epoch": 0.16, + "grad_norm": 0.7797871053978261, + "learning_rate": 3.821727649842352e-05, + "loss": 0.2594, + "step": 735 + }, + { + "epoch": 0.16, + "grad_norm": 0.7766407226832587, + "learning_rate": 3.821139979942992e-05, + "loss": 0.1986, + "step": 736 + }, + { + "epoch": 0.16, + "grad_norm": 0.8017956519170995, + "learning_rate": 3.820551388353525e-05, + "loss": 0.2132, + "step": 737 + }, + { + "epoch": 0.16, + "grad_norm": 0.7925431937110463, + "learning_rate": 3.819961875371839e-05, + "loss": 0.2119, + "step": 738 + }, + { + "epoch": 0.16, + "grad_norm": 0.7698235900481465, + "learning_rate": 3.819371441296292e-05, + "loss": 0.2025, + "step": 739 + }, + { + "epoch": 0.16, + "grad_norm": 0.8134358225280137, + "learning_rate": 3.8187800864257065e-05, + "loss": 0.2063, + "step": 740 + }, + { + "epoch": 0.16, + "grad_norm": 0.7621643328146783, + "learning_rate": 3.818187811059369e-05, + "loss": 0.1876, + "step": 741 + }, + { + "epoch": 0.16, + "grad_norm": 0.9041169813600348, + "learning_rate": 3.817594615497035e-05, + "loss": 0.2496, + "step": 742 + }, + { + "epoch": 0.16, + "grad_norm": 0.7800227635069699, + "learning_rate": 3.817000500038924e-05, + "loss": 0.2249, + "step": 743 + }, + { + "epoch": 0.16, + "grad_norm": 0.7763990863508718, + "learning_rate": 3.8164054649857206e-05, + "loss": 0.204, + "step": 744 + }, + { + "epoch": 0.16, + "grad_norm": 0.6792244923631663, + "learning_rate": 3.815809510638578e-05, + "loss": 0.1702, + "step": 745 + }, + { + "epoch": 0.16, + "grad_norm": 0.8416913667837724, + "learning_rate": 3.81521263729911e-05, + "loss": 0.2529, + "step": 746 + }, + { + "epoch": 0.16, + "grad_norm": 0.6874297279510242, + "learning_rate": 3.8146148452694e-05, + "loss": 0.182, + "step": 747 + }, + { + "epoch": 0.16, + "grad_norm": 0.7552469616361313, + "learning_rate": 3.8140161348519924e-05, + "loss": 0.238, + "step": 748 + }, + { + "epoch": 0.16, + "grad_norm": 0.7362886210805616, + "learning_rate": 3.813416506349899e-05, + "loss": 0.1778, + "step": 749 + }, + { + "epoch": 0.16, + "grad_norm": 0.6942190723817684, + "learning_rate": 3.8128159600665954e-05, + "loss": 0.184, + "step": 750 + }, + { + "epoch": 0.16, + "grad_norm": 0.7012048145090823, + "learning_rate": 3.812214496306022e-05, + "loss": 0.1744, + "step": 751 + }, + { + "epoch": 0.17, + "grad_norm": 0.7449986674789698, + "learning_rate": 3.8116121153725824e-05, + "loss": 0.1862, + "step": 752 + }, + { + "epoch": 0.17, + "grad_norm": 0.7682776915316732, + "learning_rate": 3.8110088175711456e-05, + "loss": 0.2081, + "step": 753 + }, + { + "epoch": 0.17, + "grad_norm": 0.7913974923405264, + "learning_rate": 3.810404603207045e-05, + "loss": 0.2172, + "step": 754 + }, + { + "epoch": 0.17, + "grad_norm": 0.9037616043612097, + "learning_rate": 3.809799472586077e-05, + "loss": 0.2525, + "step": 755 + }, + { + "epoch": 0.17, + "grad_norm": 0.9625775987134346, + "learning_rate": 3.809193426014501e-05, + "loss": 0.2522, + "step": 756 + }, + { + "epoch": 0.17, + "grad_norm": 0.7113503315961358, + "learning_rate": 3.808586463799042e-05, + "loss": 0.1654, + "step": 757 + }, + { + "epoch": 0.17, + "grad_norm": 0.748331040872349, + "learning_rate": 3.807978586246887e-05, + "loss": 0.1693, + "step": 758 + }, + { + "epoch": 0.17, + "grad_norm": 0.6747668375753214, + "learning_rate": 3.8073697936656866e-05, + "loss": 0.1564, + "step": 759 + }, + { + "epoch": 0.17, + "grad_norm": 0.7857764152943993, + "learning_rate": 3.806760086363554e-05, + "loss": 0.2224, + "step": 760 + }, + { + "epoch": 0.17, + "grad_norm": 0.8058116800725232, + "learning_rate": 3.806149464649066e-05, + "loss": 0.2057, + "step": 761 + }, + { + "epoch": 0.17, + "grad_norm": 0.7572603869023338, + "learning_rate": 3.8055379288312625e-05, + "loss": 0.1831, + "step": 762 + }, + { + "epoch": 0.17, + "grad_norm": 0.8191501193026167, + "learning_rate": 3.8049254792196443e-05, + "loss": 0.1991, + "step": 763 + }, + { + "epoch": 0.17, + "grad_norm": 0.7519661245832501, + "learning_rate": 3.804312116124177e-05, + "loss": 0.2015, + "step": 764 + }, + { + "epoch": 0.17, + "grad_norm": 0.8708368444295916, + "learning_rate": 3.8036978398552876e-05, + "loss": 0.2625, + "step": 765 + }, + { + "epoch": 0.17, + "grad_norm": 0.6962016132327279, + "learning_rate": 3.803082650723864e-05, + "loss": 0.1752, + "step": 766 + }, + { + "epoch": 0.17, + "grad_norm": 0.7236730484580767, + "learning_rate": 3.802466549041258e-05, + "loss": 0.2141, + "step": 767 + }, + { + "epoch": 0.17, + "grad_norm": 0.7697581619865724, + "learning_rate": 3.8018495351192825e-05, + "loss": 0.2054, + "step": 768 + }, + { + "epoch": 0.17, + "grad_norm": 0.769659506086983, + "learning_rate": 3.801231609270212e-05, + "loss": 0.203, + "step": 769 + }, + { + "epoch": 0.17, + "grad_norm": 0.6636546733887948, + "learning_rate": 3.800612771806781e-05, + "loss": 0.1446, + "step": 770 + }, + { + "epoch": 0.17, + "grad_norm": 0.6869472535775166, + "learning_rate": 3.79999302304219e-05, + "loss": 0.1652, + "step": 771 + }, + { + "epoch": 0.17, + "grad_norm": 0.7367351024507836, + "learning_rate": 3.799372363290095e-05, + "loss": 0.1737, + "step": 772 + }, + { + "epoch": 0.17, + "grad_norm": 0.7346502964947343, + "learning_rate": 3.798750792864617e-05, + "loss": 0.1551, + "step": 773 + }, + { + "epoch": 0.17, + "grad_norm": 0.8866153270215831, + "learning_rate": 3.798128312080336e-05, + "loss": 0.2897, + "step": 774 + }, + { + "epoch": 0.17, + "grad_norm": 0.7273975618740585, + "learning_rate": 3.7975049212522934e-05, + "loss": 0.1959, + "step": 775 + }, + { + "epoch": 0.17, + "grad_norm": 0.7774320772682493, + "learning_rate": 3.7968806206959915e-05, + "loss": 0.1848, + "step": 776 + }, + { + "epoch": 0.17, + "grad_norm": 0.8070108520057053, + "learning_rate": 3.7962554107273926e-05, + "loss": 0.2094, + "step": 777 + }, + { + "epoch": 0.17, + "grad_norm": 0.7002195516351798, + "learning_rate": 3.795629291662919e-05, + "loss": 0.2065, + "step": 778 + }, + { + "epoch": 0.17, + "grad_norm": 0.7795505349867997, + "learning_rate": 3.795002263819453e-05, + "loss": 0.1989, + "step": 779 + }, + { + "epoch": 0.17, + "grad_norm": 0.7985563678683656, + "learning_rate": 3.7943743275143384e-05, + "loss": 0.2017, + "step": 780 + }, + { + "epoch": 0.17, + "grad_norm": 0.8227484003895893, + "learning_rate": 3.793745483065377e-05, + "loss": 0.2073, + "step": 781 + }, + { + "epoch": 0.17, + "grad_norm": 0.7756298858710011, + "learning_rate": 3.7931157307908304e-05, + "loss": 0.1646, + "step": 782 + }, + { + "epoch": 0.17, + "grad_norm": 0.7802675554915577, + "learning_rate": 3.792485071009421e-05, + "loss": 0.2172, + "step": 783 + }, + { + "epoch": 0.17, + "grad_norm": 0.7938318675917536, + "learning_rate": 3.7918535040403284e-05, + "loss": 0.2116, + "step": 784 + }, + { + "epoch": 0.17, + "grad_norm": 0.788083214737397, + "learning_rate": 3.791221030203193e-05, + "loss": 0.1677, + "step": 785 + }, + { + "epoch": 0.17, + "grad_norm": 0.8406960709035459, + "learning_rate": 3.790587649818115e-05, + "loss": 0.2083, + "step": 786 + }, + { + "epoch": 0.17, + "grad_norm": 0.8092112371843942, + "learning_rate": 3.78995336320565e-05, + "loss": 0.2049, + "step": 787 + }, + { + "epoch": 0.17, + "grad_norm": 0.7295052196559106, + "learning_rate": 3.789318170686816e-05, + "loss": 0.2061, + "step": 788 + }, + { + "epoch": 0.17, + "grad_norm": 0.8338696939204928, + "learning_rate": 3.788682072583087e-05, + "loss": 0.206, + "step": 789 + }, + { + "epoch": 0.17, + "grad_norm": 0.8162035530743967, + "learning_rate": 3.788045069216396e-05, + "loss": 0.2347, + "step": 790 + }, + { + "epoch": 0.17, + "grad_norm": 0.7600016092609635, + "learning_rate": 3.787407160909134e-05, + "loss": 0.2107, + "step": 791 + }, + { + "epoch": 0.17, + "grad_norm": 0.7609946305841209, + "learning_rate": 3.786768347984152e-05, + "loss": 0.1974, + "step": 792 + }, + { + "epoch": 0.17, + "grad_norm": 0.735952873636344, + "learning_rate": 3.7861286307647555e-05, + "loss": 0.1909, + "step": 793 + }, + { + "epoch": 0.17, + "grad_norm": 0.6722596642043193, + "learning_rate": 3.78548800957471e-05, + "loss": 0.1568, + "step": 794 + }, + { + "epoch": 0.17, + "grad_norm": 0.6333915980625863, + "learning_rate": 3.7848464847382376e-05, + "loss": 0.1725, + "step": 795 + }, + { + "epoch": 0.17, + "grad_norm": 0.796500772603936, + "learning_rate": 3.7842040565800184e-05, + "loss": 0.1909, + "step": 796 + }, + { + "epoch": 0.18, + "grad_norm": 0.7852543631975609, + "learning_rate": 3.783560725425188e-05, + "loss": 0.1969, + "step": 797 + }, + { + "epoch": 0.18, + "grad_norm": 0.7451596629183987, + "learning_rate": 3.782916491599341e-05, + "loss": 0.1965, + "step": 798 + }, + { + "epoch": 0.18, + "grad_norm": 0.6560391132655947, + "learning_rate": 3.782271355428529e-05, + "loss": 0.1756, + "step": 799 + }, + { + "epoch": 0.18, + "grad_norm": 0.859883106182944, + "learning_rate": 3.781625317239258e-05, + "loss": 0.2243, + "step": 800 + }, + { + "epoch": 0.18, + "grad_norm": 0.7195191207343877, + "learning_rate": 3.780978377358493e-05, + "loss": 0.153, + "step": 801 + }, + { + "epoch": 0.18, + "grad_norm": 0.6798048904594878, + "learning_rate": 3.7803305361136534e-05, + "loss": 0.1773, + "step": 802 + }, + { + "epoch": 0.18, + "grad_norm": 0.7079466905190117, + "learning_rate": 3.7796817938326155e-05, + "loss": 0.1877, + "step": 803 + }, + { + "epoch": 0.18, + "grad_norm": 0.7535864591552314, + "learning_rate": 3.7790321508437124e-05, + "loss": 0.1981, + "step": 804 + }, + { + "epoch": 0.18, + "grad_norm": 0.7119576397655025, + "learning_rate": 3.778381607475732e-05, + "loss": 0.1625, + "step": 805 + }, + { + "epoch": 0.18, + "grad_norm": 0.8590587033895257, + "learning_rate": 3.777730164057919e-05, + "loss": 0.2357, + "step": 806 + }, + { + "epoch": 0.18, + "grad_norm": 0.6929512219539972, + "learning_rate": 3.777077820919972e-05, + "loss": 0.1484, + "step": 807 + }, + { + "epoch": 0.18, + "grad_norm": 0.7499874829201456, + "learning_rate": 3.776424578392045e-05, + "loss": 0.2245, + "step": 808 + }, + { + "epoch": 0.18, + "grad_norm": 0.7393534307485781, + "learning_rate": 3.775770436804751e-05, + "loss": 0.1982, + "step": 809 + }, + { + "epoch": 0.18, + "grad_norm": 0.7135289926633145, + "learning_rate": 3.775115396489153e-05, + "loss": 0.1488, + "step": 810 + }, + { + "epoch": 0.18, + "grad_norm": 0.7734600025332912, + "learning_rate": 3.77445945777677e-05, + "loss": 0.1963, + "step": 811 + }, + { + "epoch": 0.18, + "grad_norm": 0.710812400458968, + "learning_rate": 3.773802620999579e-05, + "loss": 0.1591, + "step": 812 + }, + { + "epoch": 0.18, + "grad_norm": 0.7657793629017298, + "learning_rate": 3.773144886490007e-05, + "loss": 0.1688, + "step": 813 + }, + { + "epoch": 0.18, + "grad_norm": 0.811511355240138, + "learning_rate": 3.7724862545809394e-05, + "loss": 0.1988, + "step": 814 + }, + { + "epoch": 0.18, + "grad_norm": 0.7990316490808798, + "learning_rate": 3.771826725605713e-05, + "loss": 0.192, + "step": 815 + }, + { + "epoch": 0.18, + "grad_norm": 0.6723617298825829, + "learning_rate": 3.771166299898118e-05, + "loss": 0.169, + "step": 816 + }, + { + "epoch": 0.18, + "grad_norm": 0.7521643403897306, + "learning_rate": 3.770504977792402e-05, + "loss": 0.1875, + "step": 817 + }, + { + "epoch": 0.18, + "grad_norm": 0.7114388930055676, + "learning_rate": 3.7698427596232636e-05, + "loss": 0.1927, + "step": 818 + }, + { + "epoch": 0.18, + "grad_norm": 0.7085657907035885, + "learning_rate": 3.7691796457258546e-05, + "loss": 0.1811, + "step": 819 + }, + { + "epoch": 0.18, + "grad_norm": 0.7836401046171007, + "learning_rate": 3.7685156364357825e-05, + "loss": 0.1845, + "step": 820 + }, + { + "epoch": 0.18, + "grad_norm": 0.7266652024397258, + "learning_rate": 3.767850732089105e-05, + "loss": 0.2094, + "step": 821 + }, + { + "epoch": 0.18, + "grad_norm": 0.8105800759742975, + "learning_rate": 3.7671849330223345e-05, + "loss": 0.199, + "step": 822 + }, + { + "epoch": 0.18, + "grad_norm": 0.7686728710156864, + "learning_rate": 3.766518239572437e-05, + "loss": 0.2106, + "step": 823 + }, + { + "epoch": 0.18, + "grad_norm": 0.7493906724832401, + "learning_rate": 3.76585065207683e-05, + "loss": 0.2183, + "step": 824 + }, + { + "epoch": 0.18, + "grad_norm": 0.6940159657288101, + "learning_rate": 3.765182170873383e-05, + "loss": 0.1491, + "step": 825 + }, + { + "epoch": 0.18, + "grad_norm": 0.6327228718145429, + "learning_rate": 3.7645127963004176e-05, + "loss": 0.129, + "step": 826 + }, + { + "epoch": 0.18, + "grad_norm": 0.7669510937458179, + "learning_rate": 3.76384252869671e-05, + "loss": 0.191, + "step": 827 + }, + { + "epoch": 0.18, + "grad_norm": 0.757024453588316, + "learning_rate": 3.7631713684014866e-05, + "loss": 0.2127, + "step": 828 + }, + { + "epoch": 0.18, + "grad_norm": 0.7048436247771179, + "learning_rate": 3.7624993157544246e-05, + "loss": 0.1713, + "step": 829 + }, + { + "epoch": 0.18, + "grad_norm": 0.8488451798433339, + "learning_rate": 3.761826371095655e-05, + "loss": 0.2248, + "step": 830 + }, + { + "epoch": 0.18, + "grad_norm": 0.6127937468869562, + "learning_rate": 3.7611525347657584e-05, + "loss": 0.1252, + "step": 831 + }, + { + "epoch": 0.18, + "grad_norm": 0.6806426735887422, + "learning_rate": 3.7604778071057685e-05, + "loss": 0.1563, + "step": 832 + }, + { + "epoch": 0.18, + "grad_norm": 0.6743973690986225, + "learning_rate": 3.759802188457168e-05, + "loss": 0.1774, + "step": 833 + }, + { + "epoch": 0.18, + "grad_norm": 0.7603683630524931, + "learning_rate": 3.759125679161893e-05, + "loss": 0.236, + "step": 834 + }, + { + "epoch": 0.18, + "grad_norm": 0.6793108367801289, + "learning_rate": 3.758448279562327e-05, + "loss": 0.1412, + "step": 835 + }, + { + "epoch": 0.18, + "grad_norm": 0.6519421923379566, + "learning_rate": 3.757769990001308e-05, + "loss": 0.1167, + "step": 836 + }, + { + "epoch": 0.18, + "grad_norm": 0.7396227239586961, + "learning_rate": 3.757090810822122e-05, + "loss": 0.1992, + "step": 837 + }, + { + "epoch": 0.18, + "grad_norm": 0.7742992955873289, + "learning_rate": 3.756410742368505e-05, + "loss": 0.204, + "step": 838 + }, + { + "epoch": 0.18, + "grad_norm": 0.7887451364575835, + "learning_rate": 3.7557297849846444e-05, + "loss": 0.1708, + "step": 839 + }, + { + "epoch": 0.18, + "grad_norm": 0.7601953776719075, + "learning_rate": 3.7550479390151766e-05, + "loss": 0.169, + "step": 840 + }, + { + "epoch": 0.18, + "grad_norm": 0.6972835762854639, + "learning_rate": 3.754365204805189e-05, + "loss": 0.1534, + "step": 841 + }, + { + "epoch": 0.18, + "grad_norm": 0.7799931143187769, + "learning_rate": 3.753681582700216e-05, + "loss": 0.1763, + "step": 842 + }, + { + "epoch": 0.19, + "grad_norm": 0.7701780437869349, + "learning_rate": 3.752997073046244e-05, + "loss": 0.1606, + "step": 843 + }, + { + "epoch": 0.19, + "grad_norm": 0.8026366534002741, + "learning_rate": 3.752311676189708e-05, + "loss": 0.1872, + "step": 844 + }, + { + "epoch": 0.19, + "grad_norm": 0.8119168389561734, + "learning_rate": 3.75162539247749e-05, + "loss": 0.2032, + "step": 845 + }, + { + "epoch": 0.19, + "grad_norm": 0.6718226124056262, + "learning_rate": 3.750938222256924e-05, + "loss": 0.1569, + "step": 846 + }, + { + "epoch": 0.19, + "grad_norm": 0.7637510136724011, + "learning_rate": 3.75025016587579e-05, + "loss": 0.2004, + "step": 847 + }, + { + "epoch": 0.19, + "grad_norm": 0.6415349498989384, + "learning_rate": 3.7495612236823175e-05, + "loss": 0.1509, + "step": 848 + }, + { + "epoch": 0.19, + "grad_norm": 0.8286716528290092, + "learning_rate": 3.7488713960251845e-05, + "loss": 0.1616, + "step": 849 + }, + { + "epoch": 0.19, + "grad_norm": 0.6569549543464369, + "learning_rate": 3.748180683253518e-05, + "loss": 0.1562, + "step": 850 + }, + { + "epoch": 0.19, + "grad_norm": 0.7173550378920462, + "learning_rate": 3.747489085716891e-05, + "loss": 0.1558, + "step": 851 + }, + { + "epoch": 0.19, + "grad_norm": 0.7504036135246938, + "learning_rate": 3.746796603765325e-05, + "loss": 0.1769, + "step": 852 + }, + { + "epoch": 0.19, + "grad_norm": 0.6710972165250231, + "learning_rate": 3.7461032377492905e-05, + "loss": 0.1642, + "step": 853 + }, + { + "epoch": 0.19, + "grad_norm": 0.7352966450199, + "learning_rate": 3.745408988019703e-05, + "loss": 0.1819, + "step": 854 + }, + { + "epoch": 0.19, + "grad_norm": 0.7553549569396881, + "learning_rate": 3.744713854927928e-05, + "loss": 0.1826, + "step": 855 + }, + { + "epoch": 0.19, + "grad_norm": 0.8046090541490418, + "learning_rate": 3.7440178388257746e-05, + "loss": 0.2045, + "step": 856 + }, + { + "epoch": 0.19, + "grad_norm": 0.7820578784963078, + "learning_rate": 3.743320940065503e-05, + "loss": 0.1769, + "step": 857 + }, + { + "epoch": 0.19, + "grad_norm": 0.6971775715256504, + "learning_rate": 3.7426231589998166e-05, + "loss": 0.1782, + "step": 858 + }, + { + "epoch": 0.19, + "grad_norm": 0.782002353801258, + "learning_rate": 3.741924495981867e-05, + "loss": 0.2065, + "step": 859 + }, + { + "epoch": 0.19, + "grad_norm": 0.6984916614807525, + "learning_rate": 3.741224951365251e-05, + "loss": 0.1677, + "step": 860 + }, + { + "epoch": 0.19, + "grad_norm": 0.6849913343173801, + "learning_rate": 3.740524525504014e-05, + "loss": 0.156, + "step": 861 + }, + { + "epoch": 0.19, + "grad_norm": 0.6846778338963099, + "learning_rate": 3.739823218752645e-05, + "loss": 0.1699, + "step": 862 + }, + { + "epoch": 0.19, + "grad_norm": 0.7003902445939452, + "learning_rate": 3.7391210314660796e-05, + "loss": 0.1621, + "step": 863 + }, + { + "epoch": 0.19, + "grad_norm": 0.6867990822182168, + "learning_rate": 3.7384179639996997e-05, + "loss": 0.1499, + "step": 864 + }, + { + "epoch": 0.19, + "grad_norm": 0.644974619639601, + "learning_rate": 3.7377140167093316e-05, + "loss": 0.1218, + "step": 865 + }, + { + "epoch": 0.19, + "grad_norm": 0.783606332695021, + "learning_rate": 3.7370091899512464e-05, + "loss": 0.1907, + "step": 866 + }, + { + "epoch": 0.19, + "grad_norm": 0.6615394006948083, + "learning_rate": 3.736303484082163e-05, + "loss": 0.1477, + "step": 867 + }, + { + "epoch": 0.19, + "grad_norm": 0.6937158662114102, + "learning_rate": 3.7355968994592414e-05, + "loss": 0.1625, + "step": 868 + }, + { + "epoch": 0.19, + "grad_norm": 0.7429604370422286, + "learning_rate": 3.7348894364400914e-05, + "loss": 0.1805, + "step": 869 + }, + { + "epoch": 0.19, + "grad_norm": 0.7475581950878672, + "learning_rate": 3.734181095382761e-05, + "loss": 0.204, + "step": 870 + }, + { + "epoch": 0.19, + "grad_norm": 0.6430529894982431, + "learning_rate": 3.733471876645749e-05, + "loss": 0.1358, + "step": 871 + }, + { + "epoch": 0.19, + "grad_norm": 0.7923755023892722, + "learning_rate": 3.732761780587993e-05, + "loss": 0.1778, + "step": 872 + }, + { + "epoch": 0.19, + "grad_norm": 0.7170539000161786, + "learning_rate": 3.732050807568878e-05, + "loss": 0.1676, + "step": 873 + }, + { + "epoch": 0.19, + "grad_norm": 0.6665991411184689, + "learning_rate": 3.7313389579482315e-05, + "loss": 0.1833, + "step": 874 + }, + { + "epoch": 0.19, + "grad_norm": 0.6874944513270407, + "learning_rate": 3.7306262320863245e-05, + "loss": 0.161, + "step": 875 + }, + { + "epoch": 0.19, + "grad_norm": 0.6105940436302919, + "learning_rate": 3.729912630343874e-05, + "loss": 0.1261, + "step": 876 + }, + { + "epoch": 0.19, + "grad_norm": 0.6619533928976752, + "learning_rate": 3.729198153082036e-05, + "loss": 0.1569, + "step": 877 + }, + { + "epoch": 0.19, + "grad_norm": 0.569243487381569, + "learning_rate": 3.7284828006624125e-05, + "loss": 0.1229, + "step": 878 + }, + { + "epoch": 0.19, + "grad_norm": 0.7142199102812501, + "learning_rate": 3.7277665734470476e-05, + "loss": 0.1871, + "step": 879 + }, + { + "epoch": 0.19, + "grad_norm": 0.629269678533343, + "learning_rate": 3.727049471798429e-05, + "loss": 0.1472, + "step": 880 + }, + { + "epoch": 0.19, + "grad_norm": 0.654901322109771, + "learning_rate": 3.726331496079486e-05, + "loss": 0.149, + "step": 881 + }, + { + "epoch": 0.19, + "grad_norm": 0.6841025284739801, + "learning_rate": 3.7256126466535896e-05, + "loss": 0.1534, + "step": 882 + }, + { + "epoch": 0.19, + "grad_norm": 0.7865678235231162, + "learning_rate": 3.724892923884555e-05, + "loss": 0.2295, + "step": 883 + }, + { + "epoch": 0.19, + "grad_norm": 0.7841110484485813, + "learning_rate": 3.724172328136638e-05, + "loss": 0.2037, + "step": 884 + }, + { + "epoch": 0.19, + "grad_norm": 0.6599561399278121, + "learning_rate": 3.723450859774536e-05, + "loss": 0.1311, + "step": 885 + }, + { + "epoch": 0.19, + "grad_norm": 0.6987130451495835, + "learning_rate": 3.7227285191633894e-05, + "loss": 0.1495, + "step": 886 + }, + { + "epoch": 0.19, + "grad_norm": 0.7402887248919082, + "learning_rate": 3.722005306668778e-05, + "loss": 0.1579, + "step": 887 + }, + { + "epoch": 0.2, + "grad_norm": 0.6984442998567899, + "learning_rate": 3.721281222656725e-05, + "loss": 0.1619, + "step": 888 + }, + { + "epoch": 0.2, + "grad_norm": 0.7065217305585293, + "learning_rate": 3.7205562674936945e-05, + "loss": 0.1709, + "step": 889 + }, + { + "epoch": 0.2, + "grad_norm": 0.732226984161801, + "learning_rate": 3.719830441546589e-05, + "loss": 0.1782, + "step": 890 + }, + { + "epoch": 0.2, + "grad_norm": 0.6858639758075553, + "learning_rate": 3.7191037451827545e-05, + "loss": 0.1719, + "step": 891 + }, + { + "epoch": 0.2, + "grad_norm": 0.7299863476978986, + "learning_rate": 3.718376178769976e-05, + "loss": 0.2216, + "step": 892 + }, + { + "epoch": 0.2, + "grad_norm": 0.8304332736342658, + "learning_rate": 3.71764774267648e-05, + "loss": 0.2096, + "step": 893 + }, + { + "epoch": 0.2, + "grad_norm": 0.7169787516591115, + "learning_rate": 3.716918437270932e-05, + "loss": 0.1684, + "step": 894 + }, + { + "epoch": 0.2, + "grad_norm": 0.7166803665478112, + "learning_rate": 3.7161882629224386e-05, + "loss": 0.1607, + "step": 895 + }, + { + "epoch": 0.2, + "grad_norm": 0.724778723656452, + "learning_rate": 3.7154572200005446e-05, + "loss": 0.1759, + "step": 896 + }, + { + "epoch": 0.2, + "grad_norm": 0.7554750870258391, + "learning_rate": 3.714725308875236e-05, + "loss": 0.2013, + "step": 897 + }, + { + "epoch": 0.2, + "grad_norm": 0.7150835979381013, + "learning_rate": 3.713992529916936e-05, + "loss": 0.1539, + "step": 898 + }, + { + "epoch": 0.2, + "grad_norm": 0.6787440010634678, + "learning_rate": 3.7132588834965104e-05, + "loss": 0.1945, + "step": 899 + }, + { + "epoch": 0.2, + "grad_norm": 0.6542829825221722, + "learning_rate": 3.712524369985262e-05, + "loss": 0.1392, + "step": 900 + }, + { + "epoch": 0.2, + "grad_norm": 0.665774640200682, + "learning_rate": 3.711788989754931e-05, + "loss": 0.1615, + "step": 901 + }, + { + "epoch": 0.2, + "grad_norm": 0.7030846690161123, + "learning_rate": 3.711052743177699e-05, + "loss": 0.1855, + "step": 902 + }, + { + "epoch": 0.2, + "grad_norm": 0.7936334689648732, + "learning_rate": 3.710315630626185e-05, + "loss": 0.2092, + "step": 903 + }, + { + "epoch": 0.2, + "grad_norm": 0.6753815367370155, + "learning_rate": 3.7095776524734464e-05, + "loss": 0.1527, + "step": 904 + }, + { + "epoch": 0.2, + "grad_norm": 0.7084634511777324, + "learning_rate": 3.7088388090929776e-05, + "loss": 0.1491, + "step": 905 + }, + { + "epoch": 0.2, + "grad_norm": 0.7112534208835603, + "learning_rate": 3.708099100858712e-05, + "loss": 0.1755, + "step": 906 + }, + { + "epoch": 0.2, + "grad_norm": 0.6211746451040592, + "learning_rate": 3.7073585281450206e-05, + "loss": 0.1484, + "step": 907 + }, + { + "epoch": 0.2, + "grad_norm": 0.6404982883466234, + "learning_rate": 3.706617091326712e-05, + "loss": 0.1445, + "step": 908 + }, + { + "epoch": 0.2, + "grad_norm": 0.7272965199740121, + "learning_rate": 3.705874790779032e-05, + "loss": 0.1616, + "step": 909 + }, + { + "epoch": 0.2, + "grad_norm": 0.7658933441744524, + "learning_rate": 3.705131626877664e-05, + "loss": 0.2129, + "step": 910 + }, + { + "epoch": 0.2, + "grad_norm": 0.6850082803142199, + "learning_rate": 3.7043875999987254e-05, + "loss": 0.1677, + "step": 911 + }, + { + "epoch": 0.2, + "grad_norm": 0.6362997911677422, + "learning_rate": 3.7036427105187754e-05, + "loss": 0.1608, + "step": 912 + }, + { + "epoch": 0.2, + "grad_norm": 0.6987606444597272, + "learning_rate": 3.7028969588148056e-05, + "loss": 0.1946, + "step": 913 + }, + { + "epoch": 0.2, + "grad_norm": 0.7026652210384798, + "learning_rate": 3.702150345264247e-05, + "loss": 0.1781, + "step": 914 + }, + { + "epoch": 0.2, + "grad_norm": 0.7209204369399616, + "learning_rate": 3.701402870244963e-05, + "loss": 0.1754, + "step": 915 + }, + { + "epoch": 0.2, + "grad_norm": 0.727384123209215, + "learning_rate": 3.700654534135257e-05, + "loss": 0.1557, + "step": 916 + }, + { + "epoch": 0.2, + "grad_norm": 0.6332642391018083, + "learning_rate": 3.699905337313866e-05, + "loss": 0.1417, + "step": 917 + }, + { + "epoch": 0.2, + "grad_norm": 0.6780052852874722, + "learning_rate": 3.699155280159964e-05, + "loss": 0.1562, + "step": 918 + }, + { + "epoch": 0.2, + "grad_norm": 0.7154214327932445, + "learning_rate": 3.698404363053158e-05, + "loss": 0.1461, + "step": 919 + }, + { + "epoch": 0.2, + "grad_norm": 0.700188076774634, + "learning_rate": 3.697652586373493e-05, + "loss": 0.1707, + "step": 920 + }, + { + "epoch": 0.2, + "grad_norm": 0.7230025827617601, + "learning_rate": 3.696899950501447e-05, + "loss": 0.1257, + "step": 921 + }, + { + "epoch": 0.2, + "grad_norm": 0.778229060740858, + "learning_rate": 3.6961464558179333e-05, + "loss": 0.1885, + "step": 922 + }, + { + "epoch": 0.2, + "grad_norm": 0.7561932991777188, + "learning_rate": 3.695392102704302e-05, + "loss": 0.171, + "step": 923 + }, + { + "epoch": 0.2, + "grad_norm": 0.7260621922208205, + "learning_rate": 3.694636891542334e-05, + "loss": 0.1571, + "step": 924 + }, + { + "epoch": 0.2, + "grad_norm": 0.7317835353052108, + "learning_rate": 3.693880822714247e-05, + "loss": 0.1773, + "step": 925 + }, + { + "epoch": 0.2, + "grad_norm": 0.6661179092038415, + "learning_rate": 3.693123896602692e-05, + "loss": 0.1748, + "step": 926 + }, + { + "epoch": 0.2, + "grad_norm": 0.6743277867135127, + "learning_rate": 3.692366113590754e-05, + "loss": 0.1851, + "step": 927 + }, + { + "epoch": 0.2, + "grad_norm": 0.6938020824093218, + "learning_rate": 3.691607474061951e-05, + "loss": 0.164, + "step": 928 + }, + { + "epoch": 0.2, + "grad_norm": 0.6384162741338156, + "learning_rate": 3.690847978400236e-05, + "loss": 0.1469, + "step": 929 + }, + { + "epoch": 0.2, + "grad_norm": 0.6410681192219297, + "learning_rate": 3.690087626989994e-05, + "loss": 0.1861, + "step": 930 + }, + { + "epoch": 0.2, + "grad_norm": 0.5976785393840714, + "learning_rate": 3.689326420216044e-05, + "loss": 0.1303, + "step": 931 + }, + { + "epoch": 0.2, + "grad_norm": 0.6056843527575427, + "learning_rate": 3.6885643584636366e-05, + "loss": 0.1374, + "step": 932 + }, + { + "epoch": 0.2, + "grad_norm": 0.6691852000232525, + "learning_rate": 3.6878014421184565e-05, + "loss": 0.1767, + "step": 933 + }, + { + "epoch": 0.21, + "grad_norm": 0.6218312041996754, + "learning_rate": 3.68703767156662e-05, + "loss": 0.1642, + "step": 934 + }, + { + "epoch": 0.21, + "grad_norm": 0.6618795080274562, + "learning_rate": 3.6862730471946766e-05, + "loss": 0.1342, + "step": 935 + }, + { + "epoch": 0.21, + "grad_norm": 0.6863753483288044, + "learning_rate": 3.685507569389606e-05, + "loss": 0.1342, + "step": 936 + }, + { + "epoch": 0.21, + "grad_norm": 0.6551060924803995, + "learning_rate": 3.6847412385388236e-05, + "loss": 0.1366, + "step": 937 + }, + { + "epoch": 0.21, + "grad_norm": 0.7465156840052052, + "learning_rate": 3.683974055030172e-05, + "loss": 0.1627, + "step": 938 + }, + { + "epoch": 0.21, + "grad_norm": 0.6625105281209221, + "learning_rate": 3.6832060192519286e-05, + "loss": 0.1526, + "step": 939 + }, + { + "epoch": 0.21, + "grad_norm": 0.7086275742437467, + "learning_rate": 3.6824371315928e-05, + "loss": 0.1485, + "step": 940 + }, + { + "epoch": 0.21, + "grad_norm": 0.7417666798211808, + "learning_rate": 3.681667392441926e-05, + "loss": 0.1433, + "step": 941 + }, + { + "epoch": 0.21, + "grad_norm": 0.7386253806277999, + "learning_rate": 3.680896802188876e-05, + "loss": 0.1707, + "step": 942 + }, + { + "epoch": 0.21, + "grad_norm": 0.7037793611394684, + "learning_rate": 3.6801253612236506e-05, + "loss": 0.1838, + "step": 943 + }, + { + "epoch": 0.21, + "grad_norm": 0.6457543991417521, + "learning_rate": 3.679353069936681e-05, + "loss": 0.1514, + "step": 944 + }, + { + "epoch": 0.21, + "grad_norm": 0.7350562567817532, + "learning_rate": 3.678579928718827e-05, + "loss": 0.1374, + "step": 945 + }, + { + "epoch": 0.21, + "grad_norm": 0.6461300988449086, + "learning_rate": 3.6778059379613815e-05, + "loss": 0.1402, + "step": 946 + }, + { + "epoch": 0.21, + "grad_norm": 0.6394658184714149, + "learning_rate": 3.6770310980560654e-05, + "loss": 0.1477, + "step": 947 + }, + { + "epoch": 0.21, + "grad_norm": 0.7594333893105324, + "learning_rate": 3.676255409395031e-05, + "loss": 0.2031, + "step": 948 + }, + { + "epoch": 0.21, + "grad_norm": 0.7019386880161659, + "learning_rate": 3.675478872370858e-05, + "loss": 0.1665, + "step": 949 + }, + { + "epoch": 0.21, + "grad_norm": 0.6626999283463161, + "learning_rate": 3.674701487376557e-05, + "loss": 0.1439, + "step": 950 + }, + { + "epoch": 0.21, + "grad_norm": 0.6551620456940492, + "learning_rate": 3.673923254805566e-05, + "loss": 0.1425, + "step": 951 + }, + { + "epoch": 0.21, + "grad_norm": 0.63563867244813, + "learning_rate": 3.6731441750517566e-05, + "loss": 0.1256, + "step": 952 + }, + { + "epoch": 0.21, + "grad_norm": 0.6957176828119622, + "learning_rate": 3.672364248509422e-05, + "loss": 0.147, + "step": 953 + }, + { + "epoch": 0.21, + "grad_norm": 0.7562490242565576, + "learning_rate": 3.67158347557329e-05, + "loss": 0.2017, + "step": 954 + }, + { + "epoch": 0.21, + "grad_norm": 0.6225715905123154, + "learning_rate": 3.670801856638514e-05, + "loss": 0.1394, + "step": 955 + }, + { + "epoch": 0.21, + "grad_norm": 0.678394908191864, + "learning_rate": 3.6700193921006766e-05, + "loss": 0.1524, + "step": 956 + }, + { + "epoch": 0.21, + "grad_norm": 0.7398774587490798, + "learning_rate": 3.669236082355787e-05, + "loss": 0.1792, + "step": 957 + }, + { + "epoch": 0.21, + "grad_norm": 0.6206214596588789, + "learning_rate": 3.668451927800283e-05, + "loss": 0.1363, + "step": 958 + }, + { + "epoch": 0.21, + "grad_norm": 0.6027596192132293, + "learning_rate": 3.667666928831032e-05, + "loss": 0.1196, + "step": 959 + }, + { + "epoch": 0.21, + "grad_norm": 0.7351357598038659, + "learning_rate": 3.666881085845324e-05, + "loss": 0.2001, + "step": 960 + }, + { + "epoch": 0.21, + "grad_norm": 0.5988897173734646, + "learning_rate": 3.6660943992408817e-05, + "loss": 0.1227, + "step": 961 + }, + { + "epoch": 0.21, + "grad_norm": 0.6458349279158401, + "learning_rate": 3.66530686941585e-05, + "loss": 0.1292, + "step": 962 + }, + { + "epoch": 0.21, + "grad_norm": 0.7829938118934919, + "learning_rate": 3.664518496768802e-05, + "loss": 0.1853, + "step": 963 + }, + { + "epoch": 0.21, + "grad_norm": 0.6575365851561895, + "learning_rate": 3.663729281698741e-05, + "loss": 0.1553, + "step": 964 + }, + { + "epoch": 0.21, + "grad_norm": 0.6642437855720914, + "learning_rate": 3.662939224605091e-05, + "loss": 0.142, + "step": 965 + }, + { + "epoch": 0.21, + "grad_norm": 0.6479234151294552, + "learning_rate": 3.6621483258877055e-05, + "loss": 0.1318, + "step": 966 + }, + { + "epoch": 0.21, + "grad_norm": 0.7309571127932903, + "learning_rate": 3.6613565859468626e-05, + "loss": 0.1765, + "step": 967 + }, + { + "epoch": 0.21, + "grad_norm": 0.7002418747321221, + "learning_rate": 3.660564005183268e-05, + "loss": 0.179, + "step": 968 + }, + { + "epoch": 0.21, + "grad_norm": 0.7563400514440753, + "learning_rate": 3.659770583998051e-05, + "loss": 0.1672, + "step": 969 + }, + { + "epoch": 0.21, + "grad_norm": 0.77837280709059, + "learning_rate": 3.658976322792766e-05, + "loss": 0.2449, + "step": 970 + }, + { + "epoch": 0.21, + "grad_norm": 0.6816422864134033, + "learning_rate": 3.658181221969395e-05, + "loss": 0.1447, + "step": 971 + }, + { + "epoch": 0.21, + "grad_norm": 0.6553011346867443, + "learning_rate": 3.657385281930343e-05, + "loss": 0.1439, + "step": 972 + }, + { + "epoch": 0.21, + "grad_norm": 0.5941559257267682, + "learning_rate": 3.65658850307844e-05, + "loss": 0.1584, + "step": 973 + }, + { + "epoch": 0.21, + "grad_norm": 0.6940833811429217, + "learning_rate": 3.65579088581694e-05, + "loss": 0.1608, + "step": 974 + }, + { + "epoch": 0.21, + "grad_norm": 0.6568072768488372, + "learning_rate": 3.6549924305495225e-05, + "loss": 0.1553, + "step": 975 + }, + { + "epoch": 0.21, + "grad_norm": 0.7325600048784592, + "learning_rate": 3.6541931376802906e-05, + "loss": 0.1493, + "step": 976 + }, + { + "epoch": 0.21, + "grad_norm": 0.5516350533636646, + "learning_rate": 3.653393007613771e-05, + "loss": 0.1195, + "step": 977 + }, + { + "epoch": 0.21, + "grad_norm": 0.7327823413100717, + "learning_rate": 3.652592040754917e-05, + "loss": 0.1737, + "step": 978 + }, + { + "epoch": 0.22, + "grad_norm": 0.6378932955160965, + "learning_rate": 3.651790237509098e-05, + "loss": 0.1621, + "step": 979 + }, + { + "epoch": 0.22, + "grad_norm": 0.6945962967680264, + "learning_rate": 3.650987598282116e-05, + "loss": 0.1377, + "step": 980 + }, + { + "epoch": 0.22, + "grad_norm": 0.6427290487016571, + "learning_rate": 3.6501841234801886e-05, + "loss": 0.1622, + "step": 981 + }, + { + "epoch": 0.22, + "grad_norm": 0.6658021914902404, + "learning_rate": 3.649379813509961e-05, + "loss": 0.1562, + "step": 982 + }, + { + "epoch": 0.22, + "grad_norm": 0.6810380080871552, + "learning_rate": 3.648574668778499e-05, + "loss": 0.1648, + "step": 983 + }, + { + "epoch": 0.22, + "grad_norm": 0.7029924267717719, + "learning_rate": 3.647768689693291e-05, + "loss": 0.1484, + "step": 984 + }, + { + "epoch": 0.22, + "grad_norm": 0.8445207819259016, + "learning_rate": 3.646961876662248e-05, + "loss": 0.2061, + "step": 985 + }, + { + "epoch": 0.22, + "grad_norm": 0.6621453829626172, + "learning_rate": 3.6461542300937035e-05, + "loss": 0.1546, + "step": 986 + }, + { + "epoch": 0.22, + "grad_norm": 0.7710667892427043, + "learning_rate": 3.645345750396412e-05, + "loss": 0.186, + "step": 987 + }, + { + "epoch": 0.22, + "grad_norm": 0.6722052228392159, + "learning_rate": 3.64453643797955e-05, + "loss": 0.1712, + "step": 988 + }, + { + "epoch": 0.22, + "grad_norm": 0.6294382345032234, + "learning_rate": 3.643726293252717e-05, + "loss": 0.1267, + "step": 989 + }, + { + "epoch": 0.22, + "grad_norm": 0.6357988604761875, + "learning_rate": 3.642915316625929e-05, + "loss": 0.14, + "step": 990 + }, + { + "epoch": 0.22, + "grad_norm": 0.7048080222855339, + "learning_rate": 3.642103508509629e-05, + "loss": 0.1741, + "step": 991 + }, + { + "epoch": 0.22, + "grad_norm": 0.5709092662827024, + "learning_rate": 3.641290869314676e-05, + "loss": 0.1459, + "step": 992 + }, + { + "epoch": 0.22, + "grad_norm": 0.7015535808582499, + "learning_rate": 3.640477399452354e-05, + "loss": 0.1733, + "step": 993 + }, + { + "epoch": 0.22, + "grad_norm": 0.6066420477109242, + "learning_rate": 3.639663099334363e-05, + "loss": 0.1353, + "step": 994 + }, + { + "epoch": 0.22, + "grad_norm": 0.5647230459909748, + "learning_rate": 3.6388479693728266e-05, + "loss": 0.1051, + "step": 995 + }, + { + "epoch": 0.22, + "grad_norm": 0.6302293400902045, + "learning_rate": 3.638032009980286e-05, + "loss": 0.1498, + "step": 996 + }, + { + "epoch": 0.22, + "grad_norm": 0.7179210692669409, + "learning_rate": 3.637215221569705e-05, + "loss": 0.1741, + "step": 997 + }, + { + "epoch": 0.22, + "grad_norm": 0.6376237580841241, + "learning_rate": 3.636397604554463e-05, + "loss": 0.1267, + "step": 998 + }, + { + "epoch": 0.22, + "grad_norm": 0.6459328410696206, + "learning_rate": 3.635579159348362e-05, + "loss": 0.1058, + "step": 999 + }, + { + "epoch": 0.22, + "grad_norm": 0.7772120071799035, + "learning_rate": 3.634759886365623e-05, + "loss": 0.1715, + "step": 1000 + }, + { + "epoch": 0.22, + "grad_norm": 0.6411320145391166, + "learning_rate": 3.633939786020884e-05, + "loss": 0.1518, + "step": 1001 + }, + { + "epoch": 0.22, + "grad_norm": 0.6501649573892895, + "learning_rate": 3.633118858729203e-05, + "loss": 0.1375, + "step": 1002 + }, + { + "epoch": 0.22, + "grad_norm": 0.6743897461008136, + "learning_rate": 3.632297104906057e-05, + "loss": 0.156, + "step": 1003 + }, + { + "epoch": 0.22, + "grad_norm": 0.6672403479914584, + "learning_rate": 3.63147452496734e-05, + "loss": 0.1124, + "step": 1004 + }, + { + "epoch": 0.22, + "grad_norm": 0.6429425165843409, + "learning_rate": 3.6306511193293636e-05, + "loss": 0.1337, + "step": 1005 + }, + { + "epoch": 0.22, + "grad_norm": 0.6776541626704364, + "learning_rate": 3.629826888408861e-05, + "loss": 0.1574, + "step": 1006 + }, + { + "epoch": 0.22, + "grad_norm": 0.7718525068581041, + "learning_rate": 3.629001832622979e-05, + "loss": 0.2403, + "step": 1007 + }, + { + "epoch": 0.22, + "grad_norm": 0.6612823182052278, + "learning_rate": 3.628175952389283e-05, + "loss": 0.1602, + "step": 1008 + }, + { + "epoch": 0.22, + "grad_norm": 0.6752689823461115, + "learning_rate": 3.627349248125757e-05, + "loss": 0.1424, + "step": 1009 + }, + { + "epoch": 0.22, + "grad_norm": 0.5796774048453173, + "learning_rate": 3.6265217202508006e-05, + "loss": 0.134, + "step": 1010 + }, + { + "epoch": 0.22, + "grad_norm": 0.6309566599944132, + "learning_rate": 3.625693369183231e-05, + "loss": 0.1365, + "step": 1011 + }, + { + "epoch": 0.22, + "grad_norm": 0.6002505226450737, + "learning_rate": 3.624864195342281e-05, + "loss": 0.144, + "step": 1012 + }, + { + "epoch": 0.22, + "grad_norm": 0.7674195414564663, + "learning_rate": 3.624034199147602e-05, + "loss": 0.1914, + "step": 1013 + }, + { + "epoch": 0.22, + "grad_norm": 0.7049269580203319, + "learning_rate": 3.623203381019259e-05, + "loss": 0.1618, + "step": 1014 + }, + { + "epoch": 0.22, + "grad_norm": 0.6315551794383809, + "learning_rate": 3.6223717413777346e-05, + "loss": 0.116, + "step": 1015 + }, + { + "epoch": 0.22, + "grad_norm": 0.6859663850789501, + "learning_rate": 3.621539280643926e-05, + "loss": 0.1655, + "step": 1016 + }, + { + "epoch": 0.22, + "grad_norm": 0.6495537170312613, + "learning_rate": 3.620705999239148e-05, + "loss": 0.1481, + "step": 1017 + }, + { + "epoch": 0.22, + "grad_norm": 0.6116131249215796, + "learning_rate": 3.619871897585129e-05, + "loss": 0.1341, + "step": 1018 + }, + { + "epoch": 0.22, + "grad_norm": 0.5413486818915718, + "learning_rate": 3.6190369761040116e-05, + "loss": 0.1103, + "step": 1019 + }, + { + "epoch": 0.22, + "grad_norm": 0.5985065221200078, + "learning_rate": 3.618201235218356e-05, + "loss": 0.1247, + "step": 1020 + }, + { + "epoch": 0.22, + "grad_norm": 0.6556592734863363, + "learning_rate": 3.617364675351136e-05, + "loss": 0.1522, + "step": 1021 + }, + { + "epoch": 0.22, + "grad_norm": 0.5977013764143297, + "learning_rate": 3.61652729692574e-05, + "loss": 0.1161, + "step": 1022 + }, + { + "epoch": 0.22, + "grad_norm": 0.7310760747668861, + "learning_rate": 3.615689100365968e-05, + "loss": 0.1906, + "step": 1023 + }, + { + "epoch": 0.22, + "grad_norm": 0.6738576689062206, + "learning_rate": 3.6148500860960386e-05, + "loss": 0.1552, + "step": 1024 + }, + { + "epoch": 0.23, + "grad_norm": 0.5956457892169247, + "learning_rate": 3.614010254540581e-05, + "loss": 0.1199, + "step": 1025 + }, + { + "epoch": 0.23, + "grad_norm": 0.6877198301331007, + "learning_rate": 3.6131696061246405e-05, + "loss": 0.1557, + "step": 1026 + }, + { + "epoch": 0.23, + "grad_norm": 0.6930864253545237, + "learning_rate": 3.612328141273673e-05, + "loss": 0.1761, + "step": 1027 + }, + { + "epoch": 0.23, + "grad_norm": 0.711283547158347, + "learning_rate": 3.6114858604135496e-05, + "loss": 0.1623, + "step": 1028 + }, + { + "epoch": 0.23, + "grad_norm": 0.729195653248427, + "learning_rate": 3.610642763970553e-05, + "loss": 0.1848, + "step": 1029 + }, + { + "epoch": 0.23, + "grad_norm": 0.563524690420537, + "learning_rate": 3.6097988523713816e-05, + "loss": 0.1184, + "step": 1030 + }, + { + "epoch": 0.23, + "grad_norm": 0.6577990052567404, + "learning_rate": 3.608954126043141e-05, + "loss": 0.1756, + "step": 1031 + }, + { + "epoch": 0.23, + "grad_norm": 0.6586686025055664, + "learning_rate": 3.608108585413356e-05, + "loss": 0.15, + "step": 1032 + }, + { + "epoch": 0.23, + "grad_norm": 0.5790512809345955, + "learning_rate": 3.6072622309099566e-05, + "loss": 0.1078, + "step": 1033 + }, + { + "epoch": 0.23, + "grad_norm": 0.7169676117411847, + "learning_rate": 3.60641506296129e-05, + "loss": 0.1548, + "step": 1034 + }, + { + "epoch": 0.23, + "grad_norm": 0.6659877021730962, + "learning_rate": 3.605567081996113e-05, + "loss": 0.1395, + "step": 1035 + }, + { + "epoch": 0.23, + "grad_norm": 0.6933439870926703, + "learning_rate": 3.604718288443593e-05, + "loss": 0.162, + "step": 1036 + }, + { + "epoch": 0.23, + "grad_norm": 0.6841192351087376, + "learning_rate": 3.60386868273331e-05, + "loss": 0.1639, + "step": 1037 + }, + { + "epoch": 0.23, + "grad_norm": 0.7478730320930891, + "learning_rate": 3.603018265295255e-05, + "loss": 0.1938, + "step": 1038 + }, + { + "epoch": 0.23, + "grad_norm": 0.6873603592288243, + "learning_rate": 3.60216703655983e-05, + "loss": 0.1807, + "step": 1039 + }, + { + "epoch": 0.23, + "grad_norm": 0.6615960711149769, + "learning_rate": 3.601314996957845e-05, + "loss": 0.1431, + "step": 1040 + }, + { + "epoch": 0.23, + "grad_norm": 0.6790527141864818, + "learning_rate": 3.600462146920525e-05, + "loss": 0.1507, + "step": 1041 + }, + { + "epoch": 0.23, + "grad_norm": 0.6228395314339072, + "learning_rate": 3.5996084868795015e-05, + "loss": 0.1275, + "step": 1042 + }, + { + "epoch": 0.23, + "grad_norm": 0.6983925184063805, + "learning_rate": 3.5987540172668164e-05, + "loss": 0.1587, + "step": 1043 + }, + { + "epoch": 0.23, + "grad_norm": 0.6764855215492669, + "learning_rate": 3.597898738514923e-05, + "loss": 0.1606, + "step": 1044 + }, + { + "epoch": 0.23, + "grad_norm": 0.6701212772521431, + "learning_rate": 3.5970426510566824e-05, + "loss": 0.1587, + "step": 1045 + }, + { + "epoch": 0.23, + "grad_norm": 0.6212399628517273, + "learning_rate": 3.5961857553253665e-05, + "loss": 0.1464, + "step": 1046 + }, + { + "epoch": 0.23, + "grad_norm": 0.622063222033276, + "learning_rate": 3.595328051754654e-05, + "loss": 0.1172, + "step": 1047 + }, + { + "epoch": 0.23, + "grad_norm": 0.6384653346584779, + "learning_rate": 3.594469540778637e-05, + "loss": 0.1121, + "step": 1048 + }, + { + "epoch": 0.23, + "grad_norm": 0.704067827281616, + "learning_rate": 3.593610222831809e-05, + "loss": 0.1264, + "step": 1049 + }, + { + "epoch": 0.23, + "grad_norm": 0.655922558247646, + "learning_rate": 3.59275009834908e-05, + "loss": 0.1078, + "step": 1050 + }, + { + "epoch": 0.23, + "grad_norm": 0.7779504143522625, + "learning_rate": 3.591889167765762e-05, + "loss": 0.1519, + "step": 1051 + }, + { + "epoch": 0.23, + "grad_norm": 0.7507066576498731, + "learning_rate": 3.591027431517577e-05, + "loss": 0.1231, + "step": 1052 + }, + { + "epoch": 0.23, + "grad_norm": 0.6901421494497663, + "learning_rate": 3.590164890040657e-05, + "loss": 0.1529, + "step": 1053 + }, + { + "epoch": 0.23, + "grad_norm": 0.7730705084134945, + "learning_rate": 3.589301543771537e-05, + "loss": 0.1819, + "step": 1054 + }, + { + "epoch": 0.23, + "grad_norm": 0.6758688500577673, + "learning_rate": 3.588437393147164e-05, + "loss": 0.1753, + "step": 1055 + }, + { + "epoch": 0.23, + "grad_norm": 0.6888931204638528, + "learning_rate": 3.587572438604889e-05, + "loss": 0.1456, + "step": 1056 + }, + { + "epoch": 0.23, + "grad_norm": 0.7073774385951197, + "learning_rate": 3.586706680582471e-05, + "loss": 0.1722, + "step": 1057 + }, + { + "epoch": 0.23, + "grad_norm": 0.6509485915727481, + "learning_rate": 3.585840119518075e-05, + "loss": 0.1305, + "step": 1058 + }, + { + "epoch": 0.23, + "grad_norm": 0.6538960381990047, + "learning_rate": 3.584972755850273e-05, + "loss": 0.1341, + "step": 1059 + }, + { + "epoch": 0.23, + "grad_norm": 0.6963019242777403, + "learning_rate": 3.584104590018044e-05, + "loss": 0.1436, + "step": 1060 + }, + { + "epoch": 0.23, + "grad_norm": 0.6382712644732104, + "learning_rate": 3.58323562246077e-05, + "loss": 0.1604, + "step": 1061 + }, + { + "epoch": 0.23, + "grad_norm": 0.6796882673237294, + "learning_rate": 3.5823658536182426e-05, + "loss": 0.1758, + "step": 1062 + }, + { + "epoch": 0.23, + "grad_norm": 0.6260051512475788, + "learning_rate": 3.5814952839306574e-05, + "loss": 0.1561, + "step": 1063 + }, + { + "epoch": 0.23, + "grad_norm": 0.6347693340486589, + "learning_rate": 3.580623913838613e-05, + "loss": 0.1605, + "step": 1064 + }, + { + "epoch": 0.23, + "grad_norm": 0.6308013133443539, + "learning_rate": 3.579751743783118e-05, + "loss": 0.1308, + "step": 1065 + }, + { + "epoch": 0.23, + "grad_norm": 0.5812309702967815, + "learning_rate": 3.578878774205581e-05, + "loss": 0.1118, + "step": 1066 + }, + { + "epoch": 0.23, + "grad_norm": 0.6310631390462856, + "learning_rate": 3.578005005547817e-05, + "loss": 0.1333, + "step": 1067 + }, + { + "epoch": 0.23, + "grad_norm": 0.6877280203948094, + "learning_rate": 3.577130438252046e-05, + "loss": 0.1531, + "step": 1068 + }, + { + "epoch": 0.23, + "grad_norm": 0.6483847585163999, + "learning_rate": 3.576255072760893e-05, + "loss": 0.1143, + "step": 1069 + }, + { + "epoch": 0.24, + "grad_norm": 0.6452634640367501, + "learning_rate": 3.575378909517385e-05, + "loss": 0.1313, + "step": 1070 + }, + { + "epoch": 0.24, + "grad_norm": 0.6900713690875435, + "learning_rate": 3.574501948964954e-05, + "loss": 0.1479, + "step": 1071 + }, + { + "epoch": 0.24, + "grad_norm": 0.7639374722338221, + "learning_rate": 3.5736241915474345e-05, + "loss": 0.1593, + "step": 1072 + }, + { + "epoch": 0.24, + "grad_norm": 0.6134283533254882, + "learning_rate": 3.572745637709065e-05, + "loss": 0.1205, + "step": 1073 + }, + { + "epoch": 0.24, + "grad_norm": 0.7151564450937763, + "learning_rate": 3.5718662878944876e-05, + "loss": 0.1524, + "step": 1074 + }, + { + "epoch": 0.24, + "grad_norm": 0.5888968945754794, + "learning_rate": 3.570986142548746e-05, + "loss": 0.1039, + "step": 1075 + }, + { + "epoch": 0.24, + "grad_norm": 0.6295134178319302, + "learning_rate": 3.5701052021172874e-05, + "loss": 0.1472, + "step": 1076 + }, + { + "epoch": 0.24, + "grad_norm": 0.6032800307183738, + "learning_rate": 3.5692234670459615e-05, + "loss": 0.165, + "step": 1077 + }, + { + "epoch": 0.24, + "grad_norm": 0.7610619476509329, + "learning_rate": 3.5683409377810185e-05, + "loss": 0.1877, + "step": 1078 + }, + { + "epoch": 0.24, + "grad_norm": 0.6134598830225814, + "learning_rate": 3.567457614769113e-05, + "loss": 0.1307, + "step": 1079 + }, + { + "epoch": 0.24, + "grad_norm": 0.6395293146696429, + "learning_rate": 3.566573498457301e-05, + "loss": 0.1499, + "step": 1080 + }, + { + "epoch": 0.24, + "grad_norm": 0.6567376459395161, + "learning_rate": 3.5656885892930376e-05, + "loss": 0.1432, + "step": 1081 + }, + { + "epoch": 0.24, + "grad_norm": 0.6329789825719756, + "learning_rate": 3.564802887724181e-05, + "loss": 0.1358, + "step": 1082 + }, + { + "epoch": 0.24, + "grad_norm": 0.7292218096636005, + "learning_rate": 3.563916394198991e-05, + "loss": 0.1604, + "step": 1083 + }, + { + "epoch": 0.24, + "grad_norm": 0.683673200077468, + "learning_rate": 3.5630291091661276e-05, + "loss": 0.136, + "step": 1084 + }, + { + "epoch": 0.24, + "grad_norm": 0.5416118215035685, + "learning_rate": 3.562141033074649e-05, + "loss": 0.1176, + "step": 1085 + }, + { + "epoch": 0.24, + "grad_norm": 0.61694461534053, + "learning_rate": 3.5612521663740183e-05, + "loss": 0.1251, + "step": 1086 + }, + { + "epoch": 0.24, + "grad_norm": 0.6421387164295951, + "learning_rate": 3.560362509514096e-05, + "loss": 0.1551, + "step": 1087 + }, + { + "epoch": 0.24, + "grad_norm": 0.6245773793912889, + "learning_rate": 3.5594720629451414e-05, + "loss": 0.1375, + "step": 1088 + }, + { + "epoch": 0.24, + "grad_norm": 0.5643563897157868, + "learning_rate": 3.558580827117817e-05, + "loss": 0.1104, + "step": 1089 + }, + { + "epoch": 0.24, + "grad_norm": 0.6756127207978516, + "learning_rate": 3.557688802483181e-05, + "loss": 0.1397, + "step": 1090 + }, + { + "epoch": 0.24, + "grad_norm": 0.6775988792391023, + "learning_rate": 3.556795989492694e-05, + "loss": 0.1342, + "step": 1091 + }, + { + "epoch": 0.24, + "grad_norm": 0.7394963916306223, + "learning_rate": 3.555902388598213e-05, + "loss": 0.1246, + "step": 1092 + }, + { + "epoch": 0.24, + "grad_norm": 0.6586100511807396, + "learning_rate": 3.555008000251995e-05, + "loss": 0.1442, + "step": 1093 + }, + { + "epoch": 0.24, + "grad_norm": 0.6084882691317554, + "learning_rate": 3.554112824906696e-05, + "loss": 0.1148, + "step": 1094 + }, + { + "epoch": 0.24, + "grad_norm": 0.6846720447127864, + "learning_rate": 3.55321686301537e-05, + "loss": 0.1451, + "step": 1095 + }, + { + "epoch": 0.24, + "grad_norm": 0.7111722537625023, + "learning_rate": 3.552320115031468e-05, + "loss": 0.1472, + "step": 1096 + }, + { + "epoch": 0.24, + "grad_norm": 0.6502074020788126, + "learning_rate": 3.55142258140884e-05, + "loss": 0.15, + "step": 1097 + }, + { + "epoch": 0.24, + "grad_norm": 0.6718778610168578, + "learning_rate": 3.5505242626017326e-05, + "loss": 0.1447, + "step": 1098 + }, + { + "epoch": 0.24, + "grad_norm": 0.6165806889739158, + "learning_rate": 3.549625159064792e-05, + "loss": 0.133, + "step": 1099 + }, + { + "epoch": 0.24, + "grad_norm": 0.6153270348904686, + "learning_rate": 3.5487252712530583e-05, + "loss": 0.1295, + "step": 1100 + }, + { + "epoch": 0.24, + "grad_norm": 0.6274481272059175, + "learning_rate": 3.547824599621971e-05, + "loss": 0.1329, + "step": 1101 + }, + { + "epoch": 0.24, + "grad_norm": 0.661905555524102, + "learning_rate": 3.546923144627366e-05, + "loss": 0.1649, + "step": 1102 + }, + { + "epoch": 0.24, + "grad_norm": 0.5984050572617661, + "learning_rate": 3.546020906725474e-05, + "loss": 0.1405, + "step": 1103 + }, + { + "epoch": 0.24, + "grad_norm": 0.5674648783650594, + "learning_rate": 3.5451178863729244e-05, + "loss": 0.1001, + "step": 1104 + }, + { + "epoch": 0.24, + "grad_norm": 0.5261440618084035, + "learning_rate": 3.5442140840267404e-05, + "loss": 0.1103, + "step": 1105 + }, + { + "epoch": 0.24, + "grad_norm": 0.6960247977514382, + "learning_rate": 3.543309500144343e-05, + "loss": 0.159, + "step": 1106 + }, + { + "epoch": 0.24, + "grad_norm": 0.6214548177062327, + "learning_rate": 3.542404135183547e-05, + "loss": 0.1372, + "step": 1107 + }, + { + "epoch": 0.24, + "grad_norm": 0.6376840494046262, + "learning_rate": 3.541497989602562e-05, + "loss": 0.1349, + "step": 1108 + }, + { + "epoch": 0.24, + "grad_norm": 0.6090079449196457, + "learning_rate": 3.540591063859996e-05, + "loss": 0.124, + "step": 1109 + }, + { + "epoch": 0.24, + "grad_norm": 0.6539680223900682, + "learning_rate": 3.539683358414848e-05, + "loss": 0.1531, + "step": 1110 + }, + { + "epoch": 0.24, + "grad_norm": 0.7891131847457237, + "learning_rate": 3.538774873726514e-05, + "loss": 0.172, + "step": 1111 + }, + { + "epoch": 0.24, + "grad_norm": 0.6280573215367934, + "learning_rate": 3.537865610254784e-05, + "loss": 0.1262, + "step": 1112 + }, + { + "epoch": 0.24, + "grad_norm": 0.6193859444660762, + "learning_rate": 3.536955568459841e-05, + "loss": 0.1145, + "step": 1113 + }, + { + "epoch": 0.24, + "grad_norm": 0.5785477613220341, + "learning_rate": 3.536044748802263e-05, + "loss": 0.143, + "step": 1114 + }, + { + "epoch": 0.24, + "grad_norm": 0.5395700856893703, + "learning_rate": 3.535133151743022e-05, + "loss": 0.1128, + "step": 1115 + }, + { + "epoch": 0.25, + "grad_norm": 0.5701944934567039, + "learning_rate": 3.534220777743482e-05, + "loss": 0.1186, + "step": 1116 + }, + { + "epoch": 0.25, + "grad_norm": 0.6152525157373263, + "learning_rate": 3.5333076272654014e-05, + "loss": 0.128, + "step": 1117 + }, + { + "epoch": 0.25, + "grad_norm": 0.6256664061206213, + "learning_rate": 3.532393700770932e-05, + "loss": 0.1221, + "step": 1118 + }, + { + "epoch": 0.25, + "grad_norm": 0.6757214911173773, + "learning_rate": 3.5314789987226156e-05, + "loss": 0.1129, + "step": 1119 + }, + { + "epoch": 0.25, + "grad_norm": 0.6246653137541823, + "learning_rate": 3.5305635215833914e-05, + "loss": 0.1228, + "step": 1120 + }, + { + "epoch": 0.25, + "grad_norm": 0.6707085530900672, + "learning_rate": 3.5296472698165856e-05, + "loss": 0.1376, + "step": 1121 + }, + { + "epoch": 0.25, + "grad_norm": 0.6179075025066263, + "learning_rate": 3.5287302438859204e-05, + "loss": 0.1042, + "step": 1122 + }, + { + "epoch": 0.25, + "grad_norm": 0.6481782785733887, + "learning_rate": 3.5278124442555066e-05, + "loss": 0.1291, + "step": 1123 + }, + { + "epoch": 0.25, + "grad_norm": 0.6074076709129159, + "learning_rate": 3.526893871389849e-05, + "loss": 0.1111, + "step": 1124 + }, + { + "epoch": 0.25, + "grad_norm": 0.6630142240551968, + "learning_rate": 3.5259745257538443e-05, + "loss": 0.1322, + "step": 1125 + }, + { + "epoch": 0.25, + "grad_norm": 0.7361292481937083, + "learning_rate": 3.525054407812777e-05, + "loss": 0.1826, + "step": 1126 + }, + { + "epoch": 0.25, + "grad_norm": 0.5535894189934888, + "learning_rate": 3.524133518032325e-05, + "loss": 0.1087, + "step": 1127 + }, + { + "epoch": 0.25, + "grad_norm": 0.8351112988813633, + "learning_rate": 3.5232118568785565e-05, + "loss": 0.244, + "step": 1128 + }, + { + "epoch": 0.25, + "grad_norm": 0.6121295635932749, + "learning_rate": 3.52228942481793e-05, + "loss": 0.1496, + "step": 1129 + }, + { + "epoch": 0.25, + "grad_norm": 0.5354744737740527, + "learning_rate": 3.5213662223172935e-05, + "loss": 0.1045, + "step": 1130 + }, + { + "epoch": 0.25, + "grad_norm": 0.6452633485710266, + "learning_rate": 3.520442249843887e-05, + "loss": 0.1192, + "step": 1131 + }, + { + "epoch": 0.25, + "grad_norm": 0.6178008785901008, + "learning_rate": 3.5195175078653355e-05, + "loss": 0.1412, + "step": 1132 + }, + { + "epoch": 0.25, + "grad_norm": 0.5797864392513596, + "learning_rate": 3.51859199684966e-05, + "loss": 0.0976, + "step": 1133 + }, + { + "epoch": 0.25, + "grad_norm": 0.6622076271098327, + "learning_rate": 3.517665717265265e-05, + "loss": 0.1363, + "step": 1134 + }, + { + "epoch": 0.25, + "grad_norm": 0.6045944053491373, + "learning_rate": 3.516738669580947e-05, + "loss": 0.1131, + "step": 1135 + }, + { + "epoch": 0.25, + "grad_norm": 0.6398177294162027, + "learning_rate": 3.5158108542658915e-05, + "loss": 0.1404, + "step": 1136 + }, + { + "epoch": 0.25, + "grad_norm": 0.6773543157930411, + "learning_rate": 3.5148822717896694e-05, + "loss": 0.1461, + "step": 1137 + }, + { + "epoch": 0.25, + "grad_norm": 0.5875629005848713, + "learning_rate": 3.513952922622243e-05, + "loss": 0.1133, + "step": 1138 + }, + { + "epoch": 0.25, + "grad_norm": 0.6161178059033141, + "learning_rate": 3.513022807233964e-05, + "loss": 0.1102, + "step": 1139 + }, + { + "epoch": 0.25, + "grad_norm": 0.6611613235073424, + "learning_rate": 3.5120919260955655e-05, + "loss": 0.1372, + "step": 1140 + }, + { + "epoch": 0.25, + "grad_norm": 0.6502130397718782, + "learning_rate": 3.511160279678174e-05, + "loss": 0.1408, + "step": 1141 + }, + { + "epoch": 0.25, + "grad_norm": 0.592408848966039, + "learning_rate": 3.510227868453302e-05, + "loss": 0.104, + "step": 1142 + }, + { + "epoch": 0.25, + "grad_norm": 0.6089071776647552, + "learning_rate": 3.509294692892847e-05, + "loss": 0.1135, + "step": 1143 + }, + { + "epoch": 0.25, + "grad_norm": 0.6034376470564629, + "learning_rate": 3.508360753469097e-05, + "loss": 0.1398, + "step": 1144 + }, + { + "epoch": 0.25, + "grad_norm": 0.6381865824576154, + "learning_rate": 3.5074260506547225e-05, + "loss": 0.1279, + "step": 1145 + }, + { + "epoch": 0.25, + "grad_norm": 0.6098214250407791, + "learning_rate": 3.506490584922784e-05, + "loss": 0.1167, + "step": 1146 + }, + { + "epoch": 0.25, + "grad_norm": 0.699568101782631, + "learning_rate": 3.5055543567467244e-05, + "loss": 0.1656, + "step": 1147 + }, + { + "epoch": 0.25, + "grad_norm": 0.5408009953766895, + "learning_rate": 3.504617366600376e-05, + "loss": 0.115, + "step": 1148 + }, + { + "epoch": 0.25, + "grad_norm": 0.5839136648587737, + "learning_rate": 3.503679614957955e-05, + "loss": 0.1162, + "step": 1149 + }, + { + "epoch": 0.25, + "grad_norm": 0.7622599708603124, + "learning_rate": 3.502741102294063e-05, + "loss": 0.1708, + "step": 1150 + }, + { + "epoch": 0.25, + "grad_norm": 0.6580762882445695, + "learning_rate": 3.501801829083688e-05, + "loss": 0.1396, + "step": 1151 + }, + { + "epoch": 0.25, + "grad_norm": 0.5034837652030538, + "learning_rate": 3.500861795802201e-05, + "loss": 0.1035, + "step": 1152 + }, + { + "epoch": 0.25, + "grad_norm": 0.6301220343485374, + "learning_rate": 3.499921002925357e-05, + "loss": 0.134, + "step": 1153 + }, + { + "epoch": 0.25, + "grad_norm": 0.6759966027552546, + "learning_rate": 3.4989794509293005e-05, + "loss": 0.134, + "step": 1154 + }, + { + "epoch": 0.25, + "grad_norm": 0.5163242338440555, + "learning_rate": 3.498037140290555e-05, + "loss": 0.0933, + "step": 1155 + }, + { + "epoch": 0.25, + "grad_norm": 0.6237718913913396, + "learning_rate": 3.497094071486029e-05, + "loss": 0.1475, + "step": 1156 + }, + { + "epoch": 0.25, + "grad_norm": 0.5704668175929202, + "learning_rate": 3.4961502449930165e-05, + "loss": 0.1297, + "step": 1157 + }, + { + "epoch": 0.25, + "grad_norm": 0.6364792917633192, + "learning_rate": 3.495205661289193e-05, + "loss": 0.1467, + "step": 1158 + }, + { + "epoch": 0.25, + "grad_norm": 0.6446432305362467, + "learning_rate": 3.494260320852619e-05, + "loss": 0.1305, + "step": 1159 + }, + { + "epoch": 0.25, + "grad_norm": 0.6163755701743595, + "learning_rate": 3.493314224161737e-05, + "loss": 0.104, + "step": 1160 + }, + { + "epoch": 0.25, + "grad_norm": 0.5823381370149056, + "learning_rate": 3.4923673716953717e-05, + "loss": 0.1106, + "step": 1161 + }, + { + "epoch": 0.26, + "grad_norm": 0.5806140158264087, + "learning_rate": 3.4914197639327306e-05, + "loss": 0.1231, + "step": 1162 + }, + { + "epoch": 0.26, + "grad_norm": 0.5533175408118538, + "learning_rate": 3.490471401353405e-05, + "loss": 0.0922, + "step": 1163 + }, + { + "epoch": 0.26, + "grad_norm": 0.7623329229694302, + "learning_rate": 3.489522284437366e-05, + "loss": 0.1804, + "step": 1164 + }, + { + "epoch": 0.26, + "grad_norm": 0.6246931753430318, + "learning_rate": 3.488572413664969e-05, + "loss": 0.164, + "step": 1165 + }, + { + "epoch": 0.26, + "grad_norm": 0.5766952465211387, + "learning_rate": 3.4876217895169474e-05, + "loss": 0.1108, + "step": 1166 + }, + { + "epoch": 0.26, + "grad_norm": 0.5636533991904469, + "learning_rate": 3.4866704124744196e-05, + "loss": 0.1229, + "step": 1167 + }, + { + "epoch": 0.26, + "grad_norm": 0.5361245854759259, + "learning_rate": 3.4857182830188816e-05, + "loss": 0.0994, + "step": 1168 + }, + { + "epoch": 0.26, + "grad_norm": 0.6510711408936877, + "learning_rate": 3.484765401632214e-05, + "loss": 0.1251, + "step": 1169 + }, + { + "epoch": 0.26, + "grad_norm": 0.6100764028197836, + "learning_rate": 3.483811768796674e-05, + "loss": 0.1093, + "step": 1170 + }, + { + "epoch": 0.26, + "grad_norm": 0.6169278286668669, + "learning_rate": 3.482857384994903e-05, + "loss": 0.1295, + "step": 1171 + }, + { + "epoch": 0.26, + "grad_norm": 0.6201544082164446, + "learning_rate": 3.4819022507099184e-05, + "loss": 0.122, + "step": 1172 + }, + { + "epoch": 0.26, + "grad_norm": 0.5516199530485807, + "learning_rate": 3.480946366425121e-05, + "loss": 0.1092, + "step": 1173 + }, + { + "epoch": 0.26, + "grad_norm": 0.7000214215815579, + "learning_rate": 3.4799897326242895e-05, + "loss": 0.1739, + "step": 1174 + }, + { + "epoch": 0.26, + "grad_norm": 0.632988563816966, + "learning_rate": 3.479032349791581e-05, + "loss": 0.1376, + "step": 1175 + }, + { + "epoch": 0.26, + "grad_norm": 0.6082218703833914, + "learning_rate": 3.478074218411534e-05, + "loss": 0.1378, + "step": 1176 + }, + { + "epoch": 0.26, + "grad_norm": 0.5413108598529882, + "learning_rate": 3.477115338969065e-05, + "loss": 0.0979, + "step": 1177 + }, + { + "epoch": 0.26, + "grad_norm": 0.5812498390033458, + "learning_rate": 3.476155711949467e-05, + "loss": 0.1012, + "step": 1178 + }, + { + "epoch": 0.26, + "grad_norm": 0.6965512588572845, + "learning_rate": 3.475195337838415e-05, + "loss": 0.1541, + "step": 1179 + }, + { + "epoch": 0.26, + "grad_norm": 0.5625723951164735, + "learning_rate": 3.474234217121959e-05, + "loss": 0.1207, + "step": 1180 + }, + { + "epoch": 0.26, + "grad_norm": 0.596036773798448, + "learning_rate": 3.473272350286529e-05, + "loss": 0.1274, + "step": 1181 + }, + { + "epoch": 0.26, + "grad_norm": 0.7216373167904762, + "learning_rate": 3.4723097378189306e-05, + "loss": 0.1523, + "step": 1182 + }, + { + "epoch": 0.26, + "grad_norm": 0.6263755919104075, + "learning_rate": 3.471346380206349e-05, + "loss": 0.1208, + "step": 1183 + }, + { + "epoch": 0.26, + "grad_norm": 0.5691475139297871, + "learning_rate": 3.470382277936345e-05, + "loss": 0.113, + "step": 1184 + }, + { + "epoch": 0.26, + "grad_norm": 0.5948601182830424, + "learning_rate": 3.4694174314968564e-05, + "loss": 0.1156, + "step": 1185 + }, + { + "epoch": 0.26, + "grad_norm": 0.7207090796247909, + "learning_rate": 3.468451841376198e-05, + "loss": 0.1529, + "step": 1186 + }, + { + "epoch": 0.26, + "grad_norm": 0.634722053793067, + "learning_rate": 3.467485508063061e-05, + "loss": 0.1158, + "step": 1187 + }, + { + "epoch": 0.26, + "grad_norm": 0.5649868774578078, + "learning_rate": 3.466518432046512e-05, + "loss": 0.1024, + "step": 1188 + }, + { + "epoch": 0.26, + "grad_norm": 0.6426538115570382, + "learning_rate": 3.4655506138159954e-05, + "loss": 0.1087, + "step": 1189 + }, + { + "epoch": 0.26, + "grad_norm": 0.555267433780788, + "learning_rate": 3.464582053861329e-05, + "loss": 0.1083, + "step": 1190 + }, + { + "epoch": 0.26, + "grad_norm": 0.5448565153350091, + "learning_rate": 3.463612752672707e-05, + "loss": 0.133, + "step": 1191 + }, + { + "epoch": 0.26, + "grad_norm": 0.5585144360127174, + "learning_rate": 3.462642710740699e-05, + "loss": 0.1085, + "step": 1192 + }, + { + "epoch": 0.26, + "grad_norm": 0.6569064580859775, + "learning_rate": 3.461671928556248e-05, + "loss": 0.136, + "step": 1193 + }, + { + "epoch": 0.26, + "grad_norm": 0.6260130301816523, + "learning_rate": 3.4607004066106754e-05, + "loss": 0.1196, + "step": 1194 + }, + { + "epoch": 0.26, + "grad_norm": 0.5934602130643271, + "learning_rate": 3.459728145395671e-05, + "loss": 0.1346, + "step": 1195 + }, + { + "epoch": 0.26, + "grad_norm": 0.7028667611513228, + "learning_rate": 3.458755145403306e-05, + "loss": 0.1429, + "step": 1196 + }, + { + "epoch": 0.26, + "grad_norm": 0.648635856899249, + "learning_rate": 3.457781407126018e-05, + "loss": 0.1357, + "step": 1197 + }, + { + "epoch": 0.26, + "grad_norm": 0.5471637780816928, + "learning_rate": 3.456806931056624e-05, + "loss": 0.1034, + "step": 1198 + }, + { + "epoch": 0.26, + "grad_norm": 0.642561405045521, + "learning_rate": 3.4558317176883116e-05, + "loss": 0.1435, + "step": 1199 + }, + { + "epoch": 0.26, + "grad_norm": 0.6153785899846203, + "learning_rate": 3.454855767514643e-05, + "loss": 0.1289, + "step": 1200 + }, + { + "epoch": 0.26, + "grad_norm": 0.5295401151516663, + "learning_rate": 3.453879081029552e-05, + "loss": 0.0893, + "step": 1201 + }, + { + "epoch": 0.26, + "grad_norm": 0.6474491768483753, + "learning_rate": 3.452901658727345e-05, + "loss": 0.1256, + "step": 1202 + }, + { + "epoch": 0.26, + "grad_norm": 0.6736327841856329, + "learning_rate": 3.451923501102703e-05, + "loss": 0.1364, + "step": 1203 + }, + { + "epoch": 0.26, + "grad_norm": 0.5755152177033249, + "learning_rate": 3.450944608650677e-05, + "loss": 0.1296, + "step": 1204 + }, + { + "epoch": 0.26, + "grad_norm": 0.5771117222914954, + "learning_rate": 3.449964981866689e-05, + "loss": 0.1182, + "step": 1205 + }, + { + "epoch": 0.26, + "grad_norm": 0.6209119854673251, + "learning_rate": 3.4489846212465356e-05, + "loss": 0.1205, + "step": 1206 + }, + { + "epoch": 0.27, + "grad_norm": 0.7319852214009278, + "learning_rate": 3.448003527286383e-05, + "loss": 0.1527, + "step": 1207 + }, + { + "epoch": 0.27, + "grad_norm": 0.5456084707930992, + "learning_rate": 3.447021700482769e-05, + "loss": 0.106, + "step": 1208 + }, + { + "epoch": 0.27, + "grad_norm": 0.6069089189394817, + "learning_rate": 3.446039141332602e-05, + "loss": 0.0915, + "step": 1209 + }, + { + "epoch": 0.27, + "grad_norm": 0.6142817069593046, + "learning_rate": 3.4450558503331606e-05, + "loss": 0.1182, + "step": 1210 + }, + { + "epoch": 0.27, + "grad_norm": 0.7099143830243355, + "learning_rate": 3.444071827982096e-05, + "loss": 0.1767, + "step": 1211 + }, + { + "epoch": 0.27, + "grad_norm": 0.5684238934684104, + "learning_rate": 3.4430870747774266e-05, + "loss": 0.1236, + "step": 1212 + }, + { + "epoch": 0.27, + "grad_norm": 0.6007230524614405, + "learning_rate": 3.442101591217542e-05, + "loss": 0.1112, + "step": 1213 + }, + { + "epoch": 0.27, + "grad_norm": 0.6021835292176488, + "learning_rate": 3.441115377801202e-05, + "loss": 0.1253, + "step": 1214 + }, + { + "epoch": 0.27, + "grad_norm": 0.5635790541604533, + "learning_rate": 3.440128435027536e-05, + "loss": 0.1188, + "step": 1215 + }, + { + "epoch": 0.27, + "grad_norm": 0.57701656687504, + "learning_rate": 3.43914076339604e-05, + "loss": 0.1268, + "step": 1216 + }, + { + "epoch": 0.27, + "grad_norm": 0.6599674519186035, + "learning_rate": 3.438152363406582e-05, + "loss": 0.1276, + "step": 1217 + }, + { + "epoch": 0.27, + "grad_norm": 0.6034804645653913, + "learning_rate": 3.437163235559396e-05, + "loss": 0.1168, + "step": 1218 + }, + { + "epoch": 0.27, + "grad_norm": 0.6539808050788924, + "learning_rate": 3.4361733803550874e-05, + "loss": 0.1581, + "step": 1219 + }, + { + "epoch": 0.27, + "grad_norm": 0.6012704313617491, + "learning_rate": 3.4351827982946274e-05, + "loss": 0.1041, + "step": 1220 + }, + { + "epoch": 0.27, + "grad_norm": 0.6296357135917684, + "learning_rate": 3.434191489879355e-05, + "loss": 0.1284, + "step": 1221 + }, + { + "epoch": 0.27, + "grad_norm": 0.6326536049743969, + "learning_rate": 3.433199455610978e-05, + "loss": 0.1221, + "step": 1222 + }, + { + "epoch": 0.27, + "grad_norm": 0.6212516919732799, + "learning_rate": 3.43220669599157e-05, + "loss": 0.1399, + "step": 1223 + }, + { + "epoch": 0.27, + "grad_norm": 0.5812341236694485, + "learning_rate": 3.431213211523574e-05, + "loss": 0.1036, + "step": 1224 + }, + { + "epoch": 0.27, + "grad_norm": 0.6630829711308373, + "learning_rate": 3.430219002709799e-05, + "loss": 0.123, + "step": 1225 + }, + { + "epoch": 0.27, + "grad_norm": 0.6440885931112753, + "learning_rate": 3.429224070053419e-05, + "loss": 0.136, + "step": 1226 + }, + { + "epoch": 0.27, + "grad_norm": 0.6477891829287996, + "learning_rate": 3.428228414057975e-05, + "loss": 0.1256, + "step": 1227 + }, + { + "epoch": 0.27, + "grad_norm": 0.589110113997143, + "learning_rate": 3.427232035227377e-05, + "loss": 0.1169, + "step": 1228 + }, + { + "epoch": 0.27, + "grad_norm": 0.6328822438930977, + "learning_rate": 3.426234934065896e-05, + "loss": 0.1068, + "step": 1229 + }, + { + "epoch": 0.27, + "grad_norm": 0.5505090947435014, + "learning_rate": 3.4252371110781716e-05, + "loss": 0.142, + "step": 1230 + }, + { + "epoch": 0.27, + "grad_norm": 0.6575949374797614, + "learning_rate": 3.424238566769209e-05, + "loss": 0.1311, + "step": 1231 + }, + { + "epoch": 0.27, + "grad_norm": 0.5399678961164646, + "learning_rate": 3.423239301644377e-05, + "loss": 0.108, + "step": 1232 + }, + { + "epoch": 0.27, + "grad_norm": 0.636209225537825, + "learning_rate": 3.42223931620941e-05, + "loss": 0.1432, + "step": 1233 + }, + { + "epoch": 0.27, + "grad_norm": 0.5430976659746752, + "learning_rate": 3.421238610970406e-05, + "loss": 0.0976, + "step": 1234 + }, + { + "epoch": 0.27, + "grad_norm": 0.654773185785552, + "learning_rate": 3.4202371864338295e-05, + "loss": 0.1374, + "step": 1235 + }, + { + "epoch": 0.27, + "grad_norm": 0.6577328323362479, + "learning_rate": 3.419235043106506e-05, + "loss": 0.1299, + "step": 1236 + }, + { + "epoch": 0.27, + "grad_norm": 0.6343967950769432, + "learning_rate": 3.4182321814956274e-05, + "loss": 0.1374, + "step": 1237 + }, + { + "epoch": 0.27, + "grad_norm": 0.5689775703313454, + "learning_rate": 3.4172286021087475e-05, + "loss": 0.1115, + "step": 1238 + }, + { + "epoch": 0.27, + "grad_norm": 0.5321393422744918, + "learning_rate": 3.416224305453785e-05, + "loss": 0.0976, + "step": 1239 + }, + { + "epoch": 0.27, + "grad_norm": 0.5895353104496781, + "learning_rate": 3.4152192920390195e-05, + "loss": 0.115, + "step": 1240 + }, + { + "epoch": 0.27, + "grad_norm": 0.6693416117685603, + "learning_rate": 3.4142135623730954e-05, + "loss": 0.1299, + "step": 1241 + }, + { + "epoch": 0.27, + "grad_norm": 0.5223463271451472, + "learning_rate": 3.413207116965018e-05, + "loss": 0.0913, + "step": 1242 + }, + { + "epoch": 0.27, + "grad_norm": 0.5742671712584116, + "learning_rate": 3.412199956324155e-05, + "loss": 0.1103, + "step": 1243 + }, + { + "epoch": 0.27, + "grad_norm": 0.6471738567231234, + "learning_rate": 3.4111920809602374e-05, + "loss": 0.1202, + "step": 1244 + }, + { + "epoch": 0.27, + "grad_norm": 0.5738606536557671, + "learning_rate": 3.4101834913833576e-05, + "loss": 0.1253, + "step": 1245 + }, + { + "epoch": 0.27, + "grad_norm": 0.5329715099951039, + "learning_rate": 3.4091741881039677e-05, + "loss": 0.0931, + "step": 1246 + }, + { + "epoch": 0.27, + "grad_norm": 0.6004768334508961, + "learning_rate": 3.4081641716328826e-05, + "loss": 0.1212, + "step": 1247 + }, + { + "epoch": 0.27, + "grad_norm": 0.6238993728835703, + "learning_rate": 3.407153442481278e-05, + "loss": 0.1188, + "step": 1248 + }, + { + "epoch": 0.27, + "grad_norm": 0.6391662877126049, + "learning_rate": 3.4061420011606906e-05, + "loss": 0.1327, + "step": 1249 + }, + { + "epoch": 0.27, + "grad_norm": 0.6850578541931815, + "learning_rate": 3.405129848183017e-05, + "loss": 0.1346, + "step": 1250 + }, + { + "epoch": 0.27, + "grad_norm": 0.6620741753237898, + "learning_rate": 3.404116984060513e-05, + "loss": 0.1424, + "step": 1251 + }, + { + "epoch": 0.27, + "grad_norm": 0.5884064094161954, + "learning_rate": 3.403103409305796e-05, + "loss": 0.1271, + "step": 1252 + }, + { + "epoch": 0.28, + "grad_norm": 0.592150818385704, + "learning_rate": 3.402089124431843e-05, + "loss": 0.1224, + "step": 1253 + }, + { + "epoch": 0.28, + "grad_norm": 0.5933580610751684, + "learning_rate": 3.4010741299519885e-05, + "loss": 0.0974, + "step": 1254 + }, + { + "epoch": 0.28, + "grad_norm": 0.5395084968672039, + "learning_rate": 3.400058426379929e-05, + "loss": 0.0956, + "step": 1255 + }, + { + "epoch": 0.28, + "grad_norm": 0.5568274468905576, + "learning_rate": 3.3990420142297165e-05, + "loss": 0.0934, + "step": 1256 + }, + { + "epoch": 0.28, + "grad_norm": 0.6464044573387644, + "learning_rate": 3.398024894015764e-05, + "loss": 0.1278, + "step": 1257 + }, + { + "epoch": 0.28, + "grad_norm": 0.6397880110723696, + "learning_rate": 3.3970070662528436e-05, + "loss": 0.1261, + "step": 1258 + }, + { + "epoch": 0.28, + "grad_norm": 0.580808956213012, + "learning_rate": 3.395988531456083e-05, + "loss": 0.1214, + "step": 1259 + }, + { + "epoch": 0.28, + "grad_norm": 0.6264461475891724, + "learning_rate": 3.394969290140969e-05, + "loss": 0.1241, + "step": 1260 + }, + { + "epoch": 0.28, + "grad_norm": 0.6984251622896039, + "learning_rate": 3.393949342823346e-05, + "loss": 0.1567, + "step": 1261 + }, + { + "epoch": 0.28, + "grad_norm": 0.5048838159783609, + "learning_rate": 3.3929286900194154e-05, + "loss": 0.0969, + "step": 1262 + }, + { + "epoch": 0.28, + "grad_norm": 0.5949825243427104, + "learning_rate": 3.3919073322457364e-05, + "loss": 0.1053, + "step": 1263 + }, + { + "epoch": 0.28, + "grad_norm": 0.5917974541286745, + "learning_rate": 3.3908852700192236e-05, + "loss": 0.1162, + "step": 1264 + }, + { + "epoch": 0.28, + "grad_norm": 0.6482849401229682, + "learning_rate": 3.38986250385715e-05, + "loss": 0.1305, + "step": 1265 + }, + { + "epoch": 0.28, + "grad_norm": 0.5296304643919447, + "learning_rate": 3.388839034277142e-05, + "loss": 0.0985, + "step": 1266 + }, + { + "epoch": 0.28, + "grad_norm": 0.6002721447464341, + "learning_rate": 3.387814861797186e-05, + "loss": 0.1077, + "step": 1267 + }, + { + "epoch": 0.28, + "grad_norm": 0.5918012813995788, + "learning_rate": 3.386789986935621e-05, + "loss": 0.1221, + "step": 1268 + }, + { + "epoch": 0.28, + "grad_norm": 0.5493203124861112, + "learning_rate": 3.385764410211143e-05, + "loss": 0.1078, + "step": 1269 + }, + { + "epoch": 0.28, + "grad_norm": 0.5263530608990402, + "learning_rate": 3.3847381321428e-05, + "loss": 0.1154, + "step": 1270 + }, + { + "epoch": 0.28, + "grad_norm": 0.5414700579233093, + "learning_rate": 3.383711153250002e-05, + "loss": 0.1058, + "step": 1271 + }, + { + "epoch": 0.28, + "grad_norm": 0.6080252787344901, + "learning_rate": 3.382683474052506e-05, + "loss": 0.1299, + "step": 1272 + }, + { + "epoch": 0.28, + "grad_norm": 0.5519697744311083, + "learning_rate": 3.381655095070428e-05, + "loss": 0.0927, + "step": 1273 + }, + { + "epoch": 0.28, + "grad_norm": 0.5399789621528959, + "learning_rate": 3.3806260168242365e-05, + "loss": 0.0976, + "step": 1274 + }, + { + "epoch": 0.28, + "grad_norm": 0.6127765546933293, + "learning_rate": 3.379596239834755e-05, + "loss": 0.1375, + "step": 1275 + }, + { + "epoch": 0.28, + "grad_norm": 0.5594481223279179, + "learning_rate": 3.3785657646231596e-05, + "loss": 0.105, + "step": 1276 + }, + { + "epoch": 0.28, + "grad_norm": 0.5644887629151611, + "learning_rate": 3.37753459171098e-05, + "loss": 0.1016, + "step": 1277 + }, + { + "epoch": 0.28, + "grad_norm": 0.5622478820791422, + "learning_rate": 3.376502721620098e-05, + "loss": 0.118, + "step": 1278 + }, + { + "epoch": 0.28, + "grad_norm": 0.6754545844501156, + "learning_rate": 3.375470154872751e-05, + "loss": 0.13, + "step": 1279 + }, + { + "epoch": 0.28, + "grad_norm": 0.6377381197945493, + "learning_rate": 3.3744368919915275e-05, + "loss": 0.1602, + "step": 1280 + }, + { + "epoch": 0.28, + "grad_norm": 0.6207918116959827, + "learning_rate": 3.3734029334993675e-05, + "loss": 0.1371, + "step": 1281 + }, + { + "epoch": 0.28, + "grad_norm": 0.538156355942291, + "learning_rate": 3.372368279919563e-05, + "loss": 0.1094, + "step": 1282 + }, + { + "epoch": 0.28, + "grad_norm": 0.7058682228466938, + "learning_rate": 3.3713329317757594e-05, + "loss": 0.163, + "step": 1283 + }, + { + "epoch": 0.28, + "grad_norm": 0.5479527616275841, + "learning_rate": 3.370296889591953e-05, + "loss": 0.093, + "step": 1284 + }, + { + "epoch": 0.28, + "grad_norm": 0.5901569015180179, + "learning_rate": 3.369260153892491e-05, + "loss": 0.1216, + "step": 1285 + }, + { + "epoch": 0.28, + "grad_norm": 0.5840498973615459, + "learning_rate": 3.3682227252020716e-05, + "loss": 0.1125, + "step": 1286 + }, + { + "epoch": 0.28, + "grad_norm": 0.522568565049669, + "learning_rate": 3.367184604045743e-05, + "loss": 0.1061, + "step": 1287 + }, + { + "epoch": 0.28, + "grad_norm": 0.5738316483256367, + "learning_rate": 3.3661457909489056e-05, + "loss": 0.1105, + "step": 1288 + }, + { + "epoch": 0.28, + "grad_norm": 0.6034257939219156, + "learning_rate": 3.365106286437309e-05, + "loss": 0.1189, + "step": 1289 + }, + { + "epoch": 0.28, + "grad_norm": 0.5368924066832583, + "learning_rate": 3.364066091037052e-05, + "loss": 0.0988, + "step": 1290 + }, + { + "epoch": 0.28, + "grad_norm": 0.6711078744452189, + "learning_rate": 3.3630252052745844e-05, + "loss": 0.1559, + "step": 1291 + }, + { + "epoch": 0.28, + "grad_norm": 0.5399400230496249, + "learning_rate": 3.361983629676705e-05, + "loss": 0.1077, + "step": 1292 + }, + { + "epoch": 0.28, + "grad_norm": 0.6108261338608086, + "learning_rate": 3.360941364770562e-05, + "loss": 0.1294, + "step": 1293 + }, + { + "epoch": 0.28, + "grad_norm": 0.5468824113616102, + "learning_rate": 3.359898411083652e-05, + "loss": 0.1225, + "step": 1294 + }, + { + "epoch": 0.28, + "grad_norm": 0.6249837634838636, + "learning_rate": 3.358854769143819e-05, + "loss": 0.1312, + "step": 1295 + }, + { + "epoch": 0.28, + "grad_norm": 0.5940600639666558, + "learning_rate": 3.357810439479258e-05, + "loss": 0.1086, + "step": 1296 + }, + { + "epoch": 0.28, + "grad_norm": 0.5821095356360554, + "learning_rate": 3.356765422618509e-05, + "loss": 0.108, + "step": 1297 + }, + { + "epoch": 0.29, + "grad_norm": 0.6157257055801524, + "learning_rate": 3.355719719090465e-05, + "loss": 0.1548, + "step": 1298 + }, + { + "epoch": 0.29, + "grad_norm": 0.5744449922340548, + "learning_rate": 3.3546733294243585e-05, + "loss": 0.1333, + "step": 1299 + }, + { + "epoch": 0.29, + "grad_norm": 0.6436444510808119, + "learning_rate": 3.353626254149776e-05, + "loss": 0.1208, + "step": 1300 + }, + { + "epoch": 0.29, + "grad_norm": 0.5962607853391843, + "learning_rate": 3.3525784937966474e-05, + "loss": 0.1112, + "step": 1301 + }, + { + "epoch": 0.29, + "grad_norm": 0.5523535289195513, + "learning_rate": 3.3515300488952534e-05, + "loss": 0.0944, + "step": 1302 + }, + { + "epoch": 0.29, + "grad_norm": 0.501809006719086, + "learning_rate": 3.350480919976216e-05, + "loss": 0.087, + "step": 1303 + }, + { + "epoch": 0.29, + "grad_norm": 0.5981814750628635, + "learning_rate": 3.349431107570506e-05, + "loss": 0.1275, + "step": 1304 + }, + { + "epoch": 0.29, + "grad_norm": 0.6180237524967851, + "learning_rate": 3.348380612209441e-05, + "loss": 0.1173, + "step": 1305 + }, + { + "epoch": 0.29, + "grad_norm": 0.6130160104393875, + "learning_rate": 3.347329434424683e-05, + "loss": 0.1229, + "step": 1306 + }, + { + "epoch": 0.29, + "grad_norm": 0.544583943908318, + "learning_rate": 3.346277574748238e-05, + "loss": 0.1194, + "step": 1307 + }, + { + "epoch": 0.29, + "grad_norm": 0.5317881607186777, + "learning_rate": 3.345225033712459e-05, + "loss": 0.0983, + "step": 1308 + }, + { + "epoch": 0.29, + "grad_norm": 0.5799913180868896, + "learning_rate": 3.344171811850045e-05, + "loss": 0.1168, + "step": 1309 + }, + { + "epoch": 0.29, + "grad_norm": 0.5500312731261546, + "learning_rate": 3.3431179096940375e-05, + "loss": 0.1332, + "step": 1310 + }, + { + "epoch": 0.29, + "grad_norm": 0.6583755222113519, + "learning_rate": 3.3420633277778214e-05, + "loss": 0.1362, + "step": 1311 + }, + { + "epoch": 0.29, + "grad_norm": 0.5739367824277034, + "learning_rate": 3.341008066635129e-05, + "loss": 0.114, + "step": 1312 + }, + { + "epoch": 0.29, + "grad_norm": 0.6181973031072495, + "learning_rate": 3.339952126800033e-05, + "loss": 0.1252, + "step": 1313 + }, + { + "epoch": 0.29, + "grad_norm": 0.5774385397685252, + "learning_rate": 3.3388955088069524e-05, + "loss": 0.1402, + "step": 1314 + }, + { + "epoch": 0.29, + "grad_norm": 0.6174955926961799, + "learning_rate": 3.3378382131906465e-05, + "loss": 0.1398, + "step": 1315 + }, + { + "epoch": 0.29, + "grad_norm": 0.5129309703011087, + "learning_rate": 3.33678024048622e-05, + "loss": 0.0974, + "step": 1316 + }, + { + "epoch": 0.29, + "grad_norm": 0.6284531566427997, + "learning_rate": 3.335721591229119e-05, + "loss": 0.1331, + "step": 1317 + }, + { + "epoch": 0.29, + "grad_norm": 0.7647961002582618, + "learning_rate": 3.334662265955133e-05, + "loss": 0.1739, + "step": 1318 + }, + { + "epoch": 0.29, + "grad_norm": 0.6942168615205172, + "learning_rate": 3.3336022652003924e-05, + "loss": 0.1737, + "step": 1319 + }, + { + "epoch": 0.29, + "grad_norm": 0.5579231077747826, + "learning_rate": 3.33254158950137e-05, + "loss": 0.0998, + "step": 1320 + }, + { + "epoch": 0.29, + "grad_norm": 0.500234608445358, + "learning_rate": 3.331480239394881e-05, + "loss": 0.1234, + "step": 1321 + }, + { + "epoch": 0.29, + "grad_norm": 0.5624056313086268, + "learning_rate": 3.330418215418081e-05, + "loss": 0.1177, + "step": 1322 + }, + { + "epoch": 0.29, + "grad_norm": 0.544769976921429, + "learning_rate": 3.329355518108466e-05, + "loss": 0.086, + "step": 1323 + }, + { + "epoch": 0.29, + "grad_norm": 0.6480806356860086, + "learning_rate": 3.328292148003875e-05, + "loss": 0.1699, + "step": 1324 + }, + { + "epoch": 0.29, + "grad_norm": 0.5556130008667703, + "learning_rate": 3.3272281056424854e-05, + "loss": 0.117, + "step": 1325 + }, + { + "epoch": 0.29, + "grad_norm": 0.546150790661389, + "learning_rate": 3.326163391562814e-05, + "loss": 0.1028, + "step": 1326 + }, + { + "epoch": 0.29, + "grad_norm": 0.5950563265080882, + "learning_rate": 3.325098006303722e-05, + "loss": 0.1133, + "step": 1327 + }, + { + "epoch": 0.29, + "grad_norm": 0.606291157023492, + "learning_rate": 3.324031950404406e-05, + "loss": 0.0992, + "step": 1328 + }, + { + "epoch": 0.29, + "grad_norm": 0.6167163257536733, + "learning_rate": 3.322965224404403e-05, + "loss": 0.1478, + "step": 1329 + }, + { + "epoch": 0.29, + "grad_norm": 0.5856656270301086, + "learning_rate": 3.3218978288435896e-05, + "loss": 0.1117, + "step": 1330 + }, + { + "epoch": 0.29, + "grad_norm": 0.5518055042821899, + "learning_rate": 3.3208297642621824e-05, + "loss": 0.0941, + "step": 1331 + }, + { + "epoch": 0.29, + "grad_norm": 0.553741508057109, + "learning_rate": 3.319761031200735e-05, + "loss": 0.1246, + "step": 1332 + }, + { + "epoch": 0.29, + "grad_norm": 0.5034305485838049, + "learning_rate": 3.318691630200138e-05, + "loss": 0.0952, + "step": 1333 + }, + { + "epoch": 0.29, + "grad_norm": 0.49705691216473424, + "learning_rate": 3.317621561801624e-05, + "loss": 0.0896, + "step": 1334 + }, + { + "epoch": 0.29, + "grad_norm": 0.5258869535349345, + "learning_rate": 3.316550826546761e-05, + "loss": 0.0988, + "step": 1335 + }, + { + "epoch": 0.29, + "grad_norm": 0.5489673641506772, + "learning_rate": 3.315479424977453e-05, + "loss": 0.1149, + "step": 1336 + }, + { + "epoch": 0.29, + "grad_norm": 0.5114559925851911, + "learning_rate": 3.3144073576359455e-05, + "loss": 0.1157, + "step": 1337 + }, + { + "epoch": 0.29, + "grad_norm": 0.5566812333135119, + "learning_rate": 3.313334625064816e-05, + "loss": 0.1118, + "step": 1338 + }, + { + "epoch": 0.29, + "grad_norm": 0.5873765440338292, + "learning_rate": 3.312261227806982e-05, + "loss": 0.1208, + "step": 1339 + }, + { + "epoch": 0.29, + "grad_norm": 0.5055540186849343, + "learning_rate": 3.311187166405696e-05, + "loss": 0.086, + "step": 1340 + }, + { + "epoch": 0.29, + "grad_norm": 0.5983374210667861, + "learning_rate": 3.310112441404548e-05, + "loss": 0.1095, + "step": 1341 + }, + { + "epoch": 0.29, + "grad_norm": 0.5834239707649026, + "learning_rate": 3.309037053347462e-05, + "loss": 0.1269, + "step": 1342 + }, + { + "epoch": 0.29, + "grad_norm": 0.5894128349595298, + "learning_rate": 3.3079610027786985e-05, + "loss": 0.1095, + "step": 1343 + }, + { + "epoch": 0.3, + "grad_norm": 0.5633359895686817, + "learning_rate": 3.306884290242854e-05, + "loss": 0.1089, + "step": 1344 + }, + { + "epoch": 0.3, + "grad_norm": 0.5123450254265632, + "learning_rate": 3.3058069162848586e-05, + "loss": 0.1056, + "step": 1345 + }, + { + "epoch": 0.3, + "grad_norm": 0.6086173605103516, + "learning_rate": 3.3047288814499786e-05, + "loss": 0.0937, + "step": 1346 + }, + { + "epoch": 0.3, + "grad_norm": 0.6991284701068382, + "learning_rate": 3.3036501862838125e-05, + "loss": 0.1692, + "step": 1347 + }, + { + "epoch": 0.3, + "grad_norm": 0.5441333712760471, + "learning_rate": 3.302570831332297e-05, + "loss": 0.108, + "step": 1348 + }, + { + "epoch": 0.3, + "grad_norm": 0.5533949069537027, + "learning_rate": 3.301490817141698e-05, + "loss": 0.1199, + "step": 1349 + }, + { + "epoch": 0.3, + "grad_norm": 0.5405876312130093, + "learning_rate": 3.300410144258619e-05, + "loss": 0.1341, + "step": 1350 + }, + { + "epoch": 0.3, + "grad_norm": 0.4227404547560645, + "learning_rate": 3.2993288132299935e-05, + "loss": 0.0827, + "step": 1351 + }, + { + "epoch": 0.3, + "grad_norm": 0.627268037704672, + "learning_rate": 3.298246824603091e-05, + "loss": 0.1427, + "step": 1352 + }, + { + "epoch": 0.3, + "grad_norm": 0.5433477821457182, + "learning_rate": 3.297164178925512e-05, + "loss": 0.1175, + "step": 1353 + }, + { + "epoch": 0.3, + "grad_norm": 0.525409683959608, + "learning_rate": 3.2960808767451905e-05, + "loss": 0.1077, + "step": 1354 + }, + { + "epoch": 0.3, + "grad_norm": 0.4823978124632984, + "learning_rate": 3.294996918610393e-05, + "loss": 0.0896, + "step": 1355 + }, + { + "epoch": 0.3, + "grad_norm": 0.5202600057428901, + "learning_rate": 3.293912305069715e-05, + "loss": 0.0906, + "step": 1356 + }, + { + "epoch": 0.3, + "grad_norm": 0.6476774932359233, + "learning_rate": 3.292827036672089e-05, + "loss": 0.1117, + "step": 1357 + }, + { + "epoch": 0.3, + "grad_norm": 0.5763221674018213, + "learning_rate": 3.291741113966773e-05, + "loss": 0.1023, + "step": 1358 + }, + { + "epoch": 0.3, + "grad_norm": 0.6494560056062909, + "learning_rate": 3.290654537503362e-05, + "loss": 0.1113, + "step": 1359 + }, + { + "epoch": 0.3, + "grad_norm": 0.7034436033874668, + "learning_rate": 3.2895673078317775e-05, + "loss": 0.1229, + "step": 1360 + }, + { + "epoch": 0.3, + "grad_norm": 0.6143205425695772, + "learning_rate": 3.288479425502273e-05, + "loss": 0.1194, + "step": 1361 + }, + { + "epoch": 0.3, + "grad_norm": 0.6209591173806414, + "learning_rate": 3.287390891065433e-05, + "loss": 0.1485, + "step": 1362 + }, + { + "epoch": 0.3, + "grad_norm": 0.5999097786066504, + "learning_rate": 3.2863017050721715e-05, + "loss": 0.1246, + "step": 1363 + }, + { + "epoch": 0.3, + "grad_norm": 0.48418642803392065, + "learning_rate": 3.2852118680737306e-05, + "loss": 0.0786, + "step": 1364 + }, + { + "epoch": 0.3, + "grad_norm": 0.540208482983803, + "learning_rate": 3.2841213806216864e-05, + "loss": 0.1044, + "step": 1365 + }, + { + "epoch": 0.3, + "grad_norm": 0.5094061859507117, + "learning_rate": 3.283030243267939e-05, + "loss": 0.0803, + "step": 1366 + }, + { + "epoch": 0.3, + "grad_norm": 0.5646400010314275, + "learning_rate": 3.281938456564721e-05, + "loss": 0.1106, + "step": 1367 + }, + { + "epoch": 0.3, + "grad_norm": 0.5686819203471248, + "learning_rate": 3.2808460210645906e-05, + "loss": 0.1155, + "step": 1368 + }, + { + "epoch": 0.3, + "grad_norm": 0.6112084804857806, + "learning_rate": 3.2797529373204375e-05, + "loss": 0.1225, + "step": 1369 + }, + { + "epoch": 0.3, + "grad_norm": 0.5162515878999319, + "learning_rate": 3.278659205885479e-05, + "loss": 0.0996, + "step": 1370 + }, + { + "epoch": 0.3, + "grad_norm": 0.6340510175256542, + "learning_rate": 3.2775648273132574e-05, + "loss": 0.1383, + "step": 1371 + }, + { + "epoch": 0.3, + "grad_norm": 0.5671948435074174, + "learning_rate": 3.2764698021576446e-05, + "loss": 0.1048, + "step": 1372 + }, + { + "epoch": 0.3, + "grad_norm": 0.6487673644312671, + "learning_rate": 3.27537413097284e-05, + "loss": 0.1625, + "step": 1373 + }, + { + "epoch": 0.3, + "grad_norm": 0.5601474787877964, + "learning_rate": 3.27427781431337e-05, + "loss": 0.1142, + "step": 1374 + }, + { + "epoch": 0.3, + "grad_norm": 0.5813905402472606, + "learning_rate": 3.273180852734087e-05, + "loss": 0.1012, + "step": 1375 + }, + { + "epoch": 0.3, + "grad_norm": 0.4734692792644643, + "learning_rate": 3.27208324679017e-05, + "loss": 0.0724, + "step": 1376 + }, + { + "epoch": 0.3, + "grad_norm": 0.5780253453324077, + "learning_rate": 3.270984997037123e-05, + "loss": 0.1006, + "step": 1377 + }, + { + "epoch": 0.3, + "grad_norm": 0.5225369977400518, + "learning_rate": 3.269886104030778e-05, + "loss": 0.1089, + "step": 1378 + }, + { + "epoch": 0.3, + "grad_norm": 0.5194797060071085, + "learning_rate": 3.268786568327291e-05, + "loss": 0.1096, + "step": 1379 + }, + { + "epoch": 0.3, + "grad_norm": 0.5409759066217806, + "learning_rate": 3.2676863904831444e-05, + "loss": 0.1027, + "step": 1380 + }, + { + "epoch": 0.3, + "grad_norm": 0.5970778315366738, + "learning_rate": 3.266585571055145e-05, + "loss": 0.1028, + "step": 1381 + }, + { + "epoch": 0.3, + "grad_norm": 0.6425400928500881, + "learning_rate": 3.2654841106004225e-05, + "loss": 0.1467, + "step": 1382 + }, + { + "epoch": 0.3, + "grad_norm": 0.5410733519236234, + "learning_rate": 3.264382009676435e-05, + "loss": 0.1025, + "step": 1383 + }, + { + "epoch": 0.3, + "grad_norm": 0.5964767435581186, + "learning_rate": 3.263279268840961e-05, + "loss": 0.1334, + "step": 1384 + }, + { + "epoch": 0.3, + "grad_norm": 0.5198936345823255, + "learning_rate": 3.262175888652106e-05, + "loss": 0.0953, + "step": 1385 + }, + { + "epoch": 0.3, + "grad_norm": 0.656659815755759, + "learning_rate": 3.261071869668296e-05, + "loss": 0.1355, + "step": 1386 + }, + { + "epoch": 0.3, + "grad_norm": 0.588144393341704, + "learning_rate": 3.259967212448282e-05, + "loss": 0.1398, + "step": 1387 + }, + { + "epoch": 0.3, + "grad_norm": 0.5605675563918083, + "learning_rate": 3.2588619175511387e-05, + "loss": 0.1048, + "step": 1388 + }, + { + "epoch": 0.31, + "grad_norm": 0.452692499602161, + "learning_rate": 3.2577559855362614e-05, + "loss": 0.0811, + "step": 1389 + }, + { + "epoch": 0.31, + "grad_norm": 0.5681878057077676, + "learning_rate": 3.2566494169633693e-05, + "loss": 0.1066, + "step": 1390 + }, + { + "epoch": 0.31, + "grad_norm": 0.5795194716502409, + "learning_rate": 3.255542212392505e-05, + "loss": 0.1226, + "step": 1391 + }, + { + "epoch": 0.31, + "grad_norm": 0.565216129538394, + "learning_rate": 3.2544343723840296e-05, + "loss": 0.0917, + "step": 1392 + }, + { + "epoch": 0.31, + "grad_norm": 0.4864753285882939, + "learning_rate": 3.253325897498629e-05, + "loss": 0.0918, + "step": 1393 + }, + { + "epoch": 0.31, + "grad_norm": 0.5830497790492641, + "learning_rate": 3.2522167882973085e-05, + "loss": 0.0995, + "step": 1394 + }, + { + "epoch": 0.31, + "grad_norm": 0.5171751360841577, + "learning_rate": 3.251107045341395e-05, + "loss": 0.0975, + "step": 1395 + }, + { + "epoch": 0.31, + "grad_norm": 0.6417617249571287, + "learning_rate": 3.249996669192537e-05, + "loss": 0.1543, + "step": 1396 + }, + { + "epoch": 0.31, + "grad_norm": 0.5739693132332222, + "learning_rate": 3.248885660412701e-05, + "loss": 0.115, + "step": 1397 + }, + { + "epoch": 0.31, + "grad_norm": 0.5815615711071502, + "learning_rate": 3.247774019564178e-05, + "loss": 0.0829, + "step": 1398 + }, + { + "epoch": 0.31, + "grad_norm": 0.6249244644296427, + "learning_rate": 3.2466617472095736e-05, + "loss": 0.0991, + "step": 1399 + }, + { + "epoch": 0.31, + "grad_norm": 0.5152293334436637, + "learning_rate": 3.245548843911817e-05, + "loss": 0.0931, + "step": 1400 + }, + { + "epoch": 0.31, + "grad_norm": 0.5970091713898942, + "learning_rate": 3.244435310234156e-05, + "loss": 0.1193, + "step": 1401 + }, + { + "epoch": 0.31, + "grad_norm": 0.5920520899415288, + "learning_rate": 3.243321146740155e-05, + "loss": 0.0919, + "step": 1402 + }, + { + "epoch": 0.31, + "grad_norm": 0.5128584538616451, + "learning_rate": 3.2422063539937006e-05, + "loss": 0.0919, + "step": 1403 + }, + { + "epoch": 0.31, + "grad_norm": 0.5298398319280635, + "learning_rate": 3.2410909325589954e-05, + "loss": 0.085, + "step": 1404 + }, + { + "epoch": 0.31, + "grad_norm": 0.5807596435304535, + "learning_rate": 3.239974883000561e-05, + "loss": 0.1002, + "step": 1405 + }, + { + "epoch": 0.31, + "grad_norm": 0.5915929109199566, + "learning_rate": 3.2388582058832375e-05, + "loss": 0.117, + "step": 1406 + }, + { + "epoch": 0.31, + "grad_norm": 0.5410412944116243, + "learning_rate": 3.237740901772181e-05, + "loss": 0.1044, + "step": 1407 + }, + { + "epoch": 0.31, + "grad_norm": 0.5061377860140891, + "learning_rate": 3.2366229712328675e-05, + "loss": 0.0859, + "step": 1408 + }, + { + "epoch": 0.31, + "grad_norm": 0.5453665501606407, + "learning_rate": 3.235504414831087e-05, + "loss": 0.1005, + "step": 1409 + }, + { + "epoch": 0.31, + "grad_norm": 0.6237674002810024, + "learning_rate": 3.234385233132949e-05, + "loss": 0.1188, + "step": 1410 + }, + { + "epoch": 0.31, + "grad_norm": 0.5550178833391438, + "learning_rate": 3.233265426704877e-05, + "loss": 0.0948, + "step": 1411 + }, + { + "epoch": 0.31, + "grad_norm": 0.5620625702416668, + "learning_rate": 3.232144996113613e-05, + "loss": 0.0876, + "step": 1412 + }, + { + "epoch": 0.31, + "grad_norm": 0.6278011493027039, + "learning_rate": 3.231023941926213e-05, + "loss": 0.1235, + "step": 1413 + }, + { + "epoch": 0.31, + "grad_norm": 0.6123232800643602, + "learning_rate": 3.22990226471005e-05, + "loss": 0.1085, + "step": 1414 + }, + { + "epoch": 0.31, + "grad_norm": 0.5878673054578677, + "learning_rate": 3.2287799650328116e-05, + "loss": 0.1088, + "step": 1415 + }, + { + "epoch": 0.31, + "grad_norm": 0.5936979973757809, + "learning_rate": 3.2276570434625e-05, + "loss": 0.108, + "step": 1416 + }, + { + "epoch": 0.31, + "grad_norm": 0.5982325150532656, + "learning_rate": 3.226533500567433e-05, + "loss": 0.121, + "step": 1417 + }, + { + "epoch": 0.31, + "grad_norm": 0.5867558677097133, + "learning_rate": 3.2254093369162425e-05, + "loss": 0.115, + "step": 1418 + }, + { + "epoch": 0.31, + "grad_norm": 0.5545845473477063, + "learning_rate": 3.2242845530778755e-05, + "loss": 0.1124, + "step": 1419 + }, + { + "epoch": 0.31, + "grad_norm": 0.4871739458666332, + "learning_rate": 3.22315914962159e-05, + "loss": 0.0867, + "step": 1420 + }, + { + "epoch": 0.31, + "grad_norm": 0.4646669259868776, + "learning_rate": 3.2220331271169614e-05, + "loss": 0.1008, + "step": 1421 + }, + { + "epoch": 0.31, + "grad_norm": 0.4921344698236326, + "learning_rate": 3.220906486133876e-05, + "loss": 0.0877, + "step": 1422 + }, + { + "epoch": 0.31, + "grad_norm": 0.506246499649945, + "learning_rate": 3.219779227242534e-05, + "loss": 0.1104, + "step": 1423 + }, + { + "epoch": 0.31, + "grad_norm": 0.5522790387999251, + "learning_rate": 3.218651351013447e-05, + "loss": 0.0856, + "step": 1424 + }, + { + "epoch": 0.31, + "grad_norm": 0.571981261756722, + "learning_rate": 3.217522858017442e-05, + "loss": 0.0969, + "step": 1425 + }, + { + "epoch": 0.31, + "grad_norm": 0.6090202766293858, + "learning_rate": 3.216393748825654e-05, + "loss": 0.1176, + "step": 1426 + }, + { + "epoch": 0.31, + "grad_norm": 0.44838768752167724, + "learning_rate": 3.2152640240095335e-05, + "loss": 0.0705, + "step": 1427 + }, + { + "epoch": 0.31, + "grad_norm": 0.46354196491303296, + "learning_rate": 3.2141336841408406e-05, + "loss": 0.0671, + "step": 1428 + }, + { + "epoch": 0.31, + "grad_norm": 0.5880174468365021, + "learning_rate": 3.2130027297916476e-05, + "loss": 0.1175, + "step": 1429 + }, + { + "epoch": 0.31, + "grad_norm": 0.5015029372740631, + "learning_rate": 3.2118711615343366e-05, + "loss": 0.0792, + "step": 1430 + }, + { + "epoch": 0.31, + "grad_norm": 0.6066306502092649, + "learning_rate": 3.210738979941603e-05, + "loss": 0.1205, + "step": 1431 + }, + { + "epoch": 0.31, + "grad_norm": 0.5556678704800044, + "learning_rate": 3.2096061855864485e-05, + "loss": 0.0861, + "step": 1432 + }, + { + "epoch": 0.31, + "grad_norm": 0.5559779455558638, + "learning_rate": 3.2084727790421895e-05, + "loss": 0.1002, + "step": 1433 + }, + { + "epoch": 0.31, + "grad_norm": 0.5594356834272175, + "learning_rate": 3.207338760882448e-05, + "loss": 0.0945, + "step": 1434 + }, + { + "epoch": 0.32, + "grad_norm": 0.49659919521560014, + "learning_rate": 3.20620413168116e-05, + "loss": 0.0853, + "step": 1435 + }, + { + "epoch": 0.32, + "grad_norm": 0.5243465444052824, + "learning_rate": 3.205068892012565e-05, + "loss": 0.0922, + "step": 1436 + }, + { + "epoch": 0.32, + "grad_norm": 0.5947183943564954, + "learning_rate": 3.203933042451218e-05, + "loss": 0.1038, + "step": 1437 + }, + { + "epoch": 0.32, + "grad_norm": 0.5286617057434441, + "learning_rate": 3.202796583571977e-05, + "loss": 0.0944, + "step": 1438 + }, + { + "epoch": 0.32, + "grad_norm": 0.6296246613226489, + "learning_rate": 3.2016595159500127e-05, + "loss": 0.1377, + "step": 1439 + }, + { + "epoch": 0.32, + "grad_norm": 0.6802923646189153, + "learning_rate": 3.2005218401608006e-05, + "loss": 0.1245, + "step": 1440 + }, + { + "epoch": 0.32, + "grad_norm": 0.4712275674018901, + "learning_rate": 3.1993835567801266e-05, + "loss": 0.0807, + "step": 1441 + }, + { + "epoch": 0.32, + "grad_norm": 0.6099906647859015, + "learning_rate": 3.19824466638408e-05, + "loss": 0.1082, + "step": 1442 + }, + { + "epoch": 0.32, + "grad_norm": 0.5531651196773758, + "learning_rate": 3.1971051695490644e-05, + "loss": 0.1041, + "step": 1443 + }, + { + "epoch": 0.32, + "grad_norm": 0.5385036889218184, + "learning_rate": 3.195965066851784e-05, + "loss": 0.1112, + "step": 1444 + }, + { + "epoch": 0.32, + "grad_norm": 0.5480856709740979, + "learning_rate": 3.194824358869252e-05, + "loss": 0.0909, + "step": 1445 + }, + { + "epoch": 0.32, + "grad_norm": 0.5267635403138891, + "learning_rate": 3.1936830461787866e-05, + "loss": 0.1064, + "step": 1446 + }, + { + "epoch": 0.32, + "grad_norm": 0.5324786787734438, + "learning_rate": 3.192541129358014e-05, + "loss": 0.0897, + "step": 1447 + }, + { + "epoch": 0.32, + "grad_norm": 0.5094036410184004, + "learning_rate": 3.191398608984867e-05, + "loss": 0.093, + "step": 1448 + }, + { + "epoch": 0.32, + "grad_norm": 0.5502984927559366, + "learning_rate": 3.19025548563758e-05, + "loss": 0.1035, + "step": 1449 + }, + { + "epoch": 0.32, + "grad_norm": 0.579324277408475, + "learning_rate": 3.189111759894695e-05, + "loss": 0.1183, + "step": 1450 + }, + { + "epoch": 0.32, + "grad_norm": 0.5289617793855, + "learning_rate": 3.1879674323350594e-05, + "loss": 0.0944, + "step": 1451 + }, + { + "epoch": 0.32, + "grad_norm": 0.560668692826493, + "learning_rate": 3.186822503537823e-05, + "loss": 0.1117, + "step": 1452 + }, + { + "epoch": 0.32, + "grad_norm": 0.5176155796534162, + "learning_rate": 3.1856769740824426e-05, + "loss": 0.0911, + "step": 1453 + }, + { + "epoch": 0.32, + "grad_norm": 0.49774506343824937, + "learning_rate": 3.184530844548678e-05, + "loss": 0.0838, + "step": 1454 + }, + { + "epoch": 0.32, + "grad_norm": 0.512447042752987, + "learning_rate": 3.183384115516591e-05, + "loss": 0.09, + "step": 1455 + }, + { + "epoch": 0.32, + "grad_norm": 0.6195166856706205, + "learning_rate": 3.182236787566549e-05, + "loss": 0.1248, + "step": 1456 + }, + { + "epoch": 0.32, + "grad_norm": 0.49141038368318024, + "learning_rate": 3.181088861279222e-05, + "loss": 0.0838, + "step": 1457 + }, + { + "epoch": 0.32, + "grad_norm": 0.5814312908518064, + "learning_rate": 3.179940337235582e-05, + "loss": 0.1247, + "step": 1458 + }, + { + "epoch": 0.32, + "grad_norm": 0.5920081186495002, + "learning_rate": 3.178791216016904e-05, + "loss": 0.1222, + "step": 1459 + }, + { + "epoch": 0.32, + "grad_norm": 0.5004816119982474, + "learning_rate": 3.177641498204765e-05, + "loss": 0.0956, + "step": 1460 + }, + { + "epoch": 0.32, + "grad_norm": 0.682944398095459, + "learning_rate": 3.1764911843810456e-05, + "loss": 0.1432, + "step": 1461 + }, + { + "epoch": 0.32, + "grad_norm": 0.547596346725736, + "learning_rate": 3.175340275127925e-05, + "loss": 0.0909, + "step": 1462 + }, + { + "epoch": 0.32, + "grad_norm": 0.4748960029795314, + "learning_rate": 3.1741887710278874e-05, + "loss": 0.0804, + "step": 1463 + }, + { + "epoch": 0.32, + "grad_norm": 0.5348561413037483, + "learning_rate": 3.173036672663714e-05, + "loss": 0.0884, + "step": 1464 + }, + { + "epoch": 0.32, + "grad_norm": 0.5017013215085425, + "learning_rate": 3.17188398061849e-05, + "loss": 0.1001, + "step": 1465 + }, + { + "epoch": 0.32, + "grad_norm": 0.5777986223814436, + "learning_rate": 3.170730695475599e-05, + "loss": 0.126, + "step": 1466 + }, + { + "epoch": 0.32, + "grad_norm": 0.533350410659888, + "learning_rate": 3.1695768178187267e-05, + "loss": 0.1064, + "step": 1467 + }, + { + "epoch": 0.32, + "grad_norm": 0.5274072962020079, + "learning_rate": 3.168422348231857e-05, + "loss": 0.095, + "step": 1468 + }, + { + "epoch": 0.32, + "grad_norm": 0.5806139901619206, + "learning_rate": 3.1672672872992755e-05, + "loss": 0.1023, + "step": 1469 + }, + { + "epoch": 0.32, + "grad_norm": 0.5420230616118438, + "learning_rate": 3.166111635605564e-05, + "loss": 0.1033, + "step": 1470 + }, + { + "epoch": 0.32, + "grad_norm": 0.5674809224959466, + "learning_rate": 3.164955393735605e-05, + "loss": 0.0948, + "step": 1471 + }, + { + "epoch": 0.32, + "grad_norm": 0.5598604472015583, + "learning_rate": 3.1637985622745795e-05, + "loss": 0.0861, + "step": 1472 + }, + { + "epoch": 0.32, + "grad_norm": 0.48932312245861176, + "learning_rate": 3.1626411418079684e-05, + "loss": 0.0933, + "step": 1473 + }, + { + "epoch": 0.32, + "grad_norm": 0.49542347239489665, + "learning_rate": 3.1614831329215475e-05, + "loss": 0.0905, + "step": 1474 + }, + { + "epoch": 0.32, + "grad_norm": 0.5307744365947702, + "learning_rate": 3.160324536201393e-05, + "loss": 0.0922, + "step": 1475 + }, + { + "epoch": 0.32, + "grad_norm": 0.6242590804075524, + "learning_rate": 3.159165352233879e-05, + "loss": 0.1098, + "step": 1476 + }, + { + "epoch": 0.32, + "grad_norm": 0.5361180260014112, + "learning_rate": 3.158005581605673e-05, + "loss": 0.0815, + "step": 1477 + }, + { + "epoch": 0.32, + "grad_norm": 0.5969933467395213, + "learning_rate": 3.156845224903745e-05, + "loss": 0.1015, + "step": 1478 + }, + { + "epoch": 0.32, + "grad_norm": 0.5662072456926389, + "learning_rate": 3.1556842827153556e-05, + "loss": 0.1035, + "step": 1479 + }, + { + "epoch": 0.33, + "grad_norm": 0.5801233344651101, + "learning_rate": 3.154522755628067e-05, + "loss": 0.0885, + "step": 1480 + }, + { + "epoch": 0.33, + "grad_norm": 0.5516999070831768, + "learning_rate": 3.153360644229735e-05, + "loss": 0.0943, + "step": 1481 + }, + { + "epoch": 0.33, + "grad_norm": 0.5700633928952555, + "learning_rate": 3.1521979491085095e-05, + "loss": 0.1038, + "step": 1482 + }, + { + "epoch": 0.33, + "grad_norm": 0.5798399465153998, + "learning_rate": 3.15103467085284e-05, + "loss": 0.1173, + "step": 1483 + }, + { + "epoch": 0.33, + "grad_norm": 0.5076361716853368, + "learning_rate": 3.149870810051467e-05, + "loss": 0.0959, + "step": 1484 + }, + { + "epoch": 0.33, + "grad_norm": 0.5473599327454586, + "learning_rate": 3.148706367293428e-05, + "loss": 0.0934, + "step": 1485 + }, + { + "epoch": 0.33, + "grad_norm": 0.5861413728164516, + "learning_rate": 3.147541343168055e-05, + "loss": 0.1274, + "step": 1486 + }, + { + "epoch": 0.33, + "grad_norm": 0.5542745530605213, + "learning_rate": 3.146375738264975e-05, + "loss": 0.0949, + "step": 1487 + }, + { + "epoch": 0.33, + "grad_norm": 0.5921784986551651, + "learning_rate": 3.145209553174105e-05, + "loss": 0.1391, + "step": 1488 + }, + { + "epoch": 0.33, + "grad_norm": 0.5181984692655839, + "learning_rate": 3.14404278848566e-05, + "loss": 0.0863, + "step": 1489 + }, + { + "epoch": 0.33, + "grad_norm": 0.5962280462130014, + "learning_rate": 3.142875444790147e-05, + "loss": 0.0991, + "step": 1490 + }, + { + "epoch": 0.33, + "grad_norm": 0.48386436125900834, + "learning_rate": 3.141707522678365e-05, + "loss": 0.0813, + "step": 1491 + }, + { + "epoch": 0.33, + "grad_norm": 0.48607849700451455, + "learning_rate": 3.140539022741408e-05, + "loss": 0.0923, + "step": 1492 + }, + { + "epoch": 0.33, + "grad_norm": 0.4443590095314898, + "learning_rate": 3.139369945570659e-05, + "loss": 0.0806, + "step": 1493 + }, + { + "epoch": 0.33, + "grad_norm": 0.605275350877237, + "learning_rate": 3.138200291757797e-05, + "loss": 0.1126, + "step": 1494 + }, + { + "epoch": 0.33, + "grad_norm": 0.5292985641621455, + "learning_rate": 3.137030061894789e-05, + "loss": 0.0933, + "step": 1495 + }, + { + "epoch": 0.33, + "grad_norm": 0.5313127705010521, + "learning_rate": 3.135859256573898e-05, + "loss": 0.0958, + "step": 1496 + }, + { + "epoch": 0.33, + "grad_norm": 0.5485840930268028, + "learning_rate": 3.134687876387673e-05, + "loss": 0.096, + "step": 1497 + }, + { + "epoch": 0.33, + "grad_norm": 0.5495443320540766, + "learning_rate": 3.1335159219289585e-05, + "loss": 0.0903, + "step": 1498 + }, + { + "epoch": 0.33, + "grad_norm": 0.49349570123777126, + "learning_rate": 3.132343393790887e-05, + "loss": 0.0879, + "step": 1499 + }, + { + "epoch": 0.33, + "grad_norm": 0.48489367494468105, + "learning_rate": 3.131170292566883e-05, + "loss": 0.092, + "step": 1500 + }, + { + "epoch": 0.33, + "grad_norm": 0.5636963836988381, + "learning_rate": 3.12999661885066e-05, + "loss": 0.1014, + "step": 1501 + }, + { + "epoch": 0.33, + "grad_norm": 0.5906622809294992, + "learning_rate": 3.12882237323622e-05, + "loss": 0.107, + "step": 1502 + }, + { + "epoch": 0.33, + "grad_norm": 0.49269801109528827, + "learning_rate": 3.127647556317858e-05, + "loss": 0.0805, + "step": 1503 + }, + { + "epoch": 0.33, + "grad_norm": 0.5129773917096273, + "learning_rate": 3.126472168690156e-05, + "loss": 0.0737, + "step": 1504 + }, + { + "epoch": 0.33, + "grad_norm": 0.4865049017917022, + "learning_rate": 3.125296210947983e-05, + "loss": 0.0882, + "step": 1505 + }, + { + "epoch": 0.33, + "grad_norm": 0.5390699634864613, + "learning_rate": 3.1241196836865e-05, + "loss": 0.0939, + "step": 1506 + }, + { + "epoch": 0.33, + "grad_norm": 0.4798041608770732, + "learning_rate": 3.1229425875011534e-05, + "loss": 0.1018, + "step": 1507 + }, + { + "epoch": 0.33, + "grad_norm": 0.47996977350808995, + "learning_rate": 3.12176492298768e-05, + "loss": 0.0691, + "step": 1508 + }, + { + "epoch": 0.33, + "grad_norm": 0.44069173253425836, + "learning_rate": 3.120586690742102e-05, + "loss": 0.0631, + "step": 1509 + }, + { + "epoch": 0.33, + "grad_norm": 0.5452199141375765, + "learning_rate": 3.119407891360732e-05, + "loss": 0.1006, + "step": 1510 + }, + { + "epoch": 0.33, + "grad_norm": 0.5161302576367768, + "learning_rate": 3.118228525440165e-05, + "loss": 0.0915, + "step": 1511 + }, + { + "epoch": 0.33, + "grad_norm": 0.49472325169915915, + "learning_rate": 3.1170485935772864e-05, + "loss": 0.0884, + "step": 1512 + }, + { + "epoch": 0.33, + "grad_norm": 0.5517885722810728, + "learning_rate": 3.1158680963692676e-05, + "loss": 0.0884, + "step": 1513 + }, + { + "epoch": 0.33, + "grad_norm": 0.46487889838167595, + "learning_rate": 3.114687034413564e-05, + "loss": 0.0756, + "step": 1514 + }, + { + "epoch": 0.33, + "grad_norm": 0.5892782721730733, + "learning_rate": 3.1135054083079194e-05, + "loss": 0.1095, + "step": 1515 + }, + { + "epoch": 0.33, + "grad_norm": 0.513953012326265, + "learning_rate": 3.112323218650362e-05, + "loss": 0.0992, + "step": 1516 + }, + { + "epoch": 0.33, + "grad_norm": 0.4966780008099141, + "learning_rate": 3.111140466039205e-05, + "loss": 0.08, + "step": 1517 + }, + { + "epoch": 0.33, + "grad_norm": 0.494700088726365, + "learning_rate": 3.1099571510730466e-05, + "loss": 0.0836, + "step": 1518 + }, + { + "epoch": 0.33, + "grad_norm": 0.46990742839572375, + "learning_rate": 3.1087732743507704e-05, + "loss": 0.0791, + "step": 1519 + }, + { + "epoch": 0.33, + "grad_norm": 0.45231846028789485, + "learning_rate": 3.107588836471542e-05, + "loss": 0.0751, + "step": 1520 + }, + { + "epoch": 0.33, + "grad_norm": 0.5732737179976433, + "learning_rate": 3.106403838034815e-05, + "loss": 0.0994, + "step": 1521 + }, + { + "epoch": 0.33, + "grad_norm": 0.555422497418479, + "learning_rate": 3.1052182796403225e-05, + "loss": 0.1109, + "step": 1522 + }, + { + "epoch": 0.33, + "grad_norm": 0.44664638246672456, + "learning_rate": 3.104032161888084e-05, + "loss": 0.0782, + "step": 1523 + }, + { + "epoch": 0.33, + "grad_norm": 0.49068254145545215, + "learning_rate": 3.1028454853784e-05, + "loss": 0.0817, + "step": 1524 + }, + { + "epoch": 0.33, + "grad_norm": 0.5007696784195629, + "learning_rate": 3.101658250711856e-05, + "loss": 0.0835, + "step": 1525 + }, + { + "epoch": 0.34, + "grad_norm": 0.47998469089890194, + "learning_rate": 3.100470458489318e-05, + "loss": 0.0867, + "step": 1526 + }, + { + "epoch": 0.34, + "grad_norm": 0.5557227346756548, + "learning_rate": 3.099282109311934e-05, + "loss": 0.1048, + "step": 1527 + }, + { + "epoch": 0.34, + "grad_norm": 0.5085157659837836, + "learning_rate": 3.098093203781137e-05, + "loss": 0.1068, + "step": 1528 + }, + { + "epoch": 0.34, + "grad_norm": 0.4919272294724993, + "learning_rate": 3.0969037424986376e-05, + "loss": 0.0793, + "step": 1529 + }, + { + "epoch": 0.34, + "grad_norm": 0.4970668800100589, + "learning_rate": 3.09571372606643e-05, + "loss": 0.0718, + "step": 1530 + }, + { + "epoch": 0.34, + "grad_norm": 0.6050020449580161, + "learning_rate": 3.09452315508679e-05, + "loss": 0.1104, + "step": 1531 + }, + { + "epoch": 0.34, + "grad_norm": 0.5542740153786689, + "learning_rate": 3.09333203016227e-05, + "loss": 0.0835, + "step": 1532 + }, + { + "epoch": 0.34, + "grad_norm": 0.5153961251662009, + "learning_rate": 3.0921403518957076e-05, + "loss": 0.0965, + "step": 1533 + }, + { + "epoch": 0.34, + "grad_norm": 0.6651957547171047, + "learning_rate": 3.0909481208902185e-05, + "loss": 0.1351, + "step": 1534 + }, + { + "epoch": 0.34, + "grad_norm": 0.5472869956005133, + "learning_rate": 3.089755337749198e-05, + "loss": 0.0888, + "step": 1535 + }, + { + "epoch": 0.34, + "grad_norm": 0.6644839351985476, + "learning_rate": 3.08856200307632e-05, + "loss": 0.114, + "step": 1536 + }, + { + "epoch": 0.34, + "grad_norm": 0.5567185196834562, + "learning_rate": 3.08736811747554e-05, + "loss": 0.0958, + "step": 1537 + }, + { + "epoch": 0.34, + "grad_norm": 0.5060256625780866, + "learning_rate": 3.08617368155109e-05, + "loss": 0.0637, + "step": 1538 + }, + { + "epoch": 0.34, + "grad_norm": 0.5294189030931142, + "learning_rate": 3.084978695907482e-05, + "loss": 0.0915, + "step": 1539 + }, + { + "epoch": 0.34, + "grad_norm": 0.5011807744937476, + "learning_rate": 3.0837831611495036e-05, + "loss": 0.0851, + "step": 1540 + }, + { + "epoch": 0.34, + "grad_norm": 0.5303344691440417, + "learning_rate": 3.082587077882225e-05, + "loss": 0.0788, + "step": 1541 + }, + { + "epoch": 0.34, + "grad_norm": 0.5673118455006193, + "learning_rate": 3.081390446710989e-05, + "loss": 0.0943, + "step": 1542 + }, + { + "epoch": 0.34, + "grad_norm": 0.5125749137834809, + "learning_rate": 3.080193268241419e-05, + "loss": 0.0832, + "step": 1543 + }, + { + "epoch": 0.34, + "grad_norm": 0.5498719250505671, + "learning_rate": 3.0789955430794145e-05, + "loss": 0.1065, + "step": 1544 + }, + { + "epoch": 0.34, + "grad_norm": 0.5784083522581387, + "learning_rate": 3.077797271831152e-05, + "loss": 0.104, + "step": 1545 + }, + { + "epoch": 0.34, + "grad_norm": 0.5247577414829844, + "learning_rate": 3.076598455103081e-05, + "loss": 0.0856, + "step": 1546 + }, + { + "epoch": 0.34, + "grad_norm": 0.5081610437163209, + "learning_rate": 3.0753990935019345e-05, + "loss": 0.0854, + "step": 1547 + }, + { + "epoch": 0.34, + "grad_norm": 0.47914656479452084, + "learning_rate": 3.074199187634713e-05, + "loss": 0.0884, + "step": 1548 + }, + { + "epoch": 0.34, + "grad_norm": 0.4793126533479205, + "learning_rate": 3.072998738108699e-05, + "loss": 0.084, + "step": 1549 + }, + { + "epoch": 0.34, + "grad_norm": 0.5118582440755662, + "learning_rate": 3.071797745531445e-05, + "loss": 0.0909, + "step": 1550 + }, + { + "epoch": 0.34, + "grad_norm": 0.5628049871355234, + "learning_rate": 3.070596210510783e-05, + "loss": 0.1084, + "step": 1551 + }, + { + "epoch": 0.34, + "grad_norm": 0.6168438884461107, + "learning_rate": 3.069394133654815e-05, + "loss": 0.1213, + "step": 1552 + }, + { + "epoch": 0.34, + "grad_norm": 0.5079984177861392, + "learning_rate": 3.068191515571921e-05, + "loss": 0.0857, + "step": 1553 + }, + { + "epoch": 0.34, + "grad_norm": 0.46789837405160123, + "learning_rate": 3.066988356870752e-05, + "loss": 0.0895, + "step": 1554 + }, + { + "epoch": 0.34, + "grad_norm": 0.5426420148085699, + "learning_rate": 3.0657846581602355e-05, + "loss": 0.1254, + "step": 1555 + }, + { + "epoch": 0.34, + "grad_norm": 0.558099434620568, + "learning_rate": 3.06458042004957e-05, + "loss": 0.0789, + "step": 1556 + }, + { + "epoch": 0.34, + "grad_norm": 0.4035657639202378, + "learning_rate": 3.063375643148228e-05, + "loss": 0.062, + "step": 1557 + }, + { + "epoch": 0.34, + "grad_norm": 0.5096574573562189, + "learning_rate": 3.062170328065954e-05, + "loss": 0.1382, + "step": 1558 + }, + { + "epoch": 0.34, + "grad_norm": 0.4840329254405113, + "learning_rate": 3.060964475412766e-05, + "loss": 0.0796, + "step": 1559 + }, + { + "epoch": 0.34, + "grad_norm": 0.49819952925350103, + "learning_rate": 3.059758085798954e-05, + "loss": 0.0909, + "step": 1560 + }, + { + "epoch": 0.34, + "grad_norm": 0.5484340243759227, + "learning_rate": 3.058551159835078e-05, + "loss": 0.0892, + "step": 1561 + }, + { + "epoch": 0.34, + "grad_norm": 0.553949052585398, + "learning_rate": 3.057343698131971e-05, + "loss": 0.0914, + "step": 1562 + }, + { + "epoch": 0.34, + "grad_norm": 0.5722146906660561, + "learning_rate": 3.056135701300736e-05, + "loss": 0.0987, + "step": 1563 + }, + { + "epoch": 0.34, + "grad_norm": 0.5633963225303233, + "learning_rate": 3.054927169952749e-05, + "loss": 0.103, + "step": 1564 + }, + { + "epoch": 0.34, + "grad_norm": 0.6369758742073995, + "learning_rate": 3.053718104699654e-05, + "loss": 0.1306, + "step": 1565 + }, + { + "epoch": 0.34, + "grad_norm": 0.5158833810020567, + "learning_rate": 3.052508506153368e-05, + "loss": 0.0986, + "step": 1566 + }, + { + "epoch": 0.34, + "grad_norm": 0.46353267453790725, + "learning_rate": 3.051298374926074e-05, + "loss": 0.0915, + "step": 1567 + }, + { + "epoch": 0.34, + "grad_norm": 0.5066324461289279, + "learning_rate": 3.0500877116302284e-05, + "loss": 0.0788, + "step": 1568 + }, + { + "epoch": 0.34, + "grad_norm": 0.5822538423849566, + "learning_rate": 3.0488765168785544e-05, + "loss": 0.111, + "step": 1569 + }, + { + "epoch": 0.34, + "grad_norm": 0.5673723334758027, + "learning_rate": 3.047664791284046e-05, + "loss": 0.1206, + "step": 1570 + }, + { + "epoch": 0.35, + "grad_norm": 0.5087601549133268, + "learning_rate": 3.046452535459963e-05, + "loss": 0.0846, + "step": 1571 + }, + { + "epoch": 0.35, + "grad_norm": 0.5376127346602497, + "learning_rate": 3.045239750019839e-05, + "loss": 0.0835, + "step": 1572 + }, + { + "epoch": 0.35, + "grad_norm": 0.5174346149067873, + "learning_rate": 3.044026435577469e-05, + "loss": 0.0971, + "step": 1573 + }, + { + "epoch": 0.35, + "grad_norm": 0.5149142539760165, + "learning_rate": 3.0428125927469198e-05, + "loss": 0.0895, + "step": 1574 + }, + { + "epoch": 0.35, + "grad_norm": 0.5280456957441996, + "learning_rate": 3.0415982221425257e-05, + "loss": 0.1009, + "step": 1575 + }, + { + "epoch": 0.35, + "grad_norm": 0.4897789378386961, + "learning_rate": 3.040383324378885e-05, + "loss": 0.0722, + "step": 1576 + }, + { + "epoch": 0.35, + "grad_norm": 0.5358871703246333, + "learning_rate": 3.0391679000708673e-05, + "loss": 0.1011, + "step": 1577 + }, + { + "epoch": 0.35, + "grad_norm": 0.5310148953074585, + "learning_rate": 3.0379519498336054e-05, + "loss": 0.0898, + "step": 1578 + }, + { + "epoch": 0.35, + "grad_norm": 0.5301812866397277, + "learning_rate": 3.036735474282498e-05, + "loss": 0.0927, + "step": 1579 + }, + { + "epoch": 0.35, + "grad_norm": 0.6411726169481955, + "learning_rate": 3.035518474033212e-05, + "loss": 0.1478, + "step": 1580 + }, + { + "epoch": 0.35, + "grad_norm": 0.5214365676511248, + "learning_rate": 3.0343009497016787e-05, + "loss": 0.0989, + "step": 1581 + }, + { + "epoch": 0.35, + "grad_norm": 0.5757916505202934, + "learning_rate": 3.0330829019040945e-05, + "loss": 0.1243, + "step": 1582 + }, + { + "epoch": 0.35, + "grad_norm": 0.5416318503218875, + "learning_rate": 3.0318643312569204e-05, + "loss": 0.1017, + "step": 1583 + }, + { + "epoch": 0.35, + "grad_norm": 0.5471932166567245, + "learning_rate": 3.0306452383768833e-05, + "loss": 0.0784, + "step": 1584 + }, + { + "epoch": 0.35, + "grad_norm": 0.5177630400072355, + "learning_rate": 3.0294256238809727e-05, + "loss": 0.1019, + "step": 1585 + }, + { + "epoch": 0.35, + "grad_norm": 0.45850777738104637, + "learning_rate": 3.0282054883864434e-05, + "loss": 0.0764, + "step": 1586 + }, + { + "epoch": 0.35, + "grad_norm": 0.4849117443070667, + "learning_rate": 3.026984832510814e-05, + "loss": 0.0741, + "step": 1587 + }, + { + "epoch": 0.35, + "grad_norm": 0.5921277169521008, + "learning_rate": 3.025763656871865e-05, + "loss": 0.1184, + "step": 1588 + }, + { + "epoch": 0.35, + "grad_norm": 0.6143843819104651, + "learning_rate": 3.024541962087641e-05, + "loss": 0.1227, + "step": 1589 + }, + { + "epoch": 0.35, + "grad_norm": 0.4186554019034123, + "learning_rate": 3.0233197487764494e-05, + "loss": 0.0758, + "step": 1590 + }, + { + "epoch": 0.35, + "grad_norm": 0.4419283184812778, + "learning_rate": 3.0220970175568604e-05, + "loss": 0.0936, + "step": 1591 + }, + { + "epoch": 0.35, + "grad_norm": 0.49122642356168894, + "learning_rate": 3.020873769047705e-05, + "loss": 0.1048, + "step": 1592 + }, + { + "epoch": 0.35, + "grad_norm": 0.5101086861605791, + "learning_rate": 3.019650003868077e-05, + "loss": 0.0856, + "step": 1593 + }, + { + "epoch": 0.35, + "grad_norm": 0.5561429928068005, + "learning_rate": 3.0184257226373317e-05, + "loss": 0.0963, + "step": 1594 + }, + { + "epoch": 0.35, + "grad_norm": 0.4809035682977758, + "learning_rate": 3.0172009259750852e-05, + "loss": 0.0893, + "step": 1595 + }, + { + "epoch": 0.35, + "grad_norm": 0.5498611936108522, + "learning_rate": 3.015975614501214e-05, + "loss": 0.1033, + "step": 1596 + }, + { + "epoch": 0.35, + "grad_norm": 0.5622349220448989, + "learning_rate": 3.0147497888358564e-05, + "loss": 0.0949, + "step": 1597 + }, + { + "epoch": 0.35, + "grad_norm": 0.479641613334266, + "learning_rate": 3.0135234495994107e-05, + "loss": 0.0908, + "step": 1598 + }, + { + "epoch": 0.35, + "grad_norm": 0.5252805686745193, + "learning_rate": 3.0122965974125335e-05, + "loss": 0.0927, + "step": 1599 + }, + { + "epoch": 0.35, + "grad_norm": 0.598178610317444, + "learning_rate": 3.0110692328961435e-05, + "loss": 0.118, + "step": 1600 + }, + { + "epoch": 0.35, + "grad_norm": 0.5620250816314273, + "learning_rate": 3.0098413566714165e-05, + "loss": 0.1106, + "step": 1601 + }, + { + "epoch": 0.35, + "grad_norm": 0.5314593183248383, + "learning_rate": 3.008612969359788e-05, + "loss": 0.0918, + "step": 1602 + }, + { + "epoch": 0.35, + "grad_norm": 0.4614640800284727, + "learning_rate": 3.0073840715829532e-05, + "loss": 0.0706, + "step": 1603 + }, + { + "epoch": 0.35, + "grad_norm": 0.4640629244972625, + "learning_rate": 3.006154663962865e-05, + "loss": 0.0763, + "step": 1604 + }, + { + "epoch": 0.35, + "grad_norm": 0.4099952563441726, + "learning_rate": 3.0049247471217326e-05, + "loss": 0.0592, + "step": 1605 + }, + { + "epoch": 0.35, + "grad_norm": 0.5684856523583793, + "learning_rate": 3.0036943216820256e-05, + "loss": 0.1109, + "step": 1606 + }, + { + "epoch": 0.35, + "grad_norm": 0.47359148612090135, + "learning_rate": 3.00246338826647e-05, + "loss": 0.0738, + "step": 1607 + }, + { + "epoch": 0.35, + "grad_norm": 0.47715175657175857, + "learning_rate": 3.001231947498048e-05, + "loss": 0.0643, + "step": 1608 + }, + { + "epoch": 0.35, + "grad_norm": 0.5851278178459499, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.1059, + "step": 1609 + }, + { + "epoch": 0.35, + "grad_norm": 0.6081181794555448, + "learning_rate": 2.998767546395822e-05, + "loss": 0.1035, + "step": 1610 + }, + { + "epoch": 0.35, + "grad_norm": 0.47406968880158457, + "learning_rate": 2.9975345873092662e-05, + "loss": 0.0911, + "step": 1611 + }, + { + "epoch": 0.35, + "grad_norm": 0.511908372296521, + "learning_rate": 2.996301123364341e-05, + "loss": 0.0761, + "step": 1612 + }, + { + "epoch": 0.35, + "grad_norm": 0.5312066902009361, + "learning_rate": 2.9950671551853094e-05, + "loss": 0.0902, + "step": 1613 + }, + { + "epoch": 0.35, + "grad_norm": 0.5479439778163137, + "learning_rate": 2.9938326833966914e-05, + "loss": 0.1056, + "step": 1614 + }, + { + "epoch": 0.35, + "grad_norm": 0.5139200169640844, + "learning_rate": 2.992597708623259e-05, + "loss": 0.1016, + "step": 1615 + }, + { + "epoch": 0.35, + "grad_norm": 0.4303417426917004, + "learning_rate": 2.991362231490042e-05, + "loss": 0.0837, + "step": 1616 + }, + { + "epoch": 0.36, + "grad_norm": 0.5217671859754873, + "learning_rate": 2.990126252622323e-05, + "loss": 0.091, + "step": 1617 + }, + { + "epoch": 0.36, + "grad_norm": 0.5249282333731543, + "learning_rate": 2.9888897726456374e-05, + "loss": 0.0917, + "step": 1618 + }, + { + "epoch": 0.36, + "grad_norm": 0.5065982386168, + "learning_rate": 2.9876527921857756e-05, + "loss": 0.1009, + "step": 1619 + }, + { + "epoch": 0.36, + "grad_norm": 0.5157414796063478, + "learning_rate": 2.986415311868782e-05, + "loss": 0.091, + "step": 1620 + }, + { + "epoch": 0.36, + "grad_norm": 0.4295084493558709, + "learning_rate": 2.985177332320952e-05, + "loss": 0.0606, + "step": 1621 + }, + { + "epoch": 0.36, + "grad_norm": 0.5135798788910292, + "learning_rate": 2.9839388541688352e-05, + "loss": 0.0907, + "step": 1622 + }, + { + "epoch": 0.36, + "grad_norm": 0.5477632937661516, + "learning_rate": 2.9826998780392324e-05, + "loss": 0.1087, + "step": 1623 + }, + { + "epoch": 0.36, + "grad_norm": 0.5377968013052618, + "learning_rate": 2.9814604045591974e-05, + "loss": 0.082, + "step": 1624 + }, + { + "epoch": 0.36, + "grad_norm": 0.4219926210962844, + "learning_rate": 2.980220434356035e-05, + "loss": 0.0731, + "step": 1625 + }, + { + "epoch": 0.36, + "grad_norm": 0.45800952057021754, + "learning_rate": 2.9789799680573014e-05, + "loss": 0.0815, + "step": 1626 + }, + { + "epoch": 0.36, + "grad_norm": 0.4190818901895578, + "learning_rate": 2.9777390062908056e-05, + "loss": 0.0612, + "step": 1627 + }, + { + "epoch": 0.36, + "grad_norm": 0.654880934842561, + "learning_rate": 2.976497549684605e-05, + "loss": 0.0936, + "step": 1628 + }, + { + "epoch": 0.36, + "grad_norm": 0.44223417433418194, + "learning_rate": 2.9752555988670084e-05, + "loss": 0.064, + "step": 1629 + }, + { + "epoch": 0.36, + "grad_norm": 0.5239911069788755, + "learning_rate": 2.9740131544665748e-05, + "loss": 0.0842, + "step": 1630 + }, + { + "epoch": 0.36, + "grad_norm": 0.4960591161457972, + "learning_rate": 2.9727702171121125e-05, + "loss": 0.082, + "step": 1631 + }, + { + "epoch": 0.36, + "grad_norm": 0.4905212900166773, + "learning_rate": 2.9715267874326805e-05, + "loss": 0.0901, + "step": 1632 + }, + { + "epoch": 0.36, + "grad_norm": 0.46224708730131725, + "learning_rate": 2.970282866057586e-05, + "loss": 0.0767, + "step": 1633 + }, + { + "epoch": 0.36, + "grad_norm": 0.5464459507312848, + "learning_rate": 2.969038453616385e-05, + "loss": 0.0953, + "step": 1634 + }, + { + "epoch": 0.36, + "grad_norm": 0.4984873899167725, + "learning_rate": 2.9677935507388817e-05, + "loss": 0.0708, + "step": 1635 + }, + { + "epoch": 0.36, + "grad_norm": 0.534677360835388, + "learning_rate": 2.96654815805513e-05, + "loss": 0.0912, + "step": 1636 + }, + { + "epoch": 0.36, + "grad_norm": 0.4562672481150706, + "learning_rate": 2.965302276195431e-05, + "loss": 0.0751, + "step": 1637 + }, + { + "epoch": 0.36, + "grad_norm": 0.37727624751445854, + "learning_rate": 2.9640559057903325e-05, + "loss": 0.0692, + "step": 1638 + }, + { + "epoch": 0.36, + "grad_norm": 0.47634405851813993, + "learning_rate": 2.9628090474706304e-05, + "loss": 0.088, + "step": 1639 + }, + { + "epoch": 0.36, + "grad_norm": 0.45687423356353307, + "learning_rate": 2.9615617018673663e-05, + "loss": 0.0689, + "step": 1640 + }, + { + "epoch": 0.36, + "grad_norm": 0.4847616221641512, + "learning_rate": 2.9603138696118315e-05, + "loss": 0.0768, + "step": 1641 + }, + { + "epoch": 0.36, + "grad_norm": 0.4417883982633577, + "learning_rate": 2.9590655513355598e-05, + "loss": 0.0727, + "step": 1642 + }, + { + "epoch": 0.36, + "grad_norm": 0.44089015395594955, + "learning_rate": 2.957816747670334e-05, + "loss": 0.06, + "step": 1643 + }, + { + "epoch": 0.36, + "grad_norm": 0.42681674855841306, + "learning_rate": 2.956567459248181e-05, + "loss": 0.0702, + "step": 1644 + }, + { + "epoch": 0.36, + "grad_norm": 0.566858413689165, + "learning_rate": 2.9553176867013714e-05, + "loss": 0.102, + "step": 1645 + }, + { + "epoch": 0.36, + "grad_norm": 0.5197223799349769, + "learning_rate": 2.9540674306624262e-05, + "loss": 0.0942, + "step": 1646 + }, + { + "epoch": 0.36, + "grad_norm": 0.5280340692011758, + "learning_rate": 2.9528166917641048e-05, + "loss": 0.0882, + "step": 1647 + }, + { + "epoch": 0.36, + "grad_norm": 0.5924065599923164, + "learning_rate": 2.951565470639415e-05, + "loss": 0.1015, + "step": 1648 + }, + { + "epoch": 0.36, + "grad_norm": 0.5174930718552494, + "learning_rate": 2.9503137679216073e-05, + "loss": 0.0926, + "step": 1649 + }, + { + "epoch": 0.36, + "grad_norm": 0.5359374699717699, + "learning_rate": 2.9490615842441764e-05, + "loss": 0.0907, + "step": 1650 + }, + { + "epoch": 0.36, + "grad_norm": 0.4148972931125151, + "learning_rate": 2.94780892024086e-05, + "loss": 0.0677, + "step": 1651 + }, + { + "epoch": 0.36, + "grad_norm": 0.5461000946942457, + "learning_rate": 2.9465557765456387e-05, + "loss": 0.0973, + "step": 1652 + }, + { + "epoch": 0.36, + "grad_norm": 0.4813988529715445, + "learning_rate": 2.9453021537927363e-05, + "loss": 0.077, + "step": 1653 + }, + { + "epoch": 0.36, + "grad_norm": 0.46848046182768466, + "learning_rate": 2.9440480526166193e-05, + "loss": 0.0686, + "step": 1654 + }, + { + "epoch": 0.36, + "grad_norm": 0.5593676507989175, + "learning_rate": 2.9427934736519962e-05, + "loss": 0.0954, + "step": 1655 + }, + { + "epoch": 0.36, + "grad_norm": 0.5000285200091341, + "learning_rate": 2.9415384175338154e-05, + "loss": 0.0729, + "step": 1656 + }, + { + "epoch": 0.36, + "grad_norm": 0.4873686387291929, + "learning_rate": 2.9402828848972706e-05, + "loss": 0.0838, + "step": 1657 + }, + { + "epoch": 0.36, + "grad_norm": 0.4687000724905962, + "learning_rate": 2.9390268763777938e-05, + "loss": 0.0793, + "step": 1658 + }, + { + "epoch": 0.36, + "grad_norm": 0.47911341170825583, + "learning_rate": 2.937770392611058e-05, + "loss": 0.089, + "step": 1659 + }, + { + "epoch": 0.36, + "grad_norm": 0.5289099148013381, + "learning_rate": 2.9365134342329783e-05, + "loss": 0.0908, + "step": 1660 + }, + { + "epoch": 0.36, + "grad_norm": 0.5187240295343717, + "learning_rate": 2.935256001879709e-05, + "loss": 0.0853, + "step": 1661 + }, + { + "epoch": 0.37, + "grad_norm": 0.5361045176781944, + "learning_rate": 2.9339980961876434e-05, + "loss": 0.1128, + "step": 1662 + }, + { + "epoch": 0.37, + "grad_norm": 0.5844729876855916, + "learning_rate": 2.932739717793416e-05, + "loss": 0.0931, + "step": 1663 + }, + { + "epoch": 0.37, + "grad_norm": 0.48204295918542, + "learning_rate": 2.9314808673338997e-05, + "loss": 0.0693, + "step": 1664 + }, + { + "epoch": 0.37, + "grad_norm": 0.5227459566825613, + "learning_rate": 2.9302215454462063e-05, + "loss": 0.0818, + "step": 1665 + }, + { + "epoch": 0.37, + "grad_norm": 0.49666751512139234, + "learning_rate": 2.928961752767686e-05, + "loss": 0.0805, + "step": 1666 + }, + { + "epoch": 0.37, + "grad_norm": 0.5397645000579105, + "learning_rate": 2.9277014899359284e-05, + "loss": 0.0955, + "step": 1667 + }, + { + "epoch": 0.37, + "grad_norm": 0.49132051250609765, + "learning_rate": 2.92644075758876e-05, + "loss": 0.0943, + "step": 1668 + }, + { + "epoch": 0.37, + "grad_norm": 0.49830369498658783, + "learning_rate": 2.9251795563642445e-05, + "loss": 0.0933, + "step": 1669 + }, + { + "epoch": 0.37, + "grad_norm": 0.4875517622048043, + "learning_rate": 2.923917886900685e-05, + "loss": 0.0717, + "step": 1670 + }, + { + "epoch": 0.37, + "grad_norm": 0.45240061506285206, + "learning_rate": 2.922655749836618e-05, + "loss": 0.0757, + "step": 1671 + }, + { + "epoch": 0.37, + "grad_norm": 0.4454040265973597, + "learning_rate": 2.921393145810821e-05, + "loss": 0.0709, + "step": 1672 + }, + { + "epoch": 0.37, + "grad_norm": 0.511675650841848, + "learning_rate": 2.9201300754623046e-05, + "loss": 0.0962, + "step": 1673 + }, + { + "epoch": 0.37, + "grad_norm": 0.5622649231419086, + "learning_rate": 2.9188665394303163e-05, + "loss": 0.1, + "step": 1674 + }, + { + "epoch": 0.37, + "grad_norm": 0.5357272311190564, + "learning_rate": 2.9176025383543395e-05, + "loss": 0.0753, + "step": 1675 + }, + { + "epoch": 0.37, + "grad_norm": 0.4992327167306387, + "learning_rate": 2.916338072874093e-05, + "loss": 0.0919, + "step": 1676 + }, + { + "epoch": 0.37, + "grad_norm": 0.5875316043225938, + "learning_rate": 2.915073143629531e-05, + "loss": 0.111, + "step": 1677 + }, + { + "epoch": 0.37, + "grad_norm": 0.5132310956027424, + "learning_rate": 2.9138077512608417e-05, + "loss": 0.0993, + "step": 1678 + }, + { + "epoch": 0.37, + "grad_norm": 0.4525387029271844, + "learning_rate": 2.9125418964084474e-05, + "loss": 0.0912, + "step": 1679 + }, + { + "epoch": 0.37, + "grad_norm": 0.5493623027701254, + "learning_rate": 2.9112755797130052e-05, + "loss": 0.1015, + "step": 1680 + }, + { + "epoch": 0.37, + "grad_norm": 0.47496973493428624, + "learning_rate": 2.910008801815406e-05, + "loss": 0.0692, + "step": 1681 + }, + { + "epoch": 0.37, + "grad_norm": 0.487745387722754, + "learning_rate": 2.908741563356774e-05, + "loss": 0.0816, + "step": 1682 + }, + { + "epoch": 0.37, + "grad_norm": 0.5300960630503472, + "learning_rate": 2.9074738649784665e-05, + "loss": 0.0796, + "step": 1683 + }, + { + "epoch": 0.37, + "grad_norm": 0.4775093397006017, + "learning_rate": 2.9062057073220723e-05, + "loss": 0.0837, + "step": 1684 + }, + { + "epoch": 0.37, + "grad_norm": 0.4841079591075937, + "learning_rate": 2.9049370910294143e-05, + "loss": 0.0792, + "step": 1685 + }, + { + "epoch": 0.37, + "grad_norm": 0.44417570809304496, + "learning_rate": 2.9036680167425476e-05, + "loss": 0.0658, + "step": 1686 + }, + { + "epoch": 0.37, + "grad_norm": 0.5095910836808137, + "learning_rate": 2.902398485103758e-05, + "loss": 0.1016, + "step": 1687 + }, + { + "epoch": 0.37, + "grad_norm": 0.4611040557981612, + "learning_rate": 2.901128496755564e-05, + "loss": 0.0745, + "step": 1688 + }, + { + "epoch": 0.37, + "grad_norm": 0.5023244234107879, + "learning_rate": 2.899858052340713e-05, + "loss": 0.077, + "step": 1689 + }, + { + "epoch": 0.37, + "grad_norm": 0.4784058581203675, + "learning_rate": 2.8985871525021857e-05, + "loss": 0.0832, + "step": 1690 + }, + { + "epoch": 0.37, + "grad_norm": 0.5384177623104506, + "learning_rate": 2.897315797883192e-05, + "loss": 0.1033, + "step": 1691 + }, + { + "epoch": 0.37, + "grad_norm": 0.4415949696551052, + "learning_rate": 2.896043989127172e-05, + "loss": 0.0798, + "step": 1692 + }, + { + "epoch": 0.37, + "grad_norm": 0.46916304510129275, + "learning_rate": 2.8947717268777968e-05, + "loss": 0.0854, + "step": 1693 + }, + { + "epoch": 0.37, + "grad_norm": 0.5489348718405892, + "learning_rate": 2.8934990117789658e-05, + "loss": 0.1041, + "step": 1694 + }, + { + "epoch": 0.37, + "grad_norm": 0.4737551187122979, + "learning_rate": 2.8922258444748074e-05, + "loss": 0.0661, + "step": 1695 + }, + { + "epoch": 0.37, + "grad_norm": 0.529193290906736, + "learning_rate": 2.8909522256096795e-05, + "loss": 0.0812, + "step": 1696 + }, + { + "epoch": 0.37, + "grad_norm": 0.47969049521141716, + "learning_rate": 2.8896781558281688e-05, + "loss": 0.0744, + "step": 1697 + }, + { + "epoch": 0.37, + "grad_norm": 0.4855578500331324, + "learning_rate": 2.88840363577509e-05, + "loss": 0.0651, + "step": 1698 + }, + { + "epoch": 0.37, + "grad_norm": 0.5289464261954826, + "learning_rate": 2.8871286660954846e-05, + "loss": 0.0764, + "step": 1699 + }, + { + "epoch": 0.37, + "grad_norm": 0.45181143211991037, + "learning_rate": 2.8858532474346232e-05, + "loss": 0.0728, + "step": 1700 + }, + { + "epoch": 0.37, + "grad_norm": 0.48186171593767424, + "learning_rate": 2.8845773804380028e-05, + "loss": 0.0715, + "step": 1701 + }, + { + "epoch": 0.37, + "grad_norm": 0.5211316176442896, + "learning_rate": 2.883301065751348e-05, + "loss": 0.1015, + "step": 1702 + }, + { + "epoch": 0.37, + "grad_norm": 0.4493606509118241, + "learning_rate": 2.882024304020609e-05, + "loss": 0.0631, + "step": 1703 + }, + { + "epoch": 0.37, + "grad_norm": 0.45422305871096313, + "learning_rate": 2.8807470958919626e-05, + "loss": 0.0674, + "step": 1704 + }, + { + "epoch": 0.37, + "grad_norm": 0.4887822899310735, + "learning_rate": 2.8794694420118117e-05, + "loss": 0.0814, + "step": 1705 + }, + { + "epoch": 0.37, + "grad_norm": 0.5160722238022565, + "learning_rate": 2.8781913430267857e-05, + "loss": 0.0777, + "step": 1706 + }, + { + "epoch": 0.37, + "grad_norm": 0.5285768573530069, + "learning_rate": 2.876912799583737e-05, + "loss": 0.0714, + "step": 1707 + }, + { + "epoch": 0.38, + "grad_norm": 0.6155821286682815, + "learning_rate": 2.8756338123297455e-05, + "loss": 0.1196, + "step": 1708 + }, + { + "epoch": 0.38, + "grad_norm": 0.48713356943010355, + "learning_rate": 2.8743543819121132e-05, + "loss": 0.0688, + "step": 1709 + }, + { + "epoch": 0.38, + "grad_norm": 0.5582549394781079, + "learning_rate": 2.8730745089783686e-05, + "loss": 0.0951, + "step": 1710 + }, + { + "epoch": 0.38, + "grad_norm": 0.5635245846493944, + "learning_rate": 2.871794194176263e-05, + "loss": 0.1159, + "step": 1711 + }, + { + "epoch": 0.38, + "grad_norm": 0.4709760578978905, + "learning_rate": 2.8705134381537718e-05, + "loss": 0.0635, + "step": 1712 + }, + { + "epoch": 0.38, + "grad_norm": 0.5130103896057514, + "learning_rate": 2.869232241559093e-05, + "loss": 0.0764, + "step": 1713 + }, + { + "epoch": 0.38, + "grad_norm": 0.4696199133462722, + "learning_rate": 2.8679506050406475e-05, + "loss": 0.0656, + "step": 1714 + }, + { + "epoch": 0.38, + "grad_norm": 0.5931310187899642, + "learning_rate": 2.8666685292470814e-05, + "loss": 0.1018, + "step": 1715 + }, + { + "epoch": 0.38, + "grad_norm": 0.5262454428214967, + "learning_rate": 2.8653860148272596e-05, + "loss": 0.0946, + "step": 1716 + }, + { + "epoch": 0.38, + "grad_norm": 0.5290788996994328, + "learning_rate": 2.8641030624302704e-05, + "loss": 0.0929, + "step": 1717 + }, + { + "epoch": 0.38, + "grad_norm": 0.4861553758570568, + "learning_rate": 2.8628196727054244e-05, + "loss": 0.0931, + "step": 1718 + }, + { + "epoch": 0.38, + "grad_norm": 0.44621232790038645, + "learning_rate": 2.8615358463022533e-05, + "loss": 0.0729, + "step": 1719 + }, + { + "epoch": 0.38, + "grad_norm": 0.4687295909253299, + "learning_rate": 2.860251583870509e-05, + "loss": 0.095, + "step": 1720 + }, + { + "epoch": 0.38, + "grad_norm": 0.41306681848024107, + "learning_rate": 2.8589668860601643e-05, + "loss": 0.0783, + "step": 1721 + }, + { + "epoch": 0.38, + "grad_norm": 0.4714748814598099, + "learning_rate": 2.857681753521413e-05, + "loss": 0.0862, + "step": 1722 + }, + { + "epoch": 0.38, + "grad_norm": 0.47383239316472453, + "learning_rate": 2.856396186904669e-05, + "loss": 0.0586, + "step": 1723 + }, + { + "epoch": 0.38, + "grad_norm": 0.49578643651846827, + "learning_rate": 2.8551101868605644e-05, + "loss": 0.081, + "step": 1724 + }, + { + "epoch": 0.38, + "grad_norm": 0.5084300467725035, + "learning_rate": 2.8538237540399528e-05, + "loss": 0.0899, + "step": 1725 + }, + { + "epoch": 0.38, + "grad_norm": 0.5145259762148203, + "learning_rate": 2.8525368890939055e-05, + "loss": 0.1061, + "step": 1726 + }, + { + "epoch": 0.38, + "grad_norm": 0.4566929861417621, + "learning_rate": 2.851249592673712e-05, + "loss": 0.0661, + "step": 1727 + }, + { + "epoch": 0.38, + "grad_norm": 0.5720207810244956, + "learning_rate": 2.8499618654308815e-05, + "loss": 0.1057, + "step": 1728 + }, + { + "epoch": 0.38, + "grad_norm": 0.5089245284548529, + "learning_rate": 2.8486737080171405e-05, + "loss": 0.0875, + "step": 1729 + }, + { + "epoch": 0.38, + "grad_norm": 0.4464601818882488, + "learning_rate": 2.847385121084434e-05, + "loss": 0.059, + "step": 1730 + }, + { + "epoch": 0.38, + "grad_norm": 0.47582491093389556, + "learning_rate": 2.8460961052849222e-05, + "loss": 0.0879, + "step": 1731 + }, + { + "epoch": 0.38, + "grad_norm": 0.426657448845374, + "learning_rate": 2.8448066612709854e-05, + "loss": 0.0684, + "step": 1732 + }, + { + "epoch": 0.38, + "grad_norm": 0.44690866676969737, + "learning_rate": 2.843516789695219e-05, + "loss": 0.0584, + "step": 1733 + }, + { + "epoch": 0.38, + "grad_norm": 0.5096894422876526, + "learning_rate": 2.842226491210434e-05, + "loss": 0.0747, + "step": 1734 + }, + { + "epoch": 0.38, + "grad_norm": 0.4899342442689138, + "learning_rate": 2.8409357664696585e-05, + "loss": 0.0949, + "step": 1735 + }, + { + "epoch": 0.38, + "grad_norm": 0.5672997103580923, + "learning_rate": 2.8396446161261372e-05, + "loss": 0.0891, + "step": 1736 + }, + { + "epoch": 0.38, + "grad_norm": 0.5439526542017444, + "learning_rate": 2.8383530408333285e-05, + "loss": 0.0949, + "step": 1737 + }, + { + "epoch": 0.38, + "grad_norm": 0.48880725756917676, + "learning_rate": 2.8370610412449066e-05, + "loss": 0.0777, + "step": 1738 + }, + { + "epoch": 0.38, + "grad_norm": 0.46023548137096015, + "learning_rate": 2.8357686180147604e-05, + "loss": 0.0708, + "step": 1739 + }, + { + "epoch": 0.38, + "grad_norm": 0.44760904912110805, + "learning_rate": 2.834475771796993e-05, + "loss": 0.0571, + "step": 1740 + }, + { + "epoch": 0.38, + "grad_norm": 0.5765089956129296, + "learning_rate": 2.8331825032459228e-05, + "loss": 0.1086, + "step": 1741 + }, + { + "epoch": 0.38, + "grad_norm": 0.44519501525006766, + "learning_rate": 2.8318888130160796e-05, + "loss": 0.076, + "step": 1742 + }, + { + "epoch": 0.38, + "grad_norm": 0.4456507586007182, + "learning_rate": 2.830594701762209e-05, + "loss": 0.0683, + "step": 1743 + }, + { + "epoch": 0.38, + "grad_norm": 0.4807542557401542, + "learning_rate": 2.8293001701392677e-05, + "loss": 0.0814, + "step": 1744 + }, + { + "epoch": 0.38, + "grad_norm": 0.46088174709060936, + "learning_rate": 2.828005218802427e-05, + "loss": 0.0681, + "step": 1745 + }, + { + "epoch": 0.38, + "grad_norm": 0.48053604523088983, + "learning_rate": 2.8267098484070693e-05, + "loss": 0.0894, + "step": 1746 + }, + { + "epoch": 0.38, + "grad_norm": 0.6099883440733267, + "learning_rate": 2.8254140596087897e-05, + "loss": 0.1019, + "step": 1747 + }, + { + "epoch": 0.38, + "grad_norm": 0.4979950821817546, + "learning_rate": 2.8241178530633947e-05, + "loss": 0.099, + "step": 1748 + }, + { + "epoch": 0.38, + "grad_norm": 0.5786325700981236, + "learning_rate": 2.822821229426902e-05, + "loss": 0.1113, + "step": 1749 + }, + { + "epoch": 0.38, + "grad_norm": 0.5114477182388685, + "learning_rate": 2.8215241893555415e-05, + "loss": 0.0924, + "step": 1750 + }, + { + "epoch": 0.38, + "grad_norm": 0.6553502044963802, + "learning_rate": 2.8202267335057522e-05, + "loss": 0.1022, + "step": 1751 + }, + { + "epoch": 0.38, + "grad_norm": 0.44788021856062976, + "learning_rate": 2.818928862534185e-05, + "loss": 0.0545, + "step": 1752 + }, + { + "epoch": 0.39, + "grad_norm": 0.5089788979669672, + "learning_rate": 2.817630577097701e-05, + "loss": 0.0936, + "step": 1753 + }, + { + "epoch": 0.39, + "grad_norm": 0.5125735183646032, + "learning_rate": 2.8163318778533692e-05, + "loss": 0.0874, + "step": 1754 + }, + { + "epoch": 0.39, + "grad_norm": 0.47712190033503193, + "learning_rate": 2.81503276545847e-05, + "loss": 0.0893, + "step": 1755 + }, + { + "epoch": 0.39, + "grad_norm": 0.4760484424362048, + "learning_rate": 2.8137332405704922e-05, + "loss": 0.0736, + "step": 1756 + }, + { + "epoch": 0.39, + "grad_norm": 0.4860070942294226, + "learning_rate": 2.812433303847133e-05, + "loss": 0.083, + "step": 1757 + }, + { + "epoch": 0.39, + "grad_norm": 0.4364222125430108, + "learning_rate": 2.811132955946298e-05, + "loss": 0.0692, + "step": 1758 + }, + { + "epoch": 0.39, + "grad_norm": 0.476793749000669, + "learning_rate": 2.8098321975261026e-05, + "loss": 0.071, + "step": 1759 + }, + { + "epoch": 0.39, + "grad_norm": 0.4861955576919296, + "learning_rate": 2.8085310292448666e-05, + "loss": 0.0799, + "step": 1760 + }, + { + "epoch": 0.39, + "grad_norm": 0.541403703559733, + "learning_rate": 2.8072294517611208e-05, + "loss": 0.094, + "step": 1761 + }, + { + "epoch": 0.39, + "grad_norm": 0.4389854315999451, + "learning_rate": 2.805927465733601e-05, + "loss": 0.0738, + "step": 1762 + }, + { + "epoch": 0.39, + "grad_norm": 0.4030064060918374, + "learning_rate": 2.8046250718212507e-05, + "loss": 0.0583, + "step": 1763 + }, + { + "epoch": 0.39, + "grad_norm": 0.4475027397007963, + "learning_rate": 2.8033222706832187e-05, + "loss": 0.0786, + "step": 1764 + }, + { + "epoch": 0.39, + "grad_norm": 0.5462551418734595, + "learning_rate": 2.802019062978861e-05, + "loss": 0.0936, + "step": 1765 + }, + { + "epoch": 0.39, + "grad_norm": 0.45795786896413304, + "learning_rate": 2.80071544936774e-05, + "loss": 0.0723, + "step": 1766 + }, + { + "epoch": 0.39, + "grad_norm": 0.4787774284563339, + "learning_rate": 2.7994114305096208e-05, + "loss": 0.0738, + "step": 1767 + }, + { + "epoch": 0.39, + "grad_norm": 0.47482042368927674, + "learning_rate": 2.7981070070644764e-05, + "loss": 0.0885, + "step": 1768 + }, + { + "epoch": 0.39, + "grad_norm": 0.512290693201746, + "learning_rate": 2.7968021796924834e-05, + "loss": 0.1083, + "step": 1769 + }, + { + "epoch": 0.39, + "grad_norm": 0.5731261621710955, + "learning_rate": 2.7954969490540223e-05, + "loss": 0.0942, + "step": 1770 + }, + { + "epoch": 0.39, + "grad_norm": 0.4821992429933812, + "learning_rate": 2.7941913158096792e-05, + "loss": 0.0758, + "step": 1771 + }, + { + "epoch": 0.39, + "grad_norm": 0.44004913602999246, + "learning_rate": 2.7928852806202424e-05, + "loss": 0.0578, + "step": 1772 + }, + { + "epoch": 0.39, + "grad_norm": 0.4511290690935369, + "learning_rate": 2.7915788441467052e-05, + "loss": 0.0722, + "step": 1773 + }, + { + "epoch": 0.39, + "grad_norm": 0.5211129169284835, + "learning_rate": 2.790272007050262e-05, + "loss": 0.0794, + "step": 1774 + }, + { + "epoch": 0.39, + "grad_norm": 0.4790374868523092, + "learning_rate": 2.7889647699923114e-05, + "loss": 0.0806, + "step": 1775 + }, + { + "epoch": 0.39, + "grad_norm": 0.4536289832412529, + "learning_rate": 2.7876571336344546e-05, + "loss": 0.0626, + "step": 1776 + }, + { + "epoch": 0.39, + "grad_norm": 0.4069042439652636, + "learning_rate": 2.7863490986384945e-05, + "loss": 0.0739, + "step": 1777 + }, + { + "epoch": 0.39, + "grad_norm": 0.4723581682503797, + "learning_rate": 2.7850406656664346e-05, + "loss": 0.0899, + "step": 1778 + }, + { + "epoch": 0.39, + "grad_norm": 0.5206130388403967, + "learning_rate": 2.783731835380482e-05, + "loss": 0.0877, + "step": 1779 + }, + { + "epoch": 0.39, + "grad_norm": 0.4326390418050192, + "learning_rate": 2.782422608443043e-05, + "loss": 0.0569, + "step": 1780 + }, + { + "epoch": 0.39, + "grad_norm": 0.4592044870680303, + "learning_rate": 2.781112985516725e-05, + "loss": 0.0694, + "step": 1781 + }, + { + "epoch": 0.39, + "grad_norm": 0.5556496080285102, + "learning_rate": 2.7798029672643375e-05, + "loss": 0.0923, + "step": 1782 + }, + { + "epoch": 0.39, + "grad_norm": 0.5149313856452441, + "learning_rate": 2.778492554348887e-05, + "loss": 0.0843, + "step": 1783 + }, + { + "epoch": 0.39, + "grad_norm": 0.5557231100718248, + "learning_rate": 2.7771817474335835e-05, + "loss": 0.09, + "step": 1784 + }, + { + "epoch": 0.39, + "grad_norm": 0.4594406943803711, + "learning_rate": 2.7758705471818327e-05, + "loss": 0.0775, + "step": 1785 + }, + { + "epoch": 0.39, + "grad_norm": 0.4263839655470079, + "learning_rate": 2.7745589542572424e-05, + "loss": 0.0609, + "step": 1786 + }, + { + "epoch": 0.39, + "grad_norm": 0.43457029535529285, + "learning_rate": 2.7732469693236166e-05, + "loss": 0.0683, + "step": 1787 + }, + { + "epoch": 0.39, + "grad_norm": 0.549615261585891, + "learning_rate": 2.77193459304496e-05, + "loss": 0.0943, + "step": 1788 + }, + { + "epoch": 0.39, + "grad_norm": 0.4156646680903677, + "learning_rate": 2.7706218260854738e-05, + "loss": 0.0583, + "step": 1789 + }, + { + "epoch": 0.39, + "grad_norm": 0.4454491221674145, + "learning_rate": 2.7693086691095573e-05, + "loss": 0.0791, + "step": 1790 + }, + { + "epoch": 0.39, + "grad_norm": 0.4934414527256401, + "learning_rate": 2.767995122781807e-05, + "loss": 0.0768, + "step": 1791 + }, + { + "epoch": 0.39, + "grad_norm": 0.47652166222870884, + "learning_rate": 2.7666811877670177e-05, + "loss": 0.0723, + "step": 1792 + }, + { + "epoch": 0.39, + "grad_norm": 0.4011121621863688, + "learning_rate": 2.7653668647301797e-05, + "loss": 0.0605, + "step": 1793 + }, + { + "epoch": 0.39, + "grad_norm": 0.4426381298575097, + "learning_rate": 2.7640521543364797e-05, + "loss": 0.0705, + "step": 1794 + }, + { + "epoch": 0.39, + "grad_norm": 0.5711811713692367, + "learning_rate": 2.7627370572513005e-05, + "loss": 0.0787, + "step": 1795 + }, + { + "epoch": 0.39, + "grad_norm": 0.5513816158664855, + "learning_rate": 2.7614215741402204e-05, + "loss": 0.0921, + "step": 1796 + }, + { + "epoch": 0.39, + "grad_norm": 0.4770801109582723, + "learning_rate": 2.7601057056690148e-05, + "loss": 0.0572, + "step": 1797 + }, + { + "epoch": 0.39, + "grad_norm": 0.4465907973766518, + "learning_rate": 2.7587894525036517e-05, + "loss": 0.0676, + "step": 1798 + }, + { + "epoch": 0.4, + "grad_norm": 0.43528794602088433, + "learning_rate": 2.7574728153102956e-05, + "loss": 0.064, + "step": 1799 + }, + { + "epoch": 0.4, + "grad_norm": 0.45708921668069113, + "learning_rate": 2.7561557947553037e-05, + "loss": 0.0721, + "step": 1800 + }, + { + "epoch": 0.4, + "grad_norm": 0.49661595351522414, + "learning_rate": 2.7548383915052287e-05, + "loss": 0.0833, + "step": 1801 + }, + { + "epoch": 0.4, + "grad_norm": 0.4647576709194282, + "learning_rate": 2.7535206062268174e-05, + "loss": 0.0874, + "step": 1802 + }, + { + "epoch": 0.4, + "grad_norm": 0.4758847529637113, + "learning_rate": 2.7522024395870075e-05, + "loss": 0.073, + "step": 1803 + }, + { + "epoch": 0.4, + "grad_norm": 0.5512991289922461, + "learning_rate": 2.7508838922529316e-05, + "loss": 0.0728, + "step": 1804 + }, + { + "epoch": 0.4, + "grad_norm": 0.45794397490796895, + "learning_rate": 2.7495649648919153e-05, + "loss": 0.082, + "step": 1805 + }, + { + "epoch": 0.4, + "grad_norm": 0.5781542280902109, + "learning_rate": 2.7482456581714757e-05, + "loss": 0.1036, + "step": 1806 + }, + { + "epoch": 0.4, + "grad_norm": 0.5296405647661102, + "learning_rate": 2.7469259727593213e-05, + "loss": 0.0862, + "step": 1807 + }, + { + "epoch": 0.4, + "grad_norm": 0.43297235105373694, + "learning_rate": 2.7456059093233537e-05, + "loss": 0.0723, + "step": 1808 + }, + { + "epoch": 0.4, + "grad_norm": 0.47428964905001236, + "learning_rate": 2.7442854685316643e-05, + "loss": 0.082, + "step": 1809 + }, + { + "epoch": 0.4, + "grad_norm": 0.45791286631225797, + "learning_rate": 2.7429646510525373e-05, + "loss": 0.0568, + "step": 1810 + }, + { + "epoch": 0.4, + "grad_norm": 0.442750554782858, + "learning_rate": 2.7416434575544455e-05, + "loss": 0.0675, + "step": 1811 + }, + { + "epoch": 0.4, + "grad_norm": 0.47779904642868215, + "learning_rate": 2.7403218887060538e-05, + "loss": 0.0828, + "step": 1812 + }, + { + "epoch": 0.4, + "grad_norm": 0.4937883108703817, + "learning_rate": 2.738999945176215e-05, + "loss": 0.0599, + "step": 1813 + }, + { + "epoch": 0.4, + "grad_norm": 0.46984544546305385, + "learning_rate": 2.7376776276339745e-05, + "loss": 0.0701, + "step": 1814 + }, + { + "epoch": 0.4, + "grad_norm": 0.4381758034347087, + "learning_rate": 2.7363549367485648e-05, + "loss": 0.0646, + "step": 1815 + }, + { + "epoch": 0.4, + "grad_norm": 0.46041729230823847, + "learning_rate": 2.7350318731894075e-05, + "loss": 0.0812, + "step": 1816 + }, + { + "epoch": 0.4, + "grad_norm": 0.4205301713824055, + "learning_rate": 2.7337084376261135e-05, + "loss": 0.0705, + "step": 1817 + }, + { + "epoch": 0.4, + "grad_norm": 0.4711574720111425, + "learning_rate": 2.7323846307284814e-05, + "loss": 0.0731, + "step": 1818 + }, + { + "epoch": 0.4, + "grad_norm": 0.46499916491894955, + "learning_rate": 2.7310604531664983e-05, + "loss": 0.0679, + "step": 1819 + }, + { + "epoch": 0.4, + "grad_norm": 0.4927366614179062, + "learning_rate": 2.7297359056103378e-05, + "loss": 0.085, + "step": 1820 + }, + { + "epoch": 0.4, + "grad_norm": 0.45650688477575435, + "learning_rate": 2.7284109887303628e-05, + "loss": 0.071, + "step": 1821 + }, + { + "epoch": 0.4, + "grad_norm": 0.5742329446504342, + "learning_rate": 2.7270857031971203e-05, + "loss": 0.1062, + "step": 1822 + }, + { + "epoch": 0.4, + "grad_norm": 0.531793288511195, + "learning_rate": 2.7257600496813475e-05, + "loss": 0.0912, + "step": 1823 + }, + { + "epoch": 0.4, + "grad_norm": 0.502106461315447, + "learning_rate": 2.7244340288539638e-05, + "loss": 0.0726, + "step": 1824 + }, + { + "epoch": 0.4, + "grad_norm": 0.5183084194087579, + "learning_rate": 2.7231076413860774e-05, + "loss": 0.0635, + "step": 1825 + }, + { + "epoch": 0.4, + "grad_norm": 0.45798770559459295, + "learning_rate": 2.721780887948981e-05, + "loss": 0.0539, + "step": 1826 + }, + { + "epoch": 0.4, + "grad_norm": 0.4696202623789847, + "learning_rate": 2.7204537692141526e-05, + "loss": 0.0771, + "step": 1827 + }, + { + "epoch": 0.4, + "grad_norm": 0.43115573073857216, + "learning_rate": 2.7191262858532552e-05, + "loss": 0.0682, + "step": 1828 + }, + { + "epoch": 0.4, + "grad_norm": 0.49340640619357506, + "learning_rate": 2.7177984385381366e-05, + "loss": 0.0853, + "step": 1829 + }, + { + "epoch": 0.4, + "grad_norm": 0.47025112752865444, + "learning_rate": 2.7164702279408275e-05, + "loss": 0.0615, + "step": 1830 + }, + { + "epoch": 0.4, + "grad_norm": 0.4015113404633209, + "learning_rate": 2.715141654733544e-05, + "loss": 0.0517, + "step": 1831 + }, + { + "epoch": 0.4, + "grad_norm": 0.5527488248304472, + "learning_rate": 2.7138127195886856e-05, + "loss": 0.0959, + "step": 1832 + }, + { + "epoch": 0.4, + "grad_norm": 0.4134978456925406, + "learning_rate": 2.712483423178834e-05, + "loss": 0.0639, + "step": 1833 + }, + { + "epoch": 0.4, + "grad_norm": 0.38267705428554666, + "learning_rate": 2.7111537661767537e-05, + "loss": 0.0503, + "step": 1834 + }, + { + "epoch": 0.4, + "grad_norm": 0.502958604296117, + "learning_rate": 2.7098237492553937e-05, + "loss": 0.0683, + "step": 1835 + }, + { + "epoch": 0.4, + "grad_norm": 0.4738071710618732, + "learning_rate": 2.7084933730878824e-05, + "loss": 0.0812, + "step": 1836 + }, + { + "epoch": 0.4, + "grad_norm": 0.5045485432184268, + "learning_rate": 2.7071626383475327e-05, + "loss": 0.0827, + "step": 1837 + }, + { + "epoch": 0.4, + "grad_norm": 0.4528071340923516, + "learning_rate": 2.7058315457078358e-05, + "loss": 0.0705, + "step": 1838 + }, + { + "epoch": 0.4, + "grad_norm": 0.4197753118195681, + "learning_rate": 2.7045000958424674e-05, + "loss": 0.0679, + "step": 1839 + }, + { + "epoch": 0.4, + "grad_norm": 0.5344421188410078, + "learning_rate": 2.7031682894252816e-05, + "loss": 0.0901, + "step": 1840 + }, + { + "epoch": 0.4, + "grad_norm": 0.48017108039036727, + "learning_rate": 2.701836127130314e-05, + "loss": 0.0642, + "step": 1841 + }, + { + "epoch": 0.4, + "grad_norm": 0.4774242801446497, + "learning_rate": 2.7005036096317802e-05, + "loss": 0.0709, + "step": 1842 + }, + { + "epoch": 0.4, + "grad_norm": 0.4355363374128159, + "learning_rate": 2.6991707376040755e-05, + "loss": 0.0642, + "step": 1843 + }, + { + "epoch": 0.41, + "grad_norm": 0.5430117802755907, + "learning_rate": 2.6978375117217743e-05, + "loss": 0.0812, + "step": 1844 + }, + { + "epoch": 0.41, + "grad_norm": 0.44822825823647566, + "learning_rate": 2.696503932659631e-05, + "loss": 0.0683, + "step": 1845 + }, + { + "epoch": 0.41, + "grad_norm": 0.44747005604877643, + "learning_rate": 2.6951700010925774e-05, + "loss": 0.0627, + "step": 1846 + }, + { + "epoch": 0.41, + "grad_norm": 0.6233900076002594, + "learning_rate": 2.6938357176957243e-05, + "loss": 0.0998, + "step": 1847 + }, + { + "epoch": 0.41, + "grad_norm": 0.5089348933656639, + "learning_rate": 2.692501083144362e-05, + "loss": 0.0581, + "step": 1848 + }, + { + "epoch": 0.41, + "grad_norm": 0.4220702284742675, + "learning_rate": 2.6911660981139563e-05, + "loss": 0.0589, + "step": 1849 + }, + { + "epoch": 0.41, + "grad_norm": 0.4690353796410688, + "learning_rate": 2.6898307632801515e-05, + "loss": 0.0624, + "step": 1850 + }, + { + "epoch": 0.41, + "grad_norm": 0.4031684164575929, + "learning_rate": 2.6884950793187684e-05, + "loss": 0.0523, + "step": 1851 + }, + { + "epoch": 0.41, + "grad_norm": 0.4621623301535883, + "learning_rate": 2.6871590469058052e-05, + "loss": 0.0745, + "step": 1852 + }, + { + "epoch": 0.41, + "grad_norm": 0.46212234801320057, + "learning_rate": 2.6858226667174362e-05, + "loss": 0.0803, + "step": 1853 + }, + { + "epoch": 0.41, + "grad_norm": 0.4780853049312022, + "learning_rate": 2.684485939430011e-05, + "loss": 0.0621, + "step": 1854 + }, + { + "epoch": 0.41, + "grad_norm": 0.5359598516579986, + "learning_rate": 2.683148865720056e-05, + "loss": 0.1006, + "step": 1855 + }, + { + "epoch": 0.41, + "grad_norm": 0.3983820708371192, + "learning_rate": 2.6818114462642726e-05, + "loss": 0.0472, + "step": 1856 + }, + { + "epoch": 0.41, + "grad_norm": 0.4445445650382663, + "learning_rate": 2.6804736817395362e-05, + "loss": 0.0584, + "step": 1857 + }, + { + "epoch": 0.41, + "grad_norm": 0.5937913830038436, + "learning_rate": 2.6791355728228986e-05, + "loss": 0.1188, + "step": 1858 + }, + { + "epoch": 0.41, + "grad_norm": 0.49584486117172055, + "learning_rate": 2.6777971201915843e-05, + "loss": 0.0693, + "step": 1859 + }, + { + "epoch": 0.41, + "grad_norm": 0.5385538271988911, + "learning_rate": 2.676458324522992e-05, + "loss": 0.0734, + "step": 1860 + }, + { + "epoch": 0.41, + "grad_norm": 0.5635618148652323, + "learning_rate": 2.675119186494696e-05, + "loss": 0.0983, + "step": 1861 + }, + { + "epoch": 0.41, + "grad_norm": 0.48687003097642834, + "learning_rate": 2.6737797067844403e-05, + "loss": 0.068, + "step": 1862 + }, + { + "epoch": 0.41, + "grad_norm": 0.5590274434322308, + "learning_rate": 2.6724398860701453e-05, + "loss": 0.0995, + "step": 1863 + }, + { + "epoch": 0.41, + "grad_norm": 0.4434149114972751, + "learning_rate": 2.6710997250299012e-05, + "loss": 0.0634, + "step": 1864 + }, + { + "epoch": 0.41, + "grad_norm": 0.41477458829925434, + "learning_rate": 2.6697592243419723e-05, + "loss": 0.0588, + "step": 1865 + }, + { + "epoch": 0.41, + "grad_norm": 0.41729900141627835, + "learning_rate": 2.668418384684795e-05, + "loss": 0.0632, + "step": 1866 + }, + { + "epoch": 0.41, + "grad_norm": 0.4770965866541249, + "learning_rate": 2.6670772067369754e-05, + "loss": 0.0783, + "step": 1867 + }, + { + "epoch": 0.41, + "grad_norm": 0.4641929366087564, + "learning_rate": 2.6657356911772922e-05, + "loss": 0.0657, + "step": 1868 + }, + { + "epoch": 0.41, + "grad_norm": 0.38559123115511384, + "learning_rate": 2.6643938386846945e-05, + "loss": 0.0578, + "step": 1869 + }, + { + "epoch": 0.41, + "grad_norm": 0.47675891643241647, + "learning_rate": 2.663051649938303e-05, + "loss": 0.0861, + "step": 1870 + }, + { + "epoch": 0.41, + "grad_norm": 0.4886254585109774, + "learning_rate": 2.6617091256174058e-05, + "loss": 0.0805, + "step": 1871 + }, + { + "epoch": 0.41, + "grad_norm": 0.45498374647935974, + "learning_rate": 2.6603662664014644e-05, + "loss": 0.08, + "step": 1872 + }, + { + "epoch": 0.41, + "grad_norm": 0.4548434360441476, + "learning_rate": 2.659023072970107e-05, + "loss": 0.0832, + "step": 1873 + }, + { + "epoch": 0.41, + "grad_norm": 0.4640783853091864, + "learning_rate": 2.657679546003133e-05, + "loss": 0.0824, + "step": 1874 + }, + { + "epoch": 0.41, + "grad_norm": 0.44275094182535174, + "learning_rate": 2.656335686180509e-05, + "loss": 0.0513, + "step": 1875 + }, + { + "epoch": 0.41, + "grad_norm": 0.4445752851132524, + "learning_rate": 2.6549914941823713e-05, + "loss": 0.0763, + "step": 1876 + }, + { + "epoch": 0.41, + "grad_norm": 0.4123212080066735, + "learning_rate": 2.6536469706890226e-05, + "loss": 0.084, + "step": 1877 + }, + { + "epoch": 0.41, + "grad_norm": 0.5086670654238233, + "learning_rate": 2.652302116380935e-05, + "loss": 0.087, + "step": 1878 + }, + { + "epoch": 0.41, + "grad_norm": 0.48448247640076997, + "learning_rate": 2.6509569319387477e-05, + "loss": 0.0784, + "step": 1879 + }, + { + "epoch": 0.41, + "grad_norm": 0.4387107172929702, + "learning_rate": 2.6496114180432672e-05, + "loss": 0.062, + "step": 1880 + }, + { + "epoch": 0.41, + "grad_norm": 0.4821219496508101, + "learning_rate": 2.6482655753754657e-05, + "loss": 0.0841, + "step": 1881 + }, + { + "epoch": 0.41, + "grad_norm": 0.5209558215749556, + "learning_rate": 2.6469194046164818e-05, + "loss": 0.0942, + "step": 1882 + }, + { + "epoch": 0.41, + "grad_norm": 0.40424438725563716, + "learning_rate": 2.6455729064476227e-05, + "loss": 0.0701, + "step": 1883 + }, + { + "epoch": 0.41, + "grad_norm": 0.4744976367419273, + "learning_rate": 2.6442260815503575e-05, + "loss": 0.0695, + "step": 1884 + }, + { + "epoch": 0.41, + "grad_norm": 0.45511068792452036, + "learning_rate": 2.6428789306063233e-05, + "loss": 0.0689, + "step": 1885 + }, + { + "epoch": 0.41, + "grad_norm": 0.39648076699405155, + "learning_rate": 2.6415314542973214e-05, + "loss": 0.0569, + "step": 1886 + }, + { + "epoch": 0.41, + "grad_norm": 0.39929021953304733, + "learning_rate": 2.6401836533053186e-05, + "loss": 0.0633, + "step": 1887 + }, + { + "epoch": 0.41, + "grad_norm": 0.5730543722482679, + "learning_rate": 2.6388355283124435e-05, + "loss": 0.0875, + "step": 1888 + }, + { + "epoch": 0.41, + "grad_norm": 0.47096256371662787, + "learning_rate": 2.637487080000992e-05, + "loss": 0.0705, + "step": 1889 + }, + { + "epoch": 0.42, + "grad_norm": 0.4661904390045287, + "learning_rate": 2.636138309053421e-05, + "loss": 0.0533, + "step": 1890 + }, + { + "epoch": 0.42, + "grad_norm": 0.503316933883678, + "learning_rate": 2.634789216152353e-05, + "loss": 0.07, + "step": 1891 + }, + { + "epoch": 0.42, + "grad_norm": 0.5061427615048223, + "learning_rate": 2.63343980198057e-05, + "loss": 0.0905, + "step": 1892 + }, + { + "epoch": 0.42, + "grad_norm": 0.4891768824481601, + "learning_rate": 2.6320900672210216e-05, + "loss": 0.0663, + "step": 1893 + }, + { + "epoch": 0.42, + "grad_norm": 0.46323637785237165, + "learning_rate": 2.6307400125568147e-05, + "loss": 0.0685, + "step": 1894 + }, + { + "epoch": 0.42, + "grad_norm": 0.5383597506348967, + "learning_rate": 2.629389638671221e-05, + "loss": 0.0838, + "step": 1895 + }, + { + "epoch": 0.42, + "grad_norm": 0.47678387299932706, + "learning_rate": 2.6280389462476733e-05, + "loss": 0.0821, + "step": 1896 + }, + { + "epoch": 0.42, + "grad_norm": 0.46968283023403673, + "learning_rate": 2.6266879359697647e-05, + "loss": 0.0678, + "step": 1897 + }, + { + "epoch": 0.42, + "grad_norm": 0.4568098298074498, + "learning_rate": 2.6253366085212503e-05, + "loss": 0.075, + "step": 1898 + }, + { + "epoch": 0.42, + "grad_norm": 0.4514430333537384, + "learning_rate": 2.6239849645860447e-05, + "loss": 0.0744, + "step": 1899 + }, + { + "epoch": 0.42, + "grad_norm": 0.44452307815951564, + "learning_rate": 2.6226330048482233e-05, + "loss": 0.0718, + "step": 1900 + }, + { + "epoch": 0.42, + "grad_norm": 0.4951100245452337, + "learning_rate": 2.6212807299920218e-05, + "loss": 0.0713, + "step": 1901 + }, + { + "epoch": 0.42, + "grad_norm": 0.4672619725199249, + "learning_rate": 2.6199281407018338e-05, + "loss": 0.0758, + "step": 1902 + }, + { + "epoch": 0.42, + "grad_norm": 0.35243519444327964, + "learning_rate": 2.618575237662214e-05, + "loss": 0.0512, + "step": 1903 + }, + { + "epoch": 0.42, + "grad_norm": 0.40736542307882356, + "learning_rate": 2.6172220215578743e-05, + "loss": 0.0544, + "step": 1904 + }, + { + "epoch": 0.42, + "grad_norm": 0.4704904194392552, + "learning_rate": 2.615868493073686e-05, + "loss": 0.0806, + "step": 1905 + }, + { + "epoch": 0.42, + "grad_norm": 0.36351862709614324, + "learning_rate": 2.614514652894678e-05, + "loss": 0.0582, + "step": 1906 + }, + { + "epoch": 0.42, + "grad_norm": 0.4428274181493983, + "learning_rate": 2.613160501706037e-05, + "loss": 0.062, + "step": 1907 + }, + { + "epoch": 0.42, + "grad_norm": 0.37773973289563717, + "learning_rate": 2.6118060401931073e-05, + "loss": 0.057, + "step": 1908 + }, + { + "epoch": 0.42, + "grad_norm": 0.4885177039781334, + "learning_rate": 2.6104512690413906e-05, + "loss": 0.0975, + "step": 1909 + }, + { + "epoch": 0.42, + "grad_norm": 0.49160308049558293, + "learning_rate": 2.609096188936544e-05, + "loss": 0.0669, + "step": 1910 + }, + { + "epoch": 0.42, + "grad_norm": 0.4757599091402448, + "learning_rate": 2.607740800564383e-05, + "loss": 0.0663, + "step": 1911 + }, + { + "epoch": 0.42, + "grad_norm": 0.4879069304967567, + "learning_rate": 2.6063851046108766e-05, + "loss": 0.0749, + "step": 1912 + }, + { + "epoch": 0.42, + "grad_norm": 0.6329272601773189, + "learning_rate": 2.605029101762152e-05, + "loss": 0.1222, + "step": 1913 + }, + { + "epoch": 0.42, + "grad_norm": 0.4395667713293501, + "learning_rate": 2.6036727927044897e-05, + "loss": 0.0693, + "step": 1914 + }, + { + "epoch": 0.42, + "grad_norm": 0.42974012226050445, + "learning_rate": 2.602316178124327e-05, + "loss": 0.0624, + "step": 1915 + }, + { + "epoch": 0.42, + "grad_norm": 0.5026687685967235, + "learning_rate": 2.6009592587082538e-05, + "loss": 0.0715, + "step": 1916 + }, + { + "epoch": 0.42, + "grad_norm": 0.4787859406190129, + "learning_rate": 2.5996020351430163e-05, + "loss": 0.0772, + "step": 1917 + }, + { + "epoch": 0.42, + "grad_norm": 0.49640720707144015, + "learning_rate": 2.598244508115513e-05, + "loss": 0.0869, + "step": 1918 + }, + { + "epoch": 0.42, + "grad_norm": 0.44704691774444677, + "learning_rate": 2.596886678312797e-05, + "loss": 0.0717, + "step": 1919 + }, + { + "epoch": 0.42, + "grad_norm": 0.566518259483125, + "learning_rate": 2.5955285464220738e-05, + "loss": 0.1036, + "step": 1920 + }, + { + "epoch": 0.42, + "grad_norm": 0.49886906395355324, + "learning_rate": 2.594170113130703e-05, + "loss": 0.0717, + "step": 1921 + }, + { + "epoch": 0.42, + "grad_norm": 0.3631885369239595, + "learning_rate": 2.5928113791261952e-05, + "loss": 0.0446, + "step": 1922 + }, + { + "epoch": 0.42, + "grad_norm": 0.42634171162088436, + "learning_rate": 2.5914523450962147e-05, + "loss": 0.0643, + "step": 1923 + }, + { + "epoch": 0.42, + "grad_norm": 0.49661806889154125, + "learning_rate": 2.590093011728577e-05, + "loss": 0.0865, + "step": 1924 + }, + { + "epoch": 0.42, + "grad_norm": 0.37300980468627076, + "learning_rate": 2.588733379711248e-05, + "loss": 0.0465, + "step": 1925 + }, + { + "epoch": 0.42, + "grad_norm": 0.41554669022488777, + "learning_rate": 2.587373449732347e-05, + "loss": 0.051, + "step": 1926 + }, + { + "epoch": 0.42, + "grad_norm": 0.45649580278378604, + "learning_rate": 2.5860132224801424e-05, + "loss": 0.0712, + "step": 1927 + }, + { + "epoch": 0.42, + "grad_norm": 0.40900949293536104, + "learning_rate": 2.584652698643054e-05, + "loss": 0.0591, + "step": 1928 + }, + { + "epoch": 0.42, + "grad_norm": 0.3939289367631103, + "learning_rate": 2.58329187890965e-05, + "loss": 0.0472, + "step": 1929 + }, + { + "epoch": 0.42, + "grad_norm": 0.46501062106749136, + "learning_rate": 2.581930763968651e-05, + "loss": 0.0741, + "step": 1930 + }, + { + "epoch": 0.42, + "grad_norm": 0.4760442792819372, + "learning_rate": 2.580569354508925e-05, + "loss": 0.0687, + "step": 1931 + }, + { + "epoch": 0.42, + "grad_norm": 0.4572477561474321, + "learning_rate": 2.5792076512194895e-05, + "loss": 0.075, + "step": 1932 + }, + { + "epoch": 0.42, + "grad_norm": 0.5661992714329125, + "learning_rate": 2.5778456547895117e-05, + "loss": 0.0905, + "step": 1933 + }, + { + "epoch": 0.42, + "grad_norm": 0.4468749226389998, + "learning_rate": 2.5764833659083053e-05, + "loss": 0.0656, + "step": 1934 + }, + { + "epoch": 0.42, + "grad_norm": 0.5459863800019874, + "learning_rate": 2.5751207852653334e-05, + "loss": 0.1169, + "step": 1935 + }, + { + "epoch": 0.43, + "grad_norm": 0.4620661092127289, + "learning_rate": 2.5737579135502068e-05, + "loss": 0.0691, + "step": 1936 + }, + { + "epoch": 0.43, + "grad_norm": 0.47962413766985756, + "learning_rate": 2.572394751452683e-05, + "loss": 0.0738, + "step": 1937 + }, + { + "epoch": 0.43, + "grad_norm": 0.4390047116787088, + "learning_rate": 2.5710312996626667e-05, + "loss": 0.0751, + "step": 1938 + }, + { + "epoch": 0.43, + "grad_norm": 0.4369903218236427, + "learning_rate": 2.569667558870209e-05, + "loss": 0.0652, + "step": 1939 + }, + { + "epoch": 0.43, + "grad_norm": 0.45743852384713024, + "learning_rate": 2.5683035297655076e-05, + "loss": 0.075, + "step": 1940 + }, + { + "epoch": 0.43, + "grad_norm": 0.4280689098342205, + "learning_rate": 2.566939213038906e-05, + "loss": 0.0711, + "step": 1941 + }, + { + "epoch": 0.43, + "grad_norm": 0.46608190968404795, + "learning_rate": 2.5655746093808934e-05, + "loss": 0.0796, + "step": 1942 + }, + { + "epoch": 0.43, + "grad_norm": 0.440689855902259, + "learning_rate": 2.564209719482104e-05, + "loss": 0.0509, + "step": 1943 + }, + { + "epoch": 0.43, + "grad_norm": 0.47352029309397264, + "learning_rate": 2.5628445440333164e-05, + "loss": 0.0917, + "step": 1944 + }, + { + "epoch": 0.43, + "grad_norm": 0.44388221799202127, + "learning_rate": 2.5614790837254555e-05, + "loss": 0.0565, + "step": 1945 + }, + { + "epoch": 0.43, + "grad_norm": 0.4410939757738066, + "learning_rate": 2.5601133392495886e-05, + "loss": 0.0636, + "step": 1946 + }, + { + "epoch": 0.43, + "grad_norm": 0.4855035279152683, + "learning_rate": 2.558747311296926e-05, + "loss": 0.0593, + "step": 1947 + }, + { + "epoch": 0.43, + "grad_norm": 0.4578283320218462, + "learning_rate": 2.5573810005588245e-05, + "loss": 0.0786, + "step": 1948 + }, + { + "epoch": 0.43, + "grad_norm": 0.49591954995008763, + "learning_rate": 2.5560144077267826e-05, + "loss": 0.0731, + "step": 1949 + }, + { + "epoch": 0.43, + "grad_norm": 0.4308347471916173, + "learning_rate": 2.5546475334924398e-05, + "loss": 0.0584, + "step": 1950 + }, + { + "epoch": 0.43, + "grad_norm": 0.4204925562690745, + "learning_rate": 2.5532803785475802e-05, + "loss": 0.0656, + "step": 1951 + }, + { + "epoch": 0.43, + "grad_norm": 0.49925856453889894, + "learning_rate": 2.5519129435841298e-05, + "loss": 0.077, + "step": 1952 + }, + { + "epoch": 0.43, + "grad_norm": 0.4948463978529427, + "learning_rate": 2.550545229294155e-05, + "loss": 0.0663, + "step": 1953 + }, + { + "epoch": 0.43, + "grad_norm": 0.4999508982628857, + "learning_rate": 2.549177236369865e-05, + "loss": 0.0838, + "step": 1954 + }, + { + "epoch": 0.43, + "grad_norm": 0.45857363239203563, + "learning_rate": 2.5478089655036086e-05, + "loss": 0.061, + "step": 1955 + }, + { + "epoch": 0.43, + "grad_norm": 0.35649923674406764, + "learning_rate": 2.5464404173878775e-05, + "loss": 0.0626, + "step": 1956 + }, + { + "epoch": 0.43, + "grad_norm": 0.4048285926917149, + "learning_rate": 2.5450715927153012e-05, + "loss": 0.0536, + "step": 1957 + }, + { + "epoch": 0.43, + "grad_norm": 0.38869547173262164, + "learning_rate": 2.54370249217865e-05, + "loss": 0.0501, + "step": 1958 + }, + { + "epoch": 0.43, + "grad_norm": 0.5734012519543342, + "learning_rate": 2.542333116470835e-05, + "loss": 0.0768, + "step": 1959 + }, + { + "epoch": 0.43, + "grad_norm": 0.4480250545333302, + "learning_rate": 2.5409634662849053e-05, + "loss": 0.0644, + "step": 1960 + }, + { + "epoch": 0.43, + "grad_norm": 0.47188119947390483, + "learning_rate": 2.5395935423140487e-05, + "loss": 0.0771, + "step": 1961 + }, + { + "epoch": 0.43, + "grad_norm": 0.4498040408396079, + "learning_rate": 2.5382233452515927e-05, + "loss": 0.0795, + "step": 1962 + }, + { + "epoch": 0.43, + "grad_norm": 0.5292149441641056, + "learning_rate": 2.5368528757910027e-05, + "loss": 0.0749, + "step": 1963 + }, + { + "epoch": 0.43, + "grad_norm": 0.44870011618221406, + "learning_rate": 2.5354821346258813e-05, + "loss": 0.0622, + "step": 1964 + }, + { + "epoch": 0.43, + "grad_norm": 0.4956262894189136, + "learning_rate": 2.534111122449969e-05, + "loss": 0.0914, + "step": 1965 + }, + { + "epoch": 0.43, + "grad_norm": 0.5129120286948241, + "learning_rate": 2.532739839957143e-05, + "loss": 0.0824, + "step": 1966 + }, + { + "epoch": 0.43, + "grad_norm": 0.4525574549928641, + "learning_rate": 2.5313682878414185e-05, + "loss": 0.0588, + "step": 1967 + }, + { + "epoch": 0.43, + "grad_norm": 0.49355155904583753, + "learning_rate": 2.529996466796946e-05, + "loss": 0.0858, + "step": 1968 + }, + { + "epoch": 0.43, + "grad_norm": 0.47739849869196777, + "learning_rate": 2.5286243775180128e-05, + "loss": 0.0738, + "step": 1969 + }, + { + "epoch": 0.43, + "grad_norm": 0.3867738087309617, + "learning_rate": 2.5272520206990418e-05, + "loss": 0.0636, + "step": 1970 + }, + { + "epoch": 0.43, + "grad_norm": 0.45189343176114144, + "learning_rate": 2.5258793970345908e-05, + "loss": 0.0667, + "step": 1971 + }, + { + "epoch": 0.43, + "grad_norm": 0.45433940606363016, + "learning_rate": 2.5245065072193534e-05, + "loss": 0.069, + "step": 1972 + }, + { + "epoch": 0.43, + "grad_norm": 0.519966287528791, + "learning_rate": 2.5231333519481577e-05, + "loss": 0.0744, + "step": 1973 + }, + { + "epoch": 0.43, + "grad_norm": 0.4226544385437567, + "learning_rate": 2.5217599319159654e-05, + "loss": 0.0551, + "step": 1974 + }, + { + "epoch": 0.43, + "grad_norm": 0.3707658825119907, + "learning_rate": 2.5203862478178732e-05, + "loss": 0.0505, + "step": 1975 + }, + { + "epoch": 0.43, + "grad_norm": 0.5165477933826751, + "learning_rate": 2.519012300349111e-05, + "loss": 0.0848, + "step": 1976 + }, + { + "epoch": 0.43, + "grad_norm": 0.4305647132097901, + "learning_rate": 2.5176380902050418e-05, + "loss": 0.0624, + "step": 1977 + }, + { + "epoch": 0.43, + "grad_norm": 0.4972505673558224, + "learning_rate": 2.516263618081162e-05, + "loss": 0.0629, + "step": 1978 + }, + { + "epoch": 0.43, + "grad_norm": 0.46207126902088547, + "learning_rate": 2.5148888846731007e-05, + "loss": 0.0777, + "step": 1979 + }, + { + "epoch": 0.43, + "grad_norm": 0.44448269945982694, + "learning_rate": 2.5135138906766185e-05, + "loss": 0.0675, + "step": 1980 + }, + { + "epoch": 0.44, + "grad_norm": 0.4387582668384212, + "learning_rate": 2.512138636787608e-05, + "loss": 0.0724, + "step": 1981 + }, + { + "epoch": 0.44, + "grad_norm": 0.48427690004796514, + "learning_rate": 2.510763123702094e-05, + "loss": 0.0745, + "step": 1982 + }, + { + "epoch": 0.44, + "grad_norm": 0.401680074912499, + "learning_rate": 2.5093873521162323e-05, + "loss": 0.05, + "step": 1983 + }, + { + "epoch": 0.44, + "grad_norm": 0.48928307565120804, + "learning_rate": 2.5080113227263093e-05, + "loss": 0.0863, + "step": 1984 + }, + { + "epoch": 0.44, + "grad_norm": 0.3701655575188742, + "learning_rate": 2.5066350362287407e-05, + "loss": 0.0556, + "step": 1985 + }, + { + "epoch": 0.44, + "grad_norm": 0.37219148199736224, + "learning_rate": 2.5052584933200756e-05, + "loss": 0.0562, + "step": 1986 + }, + { + "epoch": 0.44, + "grad_norm": 0.522504487406247, + "learning_rate": 2.5038816946969894e-05, + "loss": 0.0867, + "step": 1987 + }, + { + "epoch": 0.44, + "grad_norm": 0.4148666203116047, + "learning_rate": 2.5025046410562888e-05, + "loss": 0.076, + "step": 1988 + }, + { + "epoch": 0.44, + "grad_norm": 0.4241673390970852, + "learning_rate": 2.501127333094909e-05, + "loss": 0.058, + "step": 1989 + }, + { + "epoch": 0.44, + "grad_norm": 0.524471741528751, + "learning_rate": 2.4997497715099134e-05, + "loss": 0.0695, + "step": 1990 + }, + { + "epoch": 0.44, + "grad_norm": 0.48102112938397373, + "learning_rate": 2.4983719569984955e-05, + "loss": 0.0668, + "step": 1991 + }, + { + "epoch": 0.44, + "grad_norm": 0.3793570795807171, + "learning_rate": 2.496993890257975e-05, + "loss": 0.0638, + "step": 1992 + }, + { + "epoch": 0.44, + "grad_norm": 0.3804833732473676, + "learning_rate": 2.4956155719858e-05, + "loss": 0.0473, + "step": 1993 + }, + { + "epoch": 0.44, + "grad_norm": 0.4429668419744718, + "learning_rate": 2.4942370028795456e-05, + "loss": 0.0663, + "step": 1994 + }, + { + "epoch": 0.44, + "grad_norm": 0.5238030139613532, + "learning_rate": 2.4928581836369147e-05, + "loss": 0.089, + "step": 1995 + }, + { + "epoch": 0.44, + "grad_norm": 0.4618954483378638, + "learning_rate": 2.4914791149557358e-05, + "loss": 0.0645, + "step": 1996 + }, + { + "epoch": 0.44, + "grad_norm": 0.40761575008590367, + "learning_rate": 2.490099797533964e-05, + "loss": 0.0684, + "step": 1997 + }, + { + "epoch": 0.44, + "grad_norm": 0.39386441748205986, + "learning_rate": 2.48872023206968e-05, + "loss": 0.0582, + "step": 1998 + }, + { + "epoch": 0.44, + "grad_norm": 0.580610602439528, + "learning_rate": 2.487340419261091e-05, + "loss": 0.1076, + "step": 1999 + }, + { + "epoch": 0.44, + "grad_norm": 0.41888062660197195, + "learning_rate": 2.485960359806528e-05, + "loss": 0.0584, + "step": 2000 + }, + { + "epoch": 0.44, + "grad_norm": 0.4820066510667588, + "learning_rate": 2.4845800544044483e-05, + "loss": 0.0792, + "step": 2001 + }, + { + "epoch": 0.44, + "grad_norm": 0.4113433733628816, + "learning_rate": 2.4831995037534325e-05, + "loss": 0.0639, + "step": 2002 + }, + { + "epoch": 0.44, + "grad_norm": 0.4688180556167094, + "learning_rate": 2.481818708552185e-05, + "loss": 0.0657, + "step": 2003 + }, + { + "epoch": 0.44, + "grad_norm": 0.557453565759295, + "learning_rate": 2.480437669499537e-05, + "loss": 0.0744, + "step": 2004 + }, + { + "epoch": 0.44, + "grad_norm": 0.45971494928556966, + "learning_rate": 2.479056387294438e-05, + "loss": 0.0745, + "step": 2005 + }, + { + "epoch": 0.44, + "grad_norm": 0.3851151984198529, + "learning_rate": 2.4776748626359656e-05, + "loss": 0.0543, + "step": 2006 + }, + { + "epoch": 0.44, + "grad_norm": 0.5287876565298385, + "learning_rate": 2.4762930962233164e-05, + "loss": 0.0922, + "step": 2007 + }, + { + "epoch": 0.44, + "grad_norm": 0.5038474884074328, + "learning_rate": 2.4749110887558114e-05, + "loss": 0.0628, + "step": 2008 + }, + { + "epoch": 0.44, + "grad_norm": 0.43435565919150204, + "learning_rate": 2.4735288409328937e-05, + "loss": 0.0619, + "step": 2009 + }, + { + "epoch": 0.44, + "grad_norm": 0.4305378735493745, + "learning_rate": 2.472146353454127e-05, + "loss": 0.0666, + "step": 2010 + }, + { + "epoch": 0.44, + "grad_norm": 0.42052916150574776, + "learning_rate": 2.4707636270191956e-05, + "loss": 0.0669, + "step": 2011 + }, + { + "epoch": 0.44, + "grad_norm": 0.4320426553114052, + "learning_rate": 2.4693806623279074e-05, + "loss": 0.0643, + "step": 2012 + }, + { + "epoch": 0.44, + "grad_norm": 0.45603134128190864, + "learning_rate": 2.4679974600801882e-05, + "loss": 0.0756, + "step": 2013 + }, + { + "epoch": 0.44, + "grad_norm": 0.5432398825807966, + "learning_rate": 2.4666140209760862e-05, + "loss": 0.1047, + "step": 2014 + }, + { + "epoch": 0.44, + "grad_norm": 0.3501122018088189, + "learning_rate": 2.4652303457157677e-05, + "loss": 0.0458, + "step": 2015 + }, + { + "epoch": 0.44, + "grad_norm": 0.4849934014382745, + "learning_rate": 2.4638464349995186e-05, + "loss": 0.0739, + "step": 2016 + }, + { + "epoch": 0.44, + "grad_norm": 0.4127435189367342, + "learning_rate": 2.4624622895277462e-05, + "loss": 0.0532, + "step": 2017 + }, + { + "epoch": 0.44, + "grad_norm": 0.3852790853300477, + "learning_rate": 2.461077910000974e-05, + "loss": 0.0526, + "step": 2018 + }, + { + "epoch": 0.44, + "grad_norm": 0.39799662114630846, + "learning_rate": 2.4596932971198446e-05, + "loss": 0.0678, + "step": 2019 + }, + { + "epoch": 0.44, + "grad_norm": 0.42710675392442127, + "learning_rate": 2.4583084515851194e-05, + "loss": 0.0618, + "step": 2020 + }, + { + "epoch": 0.44, + "grad_norm": 0.4730950517637139, + "learning_rate": 2.456923374097678e-05, + "loss": 0.0679, + "step": 2021 + }, + { + "epoch": 0.44, + "grad_norm": 0.3588081118074976, + "learning_rate": 2.4555380653585158e-05, + "loss": 0.0578, + "step": 2022 + }, + { + "epoch": 0.44, + "grad_norm": 0.4513238923236032, + "learning_rate": 2.4541525260687468e-05, + "loss": 0.0626, + "step": 2023 + }, + { + "epoch": 0.44, + "grad_norm": 0.5074836840144351, + "learning_rate": 2.4527667569295996e-05, + "loss": 0.0722, + "step": 2024 + }, + { + "epoch": 0.44, + "grad_norm": 0.45947534801574375, + "learning_rate": 2.4513807586424214e-05, + "loss": 0.0532, + "step": 2025 + }, + { + "epoch": 0.44, + "grad_norm": 0.41736844885552515, + "learning_rate": 2.449994531908675e-05, + "loss": 0.0625, + "step": 2026 + }, + { + "epoch": 0.45, + "grad_norm": 0.4226561308347636, + "learning_rate": 2.4486080774299364e-05, + "loss": 0.0602, + "step": 2027 + }, + { + "epoch": 0.45, + "grad_norm": 0.44196260886816885, + "learning_rate": 2.4472213959079002e-05, + "loss": 0.0608, + "step": 2028 + }, + { + "epoch": 0.45, + "grad_norm": 0.48246632112555954, + "learning_rate": 2.4458344880443735e-05, + "loss": 0.0785, + "step": 2029 + }, + { + "epoch": 0.45, + "grad_norm": 0.43183535502932363, + "learning_rate": 2.4444473545412804e-05, + "loss": 0.0519, + "step": 2030 + }, + { + "epoch": 0.45, + "grad_norm": 0.47867407219085056, + "learning_rate": 2.4430599961006563e-05, + "loss": 0.0679, + "step": 2031 + }, + { + "epoch": 0.45, + "grad_norm": 0.385196541309476, + "learning_rate": 2.441672413424652e-05, + "loss": 0.0545, + "step": 2032 + }, + { + "epoch": 0.45, + "grad_norm": 0.38971940926330956, + "learning_rate": 2.4402846072155313e-05, + "loss": 0.0598, + "step": 2033 + }, + { + "epoch": 0.45, + "grad_norm": 0.4747944694621555, + "learning_rate": 2.4388965781756727e-05, + "loss": 0.0663, + "step": 2034 + }, + { + "epoch": 0.45, + "grad_norm": 0.5031381711953609, + "learning_rate": 2.437508327007565e-05, + "loss": 0.0781, + "step": 2035 + }, + { + "epoch": 0.45, + "grad_norm": 0.3809346246962901, + "learning_rate": 2.4361198544138117e-05, + "loss": 0.0534, + "step": 2036 + }, + { + "epoch": 0.45, + "grad_norm": 0.34526724195138403, + "learning_rate": 2.4347311610971255e-05, + "loss": 0.0496, + "step": 2037 + }, + { + "epoch": 0.45, + "grad_norm": 0.4932194744743591, + "learning_rate": 2.4333422477603342e-05, + "loss": 0.0835, + "step": 2038 + }, + { + "epoch": 0.45, + "grad_norm": 0.48806754967168864, + "learning_rate": 2.4319531151063753e-05, + "loss": 0.0631, + "step": 2039 + }, + { + "epoch": 0.45, + "grad_norm": 0.3881418457614026, + "learning_rate": 2.4305637638382967e-05, + "loss": 0.0453, + "step": 2040 + }, + { + "epoch": 0.45, + "grad_norm": 0.3958838664809399, + "learning_rate": 2.4291741946592575e-05, + "loss": 0.0618, + "step": 2041 + }, + { + "epoch": 0.45, + "grad_norm": 0.4232946985585955, + "learning_rate": 2.427784408272528e-05, + "loss": 0.0558, + "step": 2042 + }, + { + "epoch": 0.45, + "grad_norm": 0.45405005706414564, + "learning_rate": 2.4263944053814866e-05, + "loss": 0.072, + "step": 2043 + }, + { + "epoch": 0.45, + "grad_norm": 0.41057790245168874, + "learning_rate": 2.4250041866896234e-05, + "loss": 0.0633, + "step": 2044 + }, + { + "epoch": 0.45, + "grad_norm": 0.49894964639299666, + "learning_rate": 2.4236137529005355e-05, + "loss": 0.0733, + "step": 2045 + }, + { + "epoch": 0.45, + "grad_norm": 0.4828462008717174, + "learning_rate": 2.4222231047179303e-05, + "loss": 0.0645, + "step": 2046 + }, + { + "epoch": 0.45, + "grad_norm": 0.4851688985358276, + "learning_rate": 2.420832242845624e-05, + "loss": 0.0518, + "step": 2047 + }, + { + "epoch": 0.45, + "grad_norm": 0.47643881896966234, + "learning_rate": 2.41944116798754e-05, + "loss": 0.0719, + "step": 2048 + }, + { + "epoch": 0.45, + "grad_norm": 0.46149104227028975, + "learning_rate": 2.4180498808477096e-05, + "loss": 0.0696, + "step": 2049 + }, + { + "epoch": 0.45, + "grad_norm": 0.4211329716125816, + "learning_rate": 2.4166583821302712e-05, + "loss": 0.0714, + "step": 2050 + }, + { + "epoch": 0.45, + "grad_norm": 0.3851136120143572, + "learning_rate": 2.4152666725394717e-05, + "loss": 0.0594, + "step": 2051 + }, + { + "epoch": 0.45, + "grad_norm": 0.4271611243632751, + "learning_rate": 2.413874752779664e-05, + "loss": 0.0637, + "step": 2052 + }, + { + "epoch": 0.45, + "grad_norm": 0.4608236592549507, + "learning_rate": 2.412482623555307e-05, + "loss": 0.0786, + "step": 2053 + }, + { + "epoch": 0.45, + "grad_norm": 0.4462314293039966, + "learning_rate": 2.411090285570965e-05, + "loss": 0.0636, + "step": 2054 + }, + { + "epoch": 0.45, + "grad_norm": 0.41341095170918657, + "learning_rate": 2.4096977395313096e-05, + "loss": 0.0691, + "step": 2055 + }, + { + "epoch": 0.45, + "grad_norm": 0.3892252639047788, + "learning_rate": 2.4083049861411173e-05, + "loss": 0.0557, + "step": 2056 + }, + { + "epoch": 0.45, + "grad_norm": 0.43831609495639573, + "learning_rate": 2.4069120261052682e-05, + "loss": 0.0713, + "step": 2057 + }, + { + "epoch": 0.45, + "grad_norm": 0.4160579785202587, + "learning_rate": 2.4055188601287483e-05, + "loss": 0.0649, + "step": 2058 + }, + { + "epoch": 0.45, + "grad_norm": 0.35123628740593144, + "learning_rate": 2.404125488916647e-05, + "loss": 0.0519, + "step": 2059 + }, + { + "epoch": 0.45, + "grad_norm": 0.3956505788576386, + "learning_rate": 2.402731913174159e-05, + "loss": 0.0516, + "step": 2060 + }, + { + "epoch": 0.45, + "grad_norm": 0.5417443335922905, + "learning_rate": 2.4013381336065805e-05, + "loss": 0.0875, + "step": 2061 + }, + { + "epoch": 0.45, + "grad_norm": 0.4087644851229479, + "learning_rate": 2.399944150919313e-05, + "loss": 0.0595, + "step": 2062 + }, + { + "epoch": 0.45, + "grad_norm": 0.4386785504995785, + "learning_rate": 2.398549965817858e-05, + "loss": 0.0634, + "step": 2063 + }, + { + "epoch": 0.45, + "grad_norm": 0.3772173533599839, + "learning_rate": 2.3971555790078228e-05, + "loss": 0.0626, + "step": 2064 + }, + { + "epoch": 0.45, + "grad_norm": 0.49008808764987444, + "learning_rate": 2.3957609911949146e-05, + "loss": 0.0765, + "step": 2065 + }, + { + "epoch": 0.45, + "grad_norm": 0.3784129283938517, + "learning_rate": 2.3943662030849426e-05, + "loss": 0.0541, + "step": 2066 + }, + { + "epoch": 0.45, + "grad_norm": 0.42390100230516714, + "learning_rate": 2.3929712153838173e-05, + "loss": 0.0645, + "step": 2067 + }, + { + "epoch": 0.45, + "grad_norm": 0.5508192333353973, + "learning_rate": 2.3915760287975515e-05, + "loss": 0.0865, + "step": 2068 + }, + { + "epoch": 0.45, + "grad_norm": 0.48057344117933043, + "learning_rate": 2.390180644032257e-05, + "loss": 0.0718, + "step": 2069 + }, + { + "epoch": 0.45, + "grad_norm": 0.44469013559146137, + "learning_rate": 2.3887850617941464e-05, + "loss": 0.0731, + "step": 2070 + }, + { + "epoch": 0.45, + "grad_norm": 0.37449150577991236, + "learning_rate": 2.3873892827895332e-05, + "loss": 0.0592, + "step": 2071 + }, + { + "epoch": 0.46, + "grad_norm": 0.3453121668611188, + "learning_rate": 2.3859933077248285e-05, + "loss": 0.0572, + "step": 2072 + }, + { + "epoch": 0.46, + "grad_norm": 0.3831193628254262, + "learning_rate": 2.3845971373065452e-05, + "loss": 0.0448, + "step": 2073 + }, + { + "epoch": 0.46, + "grad_norm": 0.4014324311553023, + "learning_rate": 2.3832007722412934e-05, + "loss": 0.066, + "step": 2074 + }, + { + "epoch": 0.46, + "grad_norm": 0.4756276701803358, + "learning_rate": 2.3818042132357812e-05, + "loss": 0.0615, + "step": 2075 + }, + { + "epoch": 0.46, + "grad_norm": 0.46100292711300944, + "learning_rate": 2.3804074609968158e-05, + "loss": 0.0621, + "step": 2076 + }, + { + "epoch": 0.46, + "grad_norm": 0.40052272199260675, + "learning_rate": 2.3790105162313032e-05, + "loss": 0.0569, + "step": 2077 + }, + { + "epoch": 0.46, + "grad_norm": 0.47979434684151695, + "learning_rate": 2.3776133796462446e-05, + "loss": 0.0566, + "step": 2078 + }, + { + "epoch": 0.46, + "grad_norm": 0.503394081000372, + "learning_rate": 2.3762160519487402e-05, + "loss": 0.089, + "step": 2079 + }, + { + "epoch": 0.46, + "grad_norm": 0.47883710377584393, + "learning_rate": 2.3748185338459847e-05, + "loss": 0.0656, + "step": 2080 + }, + { + "epoch": 0.46, + "grad_norm": 0.4445039536286361, + "learning_rate": 2.3734208260452727e-05, + "loss": 0.0679, + "step": 2081 + }, + { + "epoch": 0.46, + "grad_norm": 0.6269723526807839, + "learning_rate": 2.372022929253991e-05, + "loss": 0.0928, + "step": 2082 + }, + { + "epoch": 0.46, + "grad_norm": 0.45459800573575326, + "learning_rate": 2.3706248441796246e-05, + "loss": 0.0678, + "step": 2083 + }, + { + "epoch": 0.46, + "grad_norm": 0.48578360533979176, + "learning_rate": 2.369226571529752e-05, + "loss": 0.0953, + "step": 2084 + }, + { + "epoch": 0.46, + "grad_norm": 0.3936335860836555, + "learning_rate": 2.3678281120120485e-05, + "loss": 0.0661, + "step": 2085 + }, + { + "epoch": 0.46, + "grad_norm": 0.3621791611864308, + "learning_rate": 2.366429466334283e-05, + "loss": 0.0416, + "step": 2086 + }, + { + "epoch": 0.46, + "grad_norm": 0.4244046662682168, + "learning_rate": 2.3650306352043182e-05, + "loss": 0.0617, + "step": 2087 + }, + { + "epoch": 0.46, + "grad_norm": 0.3979055930722182, + "learning_rate": 2.3636316193301107e-05, + "loss": 0.047, + "step": 2088 + }, + { + "epoch": 0.46, + "grad_norm": 0.5196930482789566, + "learning_rate": 2.3622324194197118e-05, + "loss": 0.0734, + "step": 2089 + }, + { + "epoch": 0.46, + "grad_norm": 0.3873937091870021, + "learning_rate": 2.3608330361812652e-05, + "loss": 0.0708, + "step": 2090 + }, + { + "epoch": 0.46, + "grad_norm": 0.3548939847473483, + "learning_rate": 2.3594334703230065e-05, + "loss": 0.0524, + "step": 2091 + }, + { + "epoch": 0.46, + "grad_norm": 0.3971874411438089, + "learning_rate": 2.3580337225532663e-05, + "loss": 0.0568, + "step": 2092 + }, + { + "epoch": 0.46, + "grad_norm": 0.4571903636475039, + "learning_rate": 2.356633793580463e-05, + "loss": 0.0694, + "step": 2093 + }, + { + "epoch": 0.46, + "grad_norm": 0.38851891196183885, + "learning_rate": 2.355233684113111e-05, + "loss": 0.0735, + "step": 2094 + }, + { + "epoch": 0.46, + "grad_norm": 0.4311069970136712, + "learning_rate": 2.3538333948598142e-05, + "loss": 0.0633, + "step": 2095 + }, + { + "epoch": 0.46, + "grad_norm": 0.5541316457548243, + "learning_rate": 2.3524329265292668e-05, + "loss": 0.0989, + "step": 2096 + }, + { + "epoch": 0.46, + "grad_norm": 0.5119881248531218, + "learning_rate": 2.3510322798302553e-05, + "loss": 0.0733, + "step": 2097 + }, + { + "epoch": 0.46, + "grad_norm": 0.35231347937292995, + "learning_rate": 2.3496314554716543e-05, + "loss": 0.0499, + "step": 2098 + }, + { + "epoch": 0.46, + "grad_norm": 0.4671012811618288, + "learning_rate": 2.348230454162431e-05, + "loss": 0.0646, + "step": 2099 + }, + { + "epoch": 0.46, + "grad_norm": 0.41502455742116545, + "learning_rate": 2.34682927661164e-05, + "loss": 0.0609, + "step": 2100 + }, + { + "epoch": 0.46, + "grad_norm": 0.36580421750435826, + "learning_rate": 2.3454279235284264e-05, + "loss": 0.0516, + "step": 2101 + }, + { + "epoch": 0.46, + "grad_norm": 0.3288414490352913, + "learning_rate": 2.344026395622023e-05, + "loss": 0.0411, + "step": 2102 + }, + { + "epoch": 0.46, + "grad_norm": 0.5474729266505735, + "learning_rate": 2.3426246936017514e-05, + "loss": 0.0827, + "step": 2103 + }, + { + "epoch": 0.46, + "grad_norm": 0.3939901361416779, + "learning_rate": 2.3412228181770224e-05, + "loss": 0.0615, + "step": 2104 + }, + { + "epoch": 0.46, + "grad_norm": 0.4041682605449802, + "learning_rate": 2.3398207700573336e-05, + "loss": 0.066, + "step": 2105 + }, + { + "epoch": 0.46, + "grad_norm": 0.45185146919493696, + "learning_rate": 2.3384185499522696e-05, + "loss": 0.0771, + "step": 2106 + }, + { + "epoch": 0.46, + "grad_norm": 0.39996471547427803, + "learning_rate": 2.337016158571503e-05, + "loss": 0.0537, + "step": 2107 + }, + { + "epoch": 0.46, + "grad_norm": 0.441245767232677, + "learning_rate": 2.335613596624793e-05, + "loss": 0.0729, + "step": 2108 + }, + { + "epoch": 0.46, + "grad_norm": 0.4054840643605466, + "learning_rate": 2.334210864821984e-05, + "loss": 0.0524, + "step": 2109 + }, + { + "epoch": 0.46, + "grad_norm": 0.48563810981326716, + "learning_rate": 2.3328079638730073e-05, + "loss": 0.0801, + "step": 2110 + }, + { + "epoch": 0.46, + "grad_norm": 0.4039880983221286, + "learning_rate": 2.3314048944878804e-05, + "loss": 0.0548, + "step": 2111 + }, + { + "epoch": 0.46, + "grad_norm": 0.48363093170054183, + "learning_rate": 2.330001657376705e-05, + "loss": 0.0823, + "step": 2112 + }, + { + "epoch": 0.46, + "grad_norm": 0.3865247490288611, + "learning_rate": 2.3285982532496676e-05, + "loss": 0.0684, + "step": 2113 + }, + { + "epoch": 0.46, + "grad_norm": 0.37701545951017845, + "learning_rate": 2.32719468281704e-05, + "loss": 0.0563, + "step": 2114 + }, + { + "epoch": 0.46, + "grad_norm": 0.4805666350852702, + "learning_rate": 2.325790946789178e-05, + "loss": 0.0873, + "step": 2115 + }, + { + "epoch": 0.46, + "grad_norm": 0.394559689242731, + "learning_rate": 2.32438704587652e-05, + "loss": 0.0542, + "step": 2116 + }, + { + "epoch": 0.46, + "grad_norm": 0.3775259419236919, + "learning_rate": 2.3229829807895904e-05, + "loss": 0.0574, + "step": 2117 + }, + { + "epoch": 0.47, + "grad_norm": 0.39554686176948745, + "learning_rate": 2.3215787522389935e-05, + "loss": 0.0521, + "step": 2118 + }, + { + "epoch": 0.47, + "grad_norm": 0.41370662654906654, + "learning_rate": 2.3201743609354187e-05, + "loss": 0.0639, + "step": 2119 + }, + { + "epoch": 0.47, + "grad_norm": 0.34958462121375417, + "learning_rate": 2.3187698075896378e-05, + "loss": 0.0453, + "step": 2120 + }, + { + "epoch": 0.47, + "grad_norm": 0.4804601591008989, + "learning_rate": 2.317365092912503e-05, + "loss": 0.0712, + "step": 2121 + }, + { + "epoch": 0.47, + "grad_norm": 0.4395080871120076, + "learning_rate": 2.3159602176149493e-05, + "loss": 0.078, + "step": 2122 + }, + { + "epoch": 0.47, + "grad_norm": 0.49353773105297966, + "learning_rate": 2.314555182407992e-05, + "loss": 0.0694, + "step": 2123 + }, + { + "epoch": 0.47, + "grad_norm": 0.41822454165715695, + "learning_rate": 2.3131499880027294e-05, + "loss": 0.0651, + "step": 2124 + }, + { + "epoch": 0.47, + "grad_norm": 0.4385989215954988, + "learning_rate": 2.311744635110338e-05, + "loss": 0.0643, + "step": 2125 + }, + { + "epoch": 0.47, + "grad_norm": 0.45052014793681966, + "learning_rate": 2.3103391244420754e-05, + "loss": 0.0683, + "step": 2126 + }, + { + "epoch": 0.47, + "grad_norm": 0.454793816685309, + "learning_rate": 2.30893345670928e-05, + "loss": 0.0699, + "step": 2127 + }, + { + "epoch": 0.47, + "grad_norm": 0.44657509817367813, + "learning_rate": 2.3075276326233676e-05, + "loss": 0.0803, + "step": 2128 + }, + { + "epoch": 0.47, + "grad_norm": 0.40947483315993416, + "learning_rate": 2.306121652895836e-05, + "loss": 0.062, + "step": 2129 + }, + { + "epoch": 0.47, + "grad_norm": 0.38736266655873886, + "learning_rate": 2.3047155182382584e-05, + "loss": 0.0543, + "step": 2130 + }, + { + "epoch": 0.47, + "grad_norm": 0.3548728643355111, + "learning_rate": 2.3033092293622903e-05, + "loss": 0.0476, + "step": 2131 + }, + { + "epoch": 0.47, + "grad_norm": 0.4771604694937133, + "learning_rate": 2.3019027869796607e-05, + "loss": 0.0645, + "step": 2132 + }, + { + "epoch": 0.47, + "grad_norm": 0.4048655468083591, + "learning_rate": 2.3004961918021804e-05, + "loss": 0.0533, + "step": 2133 + }, + { + "epoch": 0.47, + "grad_norm": 0.3945217886819489, + "learning_rate": 2.2990894445417355e-05, + "loss": 0.0698, + "step": 2134 + }, + { + "epoch": 0.47, + "grad_norm": 0.44724783383095956, + "learning_rate": 2.2976825459102898e-05, + "loss": 0.0602, + "step": 2135 + }, + { + "epoch": 0.47, + "grad_norm": 0.4704754068846899, + "learning_rate": 2.2962754966198815e-05, + "loss": 0.0769, + "step": 2136 + }, + { + "epoch": 0.47, + "grad_norm": 0.4445963002428909, + "learning_rate": 2.2948682973826292e-05, + "loss": 0.072, + "step": 2137 + }, + { + "epoch": 0.47, + "grad_norm": 0.38627250047225226, + "learning_rate": 2.2934609489107236e-05, + "loss": 0.0499, + "step": 2138 + }, + { + "epoch": 0.47, + "grad_norm": 0.5143966493187458, + "learning_rate": 2.292053451916433e-05, + "loss": 0.0722, + "step": 2139 + }, + { + "epoch": 0.47, + "grad_norm": 0.42852994882712525, + "learning_rate": 2.2906458071121e-05, + "loss": 0.0503, + "step": 2140 + }, + { + "epoch": 0.47, + "grad_norm": 0.3497800144203384, + "learning_rate": 2.289238015210142e-05, + "loss": 0.0421, + "step": 2141 + }, + { + "epoch": 0.47, + "grad_norm": 0.4088490317765311, + "learning_rate": 2.2878300769230522e-05, + "loss": 0.0426, + "step": 2142 + }, + { + "epoch": 0.47, + "grad_norm": 0.45912554578699355, + "learning_rate": 2.2864219929633956e-05, + "loss": 0.0635, + "step": 2143 + }, + { + "epoch": 0.47, + "grad_norm": 0.4630192606083607, + "learning_rate": 2.2850137640438126e-05, + "loss": 0.0781, + "step": 2144 + }, + { + "epoch": 0.47, + "grad_norm": 0.45750754553403933, + "learning_rate": 2.2836053908770165e-05, + "loss": 0.0743, + "step": 2145 + }, + { + "epoch": 0.47, + "grad_norm": 0.46505834931196594, + "learning_rate": 2.2821968741757935e-05, + "loss": 0.0754, + "step": 2146 + }, + { + "epoch": 0.47, + "grad_norm": 0.38697667623807835, + "learning_rate": 2.280788214653003e-05, + "loss": 0.0662, + "step": 2147 + }, + { + "epoch": 0.47, + "grad_norm": 0.376535846827464, + "learning_rate": 2.2793794130215753e-05, + "loss": 0.0484, + "step": 2148 + }, + { + "epoch": 0.47, + "grad_norm": 0.41352409770681026, + "learning_rate": 2.2779704699945136e-05, + "loss": 0.0645, + "step": 2149 + }, + { + "epoch": 0.47, + "grad_norm": 0.3455735087539365, + "learning_rate": 2.2765613862848936e-05, + "loss": 0.0504, + "step": 2150 + }, + { + "epoch": 0.47, + "grad_norm": 0.42004527563202043, + "learning_rate": 2.2751521626058607e-05, + "loss": 0.0643, + "step": 2151 + }, + { + "epoch": 0.47, + "grad_norm": 0.40863652926214156, + "learning_rate": 2.2737427996706316e-05, + "loss": 0.0544, + "step": 2152 + }, + { + "epoch": 0.47, + "grad_norm": 0.3377745875628734, + "learning_rate": 2.2723332981924937e-05, + "loss": 0.0459, + "step": 2153 + }, + { + "epoch": 0.47, + "grad_norm": 0.39150200622356357, + "learning_rate": 2.2709236588848036e-05, + "loss": 0.0535, + "step": 2154 + }, + { + "epoch": 0.47, + "grad_norm": 0.5176623006064641, + "learning_rate": 2.269513882460989e-05, + "loss": 0.0888, + "step": 2155 + }, + { + "epoch": 0.47, + "grad_norm": 0.40359070514234835, + "learning_rate": 2.268103969634547e-05, + "loss": 0.0482, + "step": 2156 + }, + { + "epoch": 0.47, + "grad_norm": 0.45307968176111113, + "learning_rate": 2.266693921119042e-05, + "loss": 0.0661, + "step": 2157 + }, + { + "epoch": 0.47, + "grad_norm": 0.6422601162212864, + "learning_rate": 2.2652837376281087e-05, + "loss": 0.0816, + "step": 2158 + }, + { + "epoch": 0.47, + "grad_norm": 0.3880241018409379, + "learning_rate": 2.2638734198754496e-05, + "loss": 0.0642, + "step": 2159 + }, + { + "epoch": 0.47, + "grad_norm": 0.3337084539500318, + "learning_rate": 2.2624629685748353e-05, + "loss": 0.0465, + "step": 2160 + }, + { + "epoch": 0.47, + "grad_norm": 0.37604125816273715, + "learning_rate": 2.261052384440104e-05, + "loss": 0.0629, + "step": 2161 + }, + { + "epoch": 0.47, + "grad_norm": 0.41380834876811373, + "learning_rate": 2.2596416681851595e-05, + "loss": 0.0567, + "step": 2162 + }, + { + "epoch": 0.48, + "grad_norm": 0.4765554959142064, + "learning_rate": 2.2582308205239757e-05, + "loss": 0.0697, + "step": 2163 + }, + { + "epoch": 0.48, + "grad_norm": 0.34652462567446174, + "learning_rate": 2.256819842170591e-05, + "loss": 0.0428, + "step": 2164 + }, + { + "epoch": 0.48, + "grad_norm": 0.3953894167167263, + "learning_rate": 2.2554087338391098e-05, + "loss": 0.0622, + "step": 2165 + }, + { + "epoch": 0.48, + "grad_norm": 0.47163784244279544, + "learning_rate": 2.2539974962437022e-05, + "loss": 0.0849, + "step": 2166 + }, + { + "epoch": 0.48, + "grad_norm": 0.4278849506541713, + "learning_rate": 2.252586130098605e-05, + "loss": 0.0663, + "step": 2167 + }, + { + "epoch": 0.48, + "grad_norm": 0.49671398648944554, + "learning_rate": 2.251174636118119e-05, + "loss": 0.0574, + "step": 2168 + }, + { + "epoch": 0.48, + "grad_norm": 0.40812914237217485, + "learning_rate": 2.2497630150166102e-05, + "loss": 0.0607, + "step": 2169 + }, + { + "epoch": 0.48, + "grad_norm": 0.4092688451970953, + "learning_rate": 2.2483512675085085e-05, + "loss": 0.0657, + "step": 2170 + }, + { + "epoch": 0.48, + "grad_norm": 0.41064415023368733, + "learning_rate": 2.2469393943083068e-05, + "loss": 0.0525, + "step": 2171 + }, + { + "epoch": 0.48, + "grad_norm": 0.4334403445391836, + "learning_rate": 2.245527396130565e-05, + "loss": 0.062, + "step": 2172 + }, + { + "epoch": 0.48, + "grad_norm": 0.4424358614004642, + "learning_rate": 2.2441152736899026e-05, + "loss": 0.0805, + "step": 2173 + }, + { + "epoch": 0.48, + "grad_norm": 0.4298174228183831, + "learning_rate": 2.242703027701004e-05, + "loss": 0.0587, + "step": 2174 + }, + { + "epoch": 0.48, + "grad_norm": 0.3311299151354249, + "learning_rate": 2.2412906588786147e-05, + "loss": 0.0548, + "step": 2175 + }, + { + "epoch": 0.48, + "grad_norm": 0.5231035220386947, + "learning_rate": 2.2398781679375445e-05, + "loss": 0.0883, + "step": 2176 + }, + { + "epoch": 0.48, + "grad_norm": 0.432308870762196, + "learning_rate": 2.2384655555926625e-05, + "loss": 0.0757, + "step": 2177 + }, + { + "epoch": 0.48, + "grad_norm": 0.4268838796627569, + "learning_rate": 2.237052822558901e-05, + "loss": 0.0714, + "step": 2178 + }, + { + "epoch": 0.48, + "grad_norm": 0.4251954835217631, + "learning_rate": 2.235639969551253e-05, + "loss": 0.0518, + "step": 2179 + }, + { + "epoch": 0.48, + "grad_norm": 0.38094984104985813, + "learning_rate": 2.2342269972847718e-05, + "loss": 0.0544, + "step": 2180 + }, + { + "epoch": 0.48, + "grad_norm": 0.4359203499322573, + "learning_rate": 2.232813906474572e-05, + "loss": 0.0518, + "step": 2181 + }, + { + "epoch": 0.48, + "grad_norm": 0.42064737147089587, + "learning_rate": 2.2314006978358263e-05, + "loss": 0.0462, + "step": 2182 + }, + { + "epoch": 0.48, + "grad_norm": 0.426380243595148, + "learning_rate": 2.2299873720837692e-05, + "loss": 0.0452, + "step": 2183 + }, + { + "epoch": 0.48, + "grad_norm": 0.3966205907889756, + "learning_rate": 2.2285739299336933e-05, + "loss": 0.0465, + "step": 2184 + }, + { + "epoch": 0.48, + "grad_norm": 0.3828731994773382, + "learning_rate": 2.22716037210095e-05, + "loss": 0.0639, + "step": 2185 + }, + { + "epoch": 0.48, + "grad_norm": 0.39471942832875806, + "learning_rate": 2.2257466993009503e-05, + "loss": 0.0642, + "step": 2186 + }, + { + "epoch": 0.48, + "grad_norm": 0.4215792042821033, + "learning_rate": 2.2243329122491617e-05, + "loss": 0.0574, + "step": 2187 + }, + { + "epoch": 0.48, + "grad_norm": 0.49431560268048147, + "learning_rate": 2.222919011661111e-05, + "loss": 0.077, + "step": 2188 + }, + { + "epoch": 0.48, + "grad_norm": 0.4424619456539286, + "learning_rate": 2.2215049982523827e-05, + "loss": 0.0567, + "step": 2189 + }, + { + "epoch": 0.48, + "grad_norm": 0.4930441228976846, + "learning_rate": 2.2200908727386167e-05, + "loss": 0.0829, + "step": 2190 + }, + { + "epoch": 0.48, + "grad_norm": 0.43646680221292344, + "learning_rate": 2.2186766358355106e-05, + "loss": 0.0687, + "step": 2191 + }, + { + "epoch": 0.48, + "grad_norm": 0.3875802010940222, + "learning_rate": 2.217262288258818e-05, + "loss": 0.0563, + "step": 2192 + }, + { + "epoch": 0.48, + "grad_norm": 0.4571296553349933, + "learning_rate": 2.2158478307243507e-05, + "loss": 0.0694, + "step": 2193 + }, + { + "epoch": 0.48, + "grad_norm": 0.40632144593168007, + "learning_rate": 2.2144332639479722e-05, + "loss": 0.0535, + "step": 2194 + }, + { + "epoch": 0.48, + "grad_norm": 0.4048639641812654, + "learning_rate": 2.213018588645605e-05, + "loss": 0.0625, + "step": 2195 + }, + { + "epoch": 0.48, + "grad_norm": 0.4720688945731608, + "learning_rate": 2.2116038055332238e-05, + "loss": 0.0681, + "step": 2196 + }, + { + "epoch": 0.48, + "grad_norm": 0.4141419892215765, + "learning_rate": 2.2101889153268595e-05, + "loss": 0.0575, + "step": 2197 + }, + { + "epoch": 0.48, + "grad_norm": 0.38017011040985954, + "learning_rate": 2.2087739187425967e-05, + "loss": 0.0561, + "step": 2198 + }, + { + "epoch": 0.48, + "grad_norm": 0.42091886466391487, + "learning_rate": 2.2073588164965737e-05, + "loss": 0.0536, + "step": 2199 + }, + { + "epoch": 0.48, + "grad_norm": 0.395803308184746, + "learning_rate": 2.205943609304983e-05, + "loss": 0.0507, + "step": 2200 + }, + { + "epoch": 0.48, + "grad_norm": 0.41410210258103936, + "learning_rate": 2.2045282978840684e-05, + "loss": 0.0505, + "step": 2201 + }, + { + "epoch": 0.48, + "grad_norm": 0.31556133204089304, + "learning_rate": 2.2031128829501293e-05, + "loss": 0.0407, + "step": 2202 + }, + { + "epoch": 0.48, + "grad_norm": 0.4462056154997179, + "learning_rate": 2.2016973652195145e-05, + "loss": 0.0724, + "step": 2203 + }, + { + "epoch": 0.48, + "grad_norm": 0.4439268303853345, + "learning_rate": 2.200281745408627e-05, + "loss": 0.0596, + "step": 2204 + }, + { + "epoch": 0.48, + "grad_norm": 0.4004929016486724, + "learning_rate": 2.1988660242339205e-05, + "loss": 0.0568, + "step": 2205 + }, + { + "epoch": 0.48, + "grad_norm": 0.3522127284277402, + "learning_rate": 2.1974502024119002e-05, + "loss": 0.0456, + "step": 2206 + }, + { + "epoch": 0.48, + "grad_norm": 0.5424500327237654, + "learning_rate": 2.196034280659122e-05, + "loss": 0.0967, + "step": 2207 + }, + { + "epoch": 0.48, + "grad_norm": 0.3807729378451628, + "learning_rate": 2.1946182596921917e-05, + "loss": 0.0496, + "step": 2208 + }, + { + "epoch": 0.49, + "grad_norm": 0.45126173437400224, + "learning_rate": 2.1932021402277682e-05, + "loss": 0.0748, + "step": 2209 + }, + { + "epoch": 0.49, + "grad_norm": 0.4133062834630545, + "learning_rate": 2.1917859229825565e-05, + "loss": 0.0555, + "step": 2210 + }, + { + "epoch": 0.49, + "grad_norm": 0.41212271718488375, + "learning_rate": 2.1903696086733142e-05, + "loss": 0.0635, + "step": 2211 + }, + { + "epoch": 0.49, + "grad_norm": 0.36619925925389807, + "learning_rate": 2.188953198016846e-05, + "loss": 0.0457, + "step": 2212 + }, + { + "epoch": 0.49, + "grad_norm": 0.3598676704542792, + "learning_rate": 2.1875366917300057e-05, + "loss": 0.0431, + "step": 2213 + }, + { + "epoch": 0.49, + "grad_norm": 0.5540880806015249, + "learning_rate": 2.1861200905296952e-05, + "loss": 0.0816, + "step": 2214 + }, + { + "epoch": 0.49, + "grad_norm": 0.43048077962859, + "learning_rate": 2.1847033951328673e-05, + "loss": 0.0618, + "step": 2215 + }, + { + "epoch": 0.49, + "grad_norm": 0.3485028107336857, + "learning_rate": 2.1832866062565183e-05, + "loss": 0.0458, + "step": 2216 + }, + { + "epoch": 0.49, + "grad_norm": 0.45006451806515346, + "learning_rate": 2.1818697246176943e-05, + "loss": 0.0561, + "step": 2217 + }, + { + "epoch": 0.49, + "grad_norm": 0.414669886935835, + "learning_rate": 2.1804527509334875e-05, + "loss": 0.0639, + "step": 2218 + }, + { + "epoch": 0.49, + "grad_norm": 0.38722038486539806, + "learning_rate": 2.1790356859210378e-05, + "loss": 0.0507, + "step": 2219 + }, + { + "epoch": 0.49, + "grad_norm": 0.3876689642497525, + "learning_rate": 2.17761853029753e-05, + "loss": 0.0625, + "step": 2220 + }, + { + "epoch": 0.49, + "grad_norm": 0.3570540990550638, + "learning_rate": 2.176201284780195e-05, + "loss": 0.0452, + "step": 2221 + }, + { + "epoch": 0.49, + "grad_norm": 0.4030564116747038, + "learning_rate": 2.1747839500863096e-05, + "loss": 0.0664, + "step": 2222 + }, + { + "epoch": 0.49, + "grad_norm": 0.4450630526663112, + "learning_rate": 2.1733665269331953e-05, + "loss": 0.0569, + "step": 2223 + }, + { + "epoch": 0.49, + "grad_norm": 0.34117537533148296, + "learning_rate": 2.1719490160382196e-05, + "loss": 0.043, + "step": 2224 + }, + { + "epoch": 0.49, + "grad_norm": 0.43426080890571217, + "learning_rate": 2.1705314181187922e-05, + "loss": 0.057, + "step": 2225 + }, + { + "epoch": 0.49, + "grad_norm": 0.3532074486008631, + "learning_rate": 2.169113733892369e-05, + "loss": 0.0622, + "step": 2226 + }, + { + "epoch": 0.49, + "grad_norm": 0.5107080218410214, + "learning_rate": 2.1676959640764484e-05, + "loss": 0.0657, + "step": 2227 + }, + { + "epoch": 0.49, + "grad_norm": 0.37515124608793443, + "learning_rate": 2.166278109388572e-05, + "loss": 0.0508, + "step": 2228 + }, + { + "epoch": 0.49, + "grad_norm": 0.3836904306804695, + "learning_rate": 2.1648601705463263e-05, + "loss": 0.0482, + "step": 2229 + }, + { + "epoch": 0.49, + "grad_norm": 0.3251608294742402, + "learning_rate": 2.1634421482673368e-05, + "loss": 0.0361, + "step": 2230 + }, + { + "epoch": 0.49, + "grad_norm": 0.3571923357885689, + "learning_rate": 2.1620240432692737e-05, + "loss": 0.062, + "step": 2231 + }, + { + "epoch": 0.49, + "grad_norm": 0.44233513042496453, + "learning_rate": 2.1606058562698496e-05, + "loss": 0.0776, + "step": 2232 + }, + { + "epoch": 0.49, + "grad_norm": 0.44027822389523236, + "learning_rate": 2.1591875879868177e-05, + "loss": 0.0623, + "step": 2233 + }, + { + "epoch": 0.49, + "grad_norm": 0.4128785029628535, + "learning_rate": 2.157769239137971e-05, + "loss": 0.0465, + "step": 2234 + }, + { + "epoch": 0.49, + "grad_norm": 0.3351935421486037, + "learning_rate": 2.1563508104411457e-05, + "loss": 0.0516, + "step": 2235 + }, + { + "epoch": 0.49, + "grad_norm": 0.3493431292244364, + "learning_rate": 2.1549323026142168e-05, + "loss": 0.0478, + "step": 2236 + }, + { + "epoch": 0.49, + "grad_norm": 0.4243760676082615, + "learning_rate": 2.153513716375099e-05, + "loss": 0.0732, + "step": 2237 + }, + { + "epoch": 0.49, + "grad_norm": 0.3524577186976111, + "learning_rate": 2.1520950524417484e-05, + "loss": 0.0507, + "step": 2238 + }, + { + "epoch": 0.49, + "grad_norm": 0.4432719980591418, + "learning_rate": 2.1506763115321602e-05, + "loss": 0.0595, + "step": 2239 + }, + { + "epoch": 0.49, + "grad_norm": 0.4883926569683455, + "learning_rate": 2.1492574943643666e-05, + "loss": 0.0712, + "step": 2240 + }, + { + "epoch": 0.49, + "grad_norm": 0.41592459958861955, + "learning_rate": 2.1478386016564406e-05, + "loss": 0.0588, + "step": 2241 + }, + { + "epoch": 0.49, + "grad_norm": 0.3325416680808275, + "learning_rate": 2.1464196341264915e-05, + "loss": 0.0545, + "step": 2242 + }, + { + "epoch": 0.49, + "grad_norm": 0.3711408484837303, + "learning_rate": 2.145000592492668e-05, + "loss": 0.0559, + "step": 2243 + }, + { + "epoch": 0.49, + "grad_norm": 0.38396880621795976, + "learning_rate": 2.1435814774731557e-05, + "loss": 0.0454, + "step": 2244 + }, + { + "epoch": 0.49, + "grad_norm": 0.4628181078800962, + "learning_rate": 2.1421622897861777e-05, + "loss": 0.0704, + "step": 2245 + }, + { + "epoch": 0.49, + "grad_norm": 0.3477668479133176, + "learning_rate": 2.1407430301499934e-05, + "loss": 0.051, + "step": 2246 + }, + { + "epoch": 0.49, + "grad_norm": 0.4742206818542849, + "learning_rate": 2.139323699282899e-05, + "loss": 0.0551, + "step": 2247 + }, + { + "epoch": 0.49, + "grad_norm": 0.44695137678370234, + "learning_rate": 2.1379042979032256e-05, + "loss": 0.0703, + "step": 2248 + }, + { + "epoch": 0.49, + "grad_norm": 0.4000663002023182, + "learning_rate": 2.1364848267293424e-05, + "loss": 0.0516, + "step": 2249 + }, + { + "epoch": 0.49, + "grad_norm": 0.43719202509645305, + "learning_rate": 2.1350652864796513e-05, + "loss": 0.0638, + "step": 2250 + }, + { + "epoch": 0.49, + "grad_norm": 0.39235092344908234, + "learning_rate": 2.133645677872591e-05, + "loss": 0.0548, + "step": 2251 + }, + { + "epoch": 0.49, + "grad_norm": 0.3668704388739481, + "learning_rate": 2.1322260016266337e-05, + "loss": 0.0408, + "step": 2252 + }, + { + "epoch": 0.49, + "grad_norm": 0.3150238402367162, + "learning_rate": 2.1308062584602865e-05, + "loss": 0.041, + "step": 2253 + }, + { + "epoch": 0.5, + "grad_norm": 0.5245566289748852, + "learning_rate": 2.1293864490920897e-05, + "loss": 0.0728, + "step": 2254 + }, + { + "epoch": 0.5, + "grad_norm": 0.3644941243520977, + "learning_rate": 2.1279665742406187e-05, + "loss": 0.0399, + "step": 2255 + }, + { + "epoch": 0.5, + "grad_norm": 0.40779250084958746, + "learning_rate": 2.126546634624479e-05, + "loss": 0.0548, + "step": 2256 + }, + { + "epoch": 0.5, + "grad_norm": 0.3788580323047344, + "learning_rate": 2.125126630962312e-05, + "loss": 0.0603, + "step": 2257 + }, + { + "epoch": 0.5, + "grad_norm": 0.39045975050796067, + "learning_rate": 2.1237065639727906e-05, + "loss": 0.0525, + "step": 2258 + }, + { + "epoch": 0.5, + "grad_norm": 0.4137295517953831, + "learning_rate": 2.1222864343746185e-05, + "loss": 0.052, + "step": 2259 + }, + { + "epoch": 0.5, + "grad_norm": 0.4671759084073663, + "learning_rate": 2.1208662428865326e-05, + "loss": 0.0558, + "step": 2260 + }, + { + "epoch": 0.5, + "grad_norm": 0.3834009707770895, + "learning_rate": 2.1194459902272997e-05, + "loss": 0.0619, + "step": 2261 + }, + { + "epoch": 0.5, + "grad_norm": 0.33960149615028973, + "learning_rate": 2.1180256771157194e-05, + "loss": 0.0547, + "step": 2262 + }, + { + "epoch": 0.5, + "grad_norm": 0.3576002665943287, + "learning_rate": 2.1166053042706204e-05, + "loss": 0.0446, + "step": 2263 + }, + { + "epoch": 0.5, + "grad_norm": 0.35063333642538436, + "learning_rate": 2.115184872410862e-05, + "loss": 0.0528, + "step": 2264 + }, + { + "epoch": 0.5, + "grad_norm": 0.39879788200945676, + "learning_rate": 2.113764382255334e-05, + "loss": 0.0545, + "step": 2265 + }, + { + "epoch": 0.5, + "grad_norm": 0.5747178691662079, + "learning_rate": 2.1123438345229537e-05, + "loss": 0.0866, + "step": 2266 + }, + { + "epoch": 0.5, + "grad_norm": 0.572943588549069, + "learning_rate": 2.110923229932671e-05, + "loss": 0.0792, + "step": 2267 + }, + { + "epoch": 0.5, + "grad_norm": 0.3623014794385008, + "learning_rate": 2.1095025692034614e-05, + "loss": 0.0482, + "step": 2268 + }, + { + "epoch": 0.5, + "grad_norm": 0.5828839910191299, + "learning_rate": 2.1080818530543304e-05, + "loss": 0.0825, + "step": 2269 + }, + { + "epoch": 0.5, + "grad_norm": 0.478978707586908, + "learning_rate": 2.106661082204311e-05, + "loss": 0.0578, + "step": 2270 + }, + { + "epoch": 0.5, + "grad_norm": 0.36041172424779067, + "learning_rate": 2.105240257372464e-05, + "loss": 0.0578, + "step": 2271 + }, + { + "epoch": 0.5, + "grad_norm": 0.38219240140228306, + "learning_rate": 2.1038193792778775e-05, + "loss": 0.057, + "step": 2272 + }, + { + "epoch": 0.5, + "grad_norm": 0.36689056402462356, + "learning_rate": 2.102398448639667e-05, + "loss": 0.0555, + "step": 2273 + }, + { + "epoch": 0.5, + "grad_norm": 0.41942277567971076, + "learning_rate": 2.100977466176973e-05, + "loss": 0.0474, + "step": 2274 + }, + { + "epoch": 0.5, + "grad_norm": 0.4291063713877314, + "learning_rate": 2.099556432608965e-05, + "loss": 0.069, + "step": 2275 + }, + { + "epoch": 0.5, + "grad_norm": 0.32794329743943423, + "learning_rate": 2.0981353486548363e-05, + "loss": 0.0444, + "step": 2276 + }, + { + "epoch": 0.5, + "grad_norm": 0.4401056489035421, + "learning_rate": 2.096714215033806e-05, + "loss": 0.0674, + "step": 2277 + }, + { + "epoch": 0.5, + "grad_norm": 0.3810406766864367, + "learning_rate": 2.095293032465119e-05, + "loss": 0.0634, + "step": 2278 + }, + { + "epoch": 0.5, + "grad_norm": 0.42038816246072847, + "learning_rate": 2.0938718016680433e-05, + "loss": 0.069, + "step": 2279 + }, + { + "epoch": 0.5, + "grad_norm": 0.3479869212907461, + "learning_rate": 2.0924505233618734e-05, + "loss": 0.05, + "step": 2280 + }, + { + "epoch": 0.5, + "grad_norm": 0.3993746637811574, + "learning_rate": 2.0910291982659277e-05, + "loss": 0.06, + "step": 2281 + }, + { + "epoch": 0.5, + "grad_norm": 0.4005270748702094, + "learning_rate": 2.0896078270995463e-05, + "loss": 0.0489, + "step": 2282 + }, + { + "epoch": 0.5, + "grad_norm": 0.396981721331143, + "learning_rate": 2.0881864105820936e-05, + "loss": 0.0685, + "step": 2283 + }, + { + "epoch": 0.5, + "grad_norm": 0.36236271560600947, + "learning_rate": 2.0867649494329587e-05, + "loss": 0.0481, + "step": 2284 + }, + { + "epoch": 0.5, + "grad_norm": 0.3550763741797328, + "learning_rate": 2.085343444371551e-05, + "loss": 0.051, + "step": 2285 + }, + { + "epoch": 0.5, + "grad_norm": 0.3787923425353629, + "learning_rate": 2.083921896117303e-05, + "loss": 0.0529, + "step": 2286 + }, + { + "epoch": 0.5, + "grad_norm": 0.39855603717447186, + "learning_rate": 2.0825003053896686e-05, + "loss": 0.0469, + "step": 2287 + }, + { + "epoch": 0.5, + "grad_norm": 0.36050130735945973, + "learning_rate": 2.0810786729081237e-05, + "loss": 0.0589, + "step": 2288 + }, + { + "epoch": 0.5, + "grad_norm": 0.373236063972357, + "learning_rate": 2.079656999392166e-05, + "loss": 0.0405, + "step": 2289 + }, + { + "epoch": 0.5, + "grad_norm": 0.4681168094320828, + "learning_rate": 2.0782352855613128e-05, + "loss": 0.075, + "step": 2290 + }, + { + "epoch": 0.5, + "grad_norm": 0.4689835125713308, + "learning_rate": 2.0768135321351016e-05, + "loss": 0.0728, + "step": 2291 + }, + { + "epoch": 0.5, + "grad_norm": 0.331002402888966, + "learning_rate": 2.0753917398330902e-05, + "loss": 0.0493, + "step": 2292 + }, + { + "epoch": 0.5, + "grad_norm": 0.36350895298160313, + "learning_rate": 2.073969909374858e-05, + "loss": 0.0413, + "step": 2293 + }, + { + "epoch": 0.5, + "grad_norm": 0.43343697540317466, + "learning_rate": 2.0725480414800012e-05, + "loss": 0.0616, + "step": 2294 + }, + { + "epoch": 0.5, + "grad_norm": 0.3888972027018617, + "learning_rate": 2.0711261368681356e-05, + "loss": 0.0509, + "step": 2295 + }, + { + "epoch": 0.5, + "grad_norm": 0.3704147551794404, + "learning_rate": 2.069704196258896e-05, + "loss": 0.0412, + "step": 2296 + }, + { + "epoch": 0.5, + "grad_norm": 0.390403971128073, + "learning_rate": 2.068282220371936e-05, + "loss": 0.0567, + "step": 2297 + }, + { + "epoch": 0.5, + "grad_norm": 0.42857761960439805, + "learning_rate": 2.066860209926925e-05, + "loss": 0.0492, + "step": 2298 + }, + { + "epoch": 0.5, + "grad_norm": 0.4495829059886345, + "learning_rate": 2.0654381656435526e-05, + "loss": 0.0656, + "step": 2299 + }, + { + "epoch": 0.51, + "grad_norm": 0.314731332667737, + "learning_rate": 2.064016088241523e-05, + "loss": 0.0397, + "step": 2300 + }, + { + "epoch": 0.51, + "grad_norm": 0.4139443714850748, + "learning_rate": 2.0625939784405586e-05, + "loss": 0.0552, + "step": 2301 + }, + { + "epoch": 0.51, + "grad_norm": 0.4813590291607367, + "learning_rate": 2.0611718369603982e-05, + "loss": 0.0897, + "step": 2302 + }, + { + "epoch": 0.51, + "grad_norm": 0.4462453207203422, + "learning_rate": 2.0597496645207964e-05, + "loss": 0.0727, + "step": 2303 + }, + { + "epoch": 0.51, + "grad_norm": 0.4211409505176882, + "learning_rate": 2.0583274618415227e-05, + "loss": 0.0639, + "step": 2304 + }, + { + "epoch": 0.51, + "grad_norm": 0.35422693001649885, + "learning_rate": 2.056905229642363e-05, + "loss": 0.0498, + "step": 2305 + }, + { + "epoch": 0.51, + "grad_norm": 0.3386480188822363, + "learning_rate": 2.055482968643118e-05, + "loss": 0.0424, + "step": 2306 + }, + { + "epoch": 0.51, + "grad_norm": 0.4217757355504114, + "learning_rate": 2.0540606795636022e-05, + "loss": 0.0708, + "step": 2307 + }, + { + "epoch": 0.51, + "grad_norm": 0.38629292635355156, + "learning_rate": 2.0526383631236454e-05, + "loss": 0.0576, + "step": 2308 + }, + { + "epoch": 0.51, + "grad_norm": 0.3800503672611192, + "learning_rate": 2.0512160200430896e-05, + "loss": 0.0543, + "step": 2309 + }, + { + "epoch": 0.51, + "grad_norm": 0.3909405768720875, + "learning_rate": 2.0497936510417928e-05, + "loss": 0.065, + "step": 2310 + }, + { + "epoch": 0.51, + "grad_norm": 0.30695811315431687, + "learning_rate": 2.048371256839624e-05, + "loss": 0.0409, + "step": 2311 + }, + { + "epoch": 0.51, + "grad_norm": 0.45774239859423366, + "learning_rate": 2.046948838156465e-05, + "loss": 0.0712, + "step": 2312 + }, + { + "epoch": 0.51, + "grad_norm": 0.40685103045526805, + "learning_rate": 2.0455263957122113e-05, + "loss": 0.0633, + "step": 2313 + }, + { + "epoch": 0.51, + "grad_norm": 0.3732654071254558, + "learning_rate": 2.04410393022677e-05, + "loss": 0.0518, + "step": 2314 + }, + { + "epoch": 0.51, + "grad_norm": 0.4410804964372272, + "learning_rate": 2.0426814424200592e-05, + "loss": 0.0498, + "step": 2315 + }, + { + "epoch": 0.51, + "grad_norm": 0.48450435941465114, + "learning_rate": 2.041258933012009e-05, + "loss": 0.0722, + "step": 2316 + }, + { + "epoch": 0.51, + "grad_norm": 0.33728466935387436, + "learning_rate": 2.0398364027225593e-05, + "loss": 0.0481, + "step": 2317 + }, + { + "epoch": 0.51, + "grad_norm": 0.42652390832959736, + "learning_rate": 2.0384138522716626e-05, + "loss": 0.0631, + "step": 2318 + }, + { + "epoch": 0.51, + "grad_norm": 0.4007601451708101, + "learning_rate": 2.036991282379279e-05, + "loss": 0.0543, + "step": 2319 + }, + { + "epoch": 0.51, + "grad_norm": 0.3920582037638183, + "learning_rate": 2.0355686937653818e-05, + "loss": 0.048, + "step": 2320 + }, + { + "epoch": 0.51, + "grad_norm": 0.400337680944842, + "learning_rate": 2.03414608714995e-05, + "loss": 0.0511, + "step": 2321 + }, + { + "epoch": 0.51, + "grad_norm": 0.3276742041075169, + "learning_rate": 2.0327234632529738e-05, + "loss": 0.042, + "step": 2322 + }, + { + "epoch": 0.51, + "grad_norm": 0.38644815861691056, + "learning_rate": 2.0313008227944527e-05, + "loss": 0.0457, + "step": 2323 + }, + { + "epoch": 0.51, + "grad_norm": 0.42529913528824476, + "learning_rate": 2.029878166494393e-05, + "loss": 0.0631, + "step": 2324 + }, + { + "epoch": 0.51, + "grad_norm": 0.4331816871780329, + "learning_rate": 2.0284554950728106e-05, + "loss": 0.0625, + "step": 2325 + }, + { + "epoch": 0.51, + "grad_norm": 0.3881718279896845, + "learning_rate": 2.0270328092497266e-05, + "loss": 0.0331, + "step": 2326 + }, + { + "epoch": 0.51, + "grad_norm": 0.3789899674574855, + "learning_rate": 2.025610109745173e-05, + "loss": 0.064, + "step": 2327 + }, + { + "epoch": 0.51, + "grad_norm": 0.3858736417812594, + "learning_rate": 2.024187397279186e-05, + "loss": 0.0475, + "step": 2328 + }, + { + "epoch": 0.51, + "grad_norm": 0.36311207185275024, + "learning_rate": 2.0227646725718085e-05, + "loss": 0.0522, + "step": 2329 + }, + { + "epoch": 0.51, + "grad_norm": 0.42080811418689446, + "learning_rate": 2.021341936343091e-05, + "loss": 0.0669, + "step": 2330 + }, + { + "epoch": 0.51, + "grad_norm": 0.4367811223292118, + "learning_rate": 2.0199191893130893e-05, + "loss": 0.0642, + "step": 2331 + }, + { + "epoch": 0.51, + "grad_norm": 0.44304571946339333, + "learning_rate": 2.018496432201863e-05, + "loss": 0.0667, + "step": 2332 + }, + { + "epoch": 0.51, + "grad_norm": 0.39111846272706985, + "learning_rate": 2.017073665729479e-05, + "loss": 0.0576, + "step": 2333 + }, + { + "epoch": 0.51, + "grad_norm": 0.3756661458893297, + "learning_rate": 2.0156508906160083e-05, + "loss": 0.0558, + "step": 2334 + }, + { + "epoch": 0.51, + "grad_norm": 0.36196762405032007, + "learning_rate": 2.0142281075815253e-05, + "loss": 0.0538, + "step": 2335 + }, + { + "epoch": 0.51, + "grad_norm": 0.41753657449199966, + "learning_rate": 2.0128053173461105e-05, + "loss": 0.0533, + "step": 2336 + }, + { + "epoch": 0.51, + "grad_norm": 0.35695799445414533, + "learning_rate": 2.0113825206298458e-05, + "loss": 0.049, + "step": 2337 + }, + { + "epoch": 0.51, + "grad_norm": 0.43629295869942647, + "learning_rate": 2.009959718152818e-05, + "loss": 0.0629, + "step": 2338 + }, + { + "epoch": 0.51, + "grad_norm": 0.36733210536684613, + "learning_rate": 2.008536910635115e-05, + "loss": 0.0533, + "step": 2339 + }, + { + "epoch": 0.51, + "grad_norm": 0.34239426858627797, + "learning_rate": 2.00711409879683e-05, + "loss": 0.0422, + "step": 2340 + }, + { + "epoch": 0.51, + "grad_norm": 0.370170287497713, + "learning_rate": 2.0056912833580557e-05, + "loss": 0.0583, + "step": 2341 + }, + { + "epoch": 0.51, + "grad_norm": 0.37309516474357096, + "learning_rate": 2.0042684650388882e-05, + "loss": 0.0557, + "step": 2342 + }, + { + "epoch": 0.51, + "grad_norm": 0.3704571936199428, + "learning_rate": 2.0028456445594234e-05, + "loss": 0.0485, + "step": 2343 + }, + { + "epoch": 0.51, + "grad_norm": 0.5111271590786414, + "learning_rate": 2.0014228226397618e-05, + "loss": 0.0692, + "step": 2344 + }, + { + "epoch": 0.52, + "grad_norm": 0.408704732406459, + "learning_rate": 2e-05, + "loss": 0.0522, + "step": 2345 + }, + { + "epoch": 0.52, + "grad_norm": 0.4179396931182973, + "learning_rate": 1.998577177360239e-05, + "loss": 0.0465, + "step": 2346 + }, + { + "epoch": 0.52, + "grad_norm": 0.4131803652646927, + "learning_rate": 1.997154355440577e-05, + "loss": 0.0621, + "step": 2347 + }, + { + "epoch": 0.52, + "grad_norm": 0.5090762625579097, + "learning_rate": 1.995731534961113e-05, + "loss": 0.0812, + "step": 2348 + }, + { + "epoch": 0.52, + "grad_norm": 0.3399041878463311, + "learning_rate": 1.9943087166419453e-05, + "loss": 0.0424, + "step": 2349 + }, + { + "epoch": 0.52, + "grad_norm": 0.4759776482082502, + "learning_rate": 1.9928859012031703e-05, + "loss": 0.0664, + "step": 2350 + }, + { + "epoch": 0.52, + "grad_norm": 0.40435171446462115, + "learning_rate": 1.991463089364885e-05, + "loss": 0.0494, + "step": 2351 + }, + { + "epoch": 0.52, + "grad_norm": 0.34789546494594176, + "learning_rate": 1.9900402818471825e-05, + "loss": 0.0511, + "step": 2352 + }, + { + "epoch": 0.52, + "grad_norm": 0.33811444934171764, + "learning_rate": 1.9886174793701546e-05, + "loss": 0.0375, + "step": 2353 + }, + { + "epoch": 0.52, + "grad_norm": 0.44538547520252336, + "learning_rate": 1.98719468265389e-05, + "loss": 0.0686, + "step": 2354 + }, + { + "epoch": 0.52, + "grad_norm": 0.35413298259696435, + "learning_rate": 1.985771892418475e-05, + "loss": 0.0517, + "step": 2355 + }, + { + "epoch": 0.52, + "grad_norm": 0.3550469127739452, + "learning_rate": 1.9843491093839927e-05, + "loss": 0.0567, + "step": 2356 + }, + { + "epoch": 0.52, + "grad_norm": 0.39031416923953993, + "learning_rate": 1.982926334270522e-05, + "loss": 0.0485, + "step": 2357 + }, + { + "epoch": 0.52, + "grad_norm": 0.35131035345630074, + "learning_rate": 1.9815035677981378e-05, + "loss": 0.0486, + "step": 2358 + }, + { + "epoch": 0.52, + "grad_norm": 0.4054292677985494, + "learning_rate": 1.9800808106869117e-05, + "loss": 0.0615, + "step": 2359 + }, + { + "epoch": 0.52, + "grad_norm": 0.29001962872784276, + "learning_rate": 1.9786580636569092e-05, + "loss": 0.0312, + "step": 2360 + }, + { + "epoch": 0.52, + "grad_norm": 0.3415296109097626, + "learning_rate": 1.9772353274281918e-05, + "loss": 0.035, + "step": 2361 + }, + { + "epoch": 0.52, + "grad_norm": 0.30966855244264235, + "learning_rate": 1.9758126027208146e-05, + "loss": 0.031, + "step": 2362 + }, + { + "epoch": 0.52, + "grad_norm": 0.3744315965633129, + "learning_rate": 1.9743898902548273e-05, + "loss": 0.0589, + "step": 2363 + }, + { + "epoch": 0.52, + "grad_norm": 0.4780805205555588, + "learning_rate": 1.972967190750274e-05, + "loss": 0.0575, + "step": 2364 + }, + { + "epoch": 0.52, + "grad_norm": 0.4121418980786573, + "learning_rate": 1.9715445049271907e-05, + "loss": 0.0533, + "step": 2365 + }, + { + "epoch": 0.52, + "grad_norm": 0.46827083574793354, + "learning_rate": 1.9701218335056076e-05, + "loss": 0.0667, + "step": 2366 + }, + { + "epoch": 0.52, + "grad_norm": 0.3978848832437258, + "learning_rate": 1.9686991772055476e-05, + "loss": 0.0542, + "step": 2367 + }, + { + "epoch": 0.52, + "grad_norm": 0.39031993397702963, + "learning_rate": 1.9672765367470265e-05, + "loss": 0.0574, + "step": 2368 + }, + { + "epoch": 0.52, + "grad_norm": 0.3806681226158581, + "learning_rate": 1.9658539128500507e-05, + "loss": 0.0535, + "step": 2369 + }, + { + "epoch": 0.52, + "grad_norm": 0.5512545560791239, + "learning_rate": 1.964431306234619e-05, + "loss": 0.0692, + "step": 2370 + }, + { + "epoch": 0.52, + "grad_norm": 0.39036734666751816, + "learning_rate": 1.9630087176207212e-05, + "loss": 0.0645, + "step": 2371 + }, + { + "epoch": 0.52, + "grad_norm": 0.3950232657691014, + "learning_rate": 1.9615861477283384e-05, + "loss": 0.0426, + "step": 2372 + }, + { + "epoch": 0.52, + "grad_norm": 0.3738941337640487, + "learning_rate": 1.9601635972774414e-05, + "loss": 0.0541, + "step": 2373 + }, + { + "epoch": 0.52, + "grad_norm": 0.36719159874759005, + "learning_rate": 1.958741066987992e-05, + "loss": 0.061, + "step": 2374 + }, + { + "epoch": 0.52, + "grad_norm": 0.3758156370104676, + "learning_rate": 1.9573185575799414e-05, + "loss": 0.0546, + "step": 2375 + }, + { + "epoch": 0.52, + "grad_norm": 0.3452507119112734, + "learning_rate": 1.95589606977323e-05, + "loss": 0.0481, + "step": 2376 + }, + { + "epoch": 0.52, + "grad_norm": 0.36708523969916607, + "learning_rate": 1.9544736042877886e-05, + "loss": 0.0505, + "step": 2377 + }, + { + "epoch": 0.52, + "grad_norm": 0.35413298259696435, + "learning_rate": 1.9530511618435352e-05, + "loss": 0.0432, + "step": 2378 + }, + { + "epoch": 0.52, + "grad_norm": 0.35248344384197294, + "learning_rate": 1.9516287431603767e-05, + "loss": 0.0411, + "step": 2379 + }, + { + "epoch": 0.52, + "grad_norm": 0.41347060093390803, + "learning_rate": 1.950206348958208e-05, + "loss": 0.06, + "step": 2380 + }, + { + "epoch": 0.52, + "grad_norm": 0.3132393316635138, + "learning_rate": 1.948783979956911e-05, + "loss": 0.0455, + "step": 2381 + }, + { + "epoch": 0.52, + "grad_norm": 0.48297148065402884, + "learning_rate": 1.9473616368763556e-05, + "loss": 0.0713, + "step": 2382 + }, + { + "epoch": 0.52, + "grad_norm": 0.3520791601840363, + "learning_rate": 1.9459393204363988e-05, + "loss": 0.0344, + "step": 2383 + }, + { + "epoch": 0.52, + "grad_norm": 0.44459028405128037, + "learning_rate": 1.944517031356882e-05, + "loss": 0.0627, + "step": 2384 + }, + { + "epoch": 0.52, + "grad_norm": 0.3420051594032704, + "learning_rate": 1.9430947703576373e-05, + "loss": 0.0582, + "step": 2385 + }, + { + "epoch": 0.52, + "grad_norm": 0.3404259134894278, + "learning_rate": 1.9416725381584777e-05, + "loss": 0.0393, + "step": 2386 + }, + { + "epoch": 0.52, + "grad_norm": 0.289782799432669, + "learning_rate": 1.9402503354792043e-05, + "loss": 0.048, + "step": 2387 + }, + { + "epoch": 0.52, + "grad_norm": 0.3397732584839841, + "learning_rate": 1.938828163039602e-05, + "loss": 0.0401, + "step": 2388 + }, + { + "epoch": 0.52, + "grad_norm": 0.32293331451981655, + "learning_rate": 1.9374060215594417e-05, + "loss": 0.0371, + "step": 2389 + }, + { + "epoch": 0.52, + "grad_norm": 0.4402223424612593, + "learning_rate": 1.9359839117584775e-05, + "loss": 0.0523, + "step": 2390 + }, + { + "epoch": 0.53, + "grad_norm": 0.3906156538798913, + "learning_rate": 1.934561834356448e-05, + "loss": 0.0559, + "step": 2391 + }, + { + "epoch": 0.53, + "grad_norm": 0.4286245550140536, + "learning_rate": 1.9331397900730754e-05, + "loss": 0.0484, + "step": 2392 + }, + { + "epoch": 0.53, + "grad_norm": 0.3395809275399548, + "learning_rate": 1.9317177796280643e-05, + "loss": 0.0467, + "step": 2393 + }, + { + "epoch": 0.53, + "grad_norm": 0.3323436613670127, + "learning_rate": 1.930295803741104e-05, + "loss": 0.0462, + "step": 2394 + }, + { + "epoch": 0.53, + "grad_norm": 0.40455667008457696, + "learning_rate": 1.9288738631318648e-05, + "loss": 0.0459, + "step": 2395 + }, + { + "epoch": 0.53, + "grad_norm": 0.47881007574553464, + "learning_rate": 1.9274519585199995e-05, + "loss": 0.0668, + "step": 2396 + }, + { + "epoch": 0.53, + "grad_norm": 0.4284875239822807, + "learning_rate": 1.9260300906251422e-05, + "loss": 0.0618, + "step": 2397 + }, + { + "epoch": 0.53, + "grad_norm": 0.41261309532758883, + "learning_rate": 1.92460826016691e-05, + "loss": 0.0531, + "step": 2398 + }, + { + "epoch": 0.53, + "grad_norm": 0.3862499130890881, + "learning_rate": 1.9231864678648994e-05, + "loss": 0.0604, + "step": 2399 + }, + { + "epoch": 0.53, + "grad_norm": 0.4244319113266689, + "learning_rate": 1.9217647144386885e-05, + "loss": 0.0706, + "step": 2400 + }, + { + "epoch": 0.53, + "grad_norm": 0.38182235654596364, + "learning_rate": 1.9203430006078348e-05, + "loss": 0.0493, + "step": 2401 + }, + { + "epoch": 0.53, + "grad_norm": 0.3904403440806351, + "learning_rate": 1.918921327091876e-05, + "loss": 0.055, + "step": 2402 + }, + { + "epoch": 0.53, + "grad_norm": 0.3772309223455192, + "learning_rate": 1.9174996946103318e-05, + "loss": 0.0557, + "step": 2403 + }, + { + "epoch": 0.53, + "grad_norm": 0.3871718555457978, + "learning_rate": 1.9160781038826973e-05, + "loss": 0.044, + "step": 2404 + }, + { + "epoch": 0.53, + "grad_norm": 0.3547079727381328, + "learning_rate": 1.9146565556284492e-05, + "loss": 0.0448, + "step": 2405 + }, + { + "epoch": 0.53, + "grad_norm": 0.37406062645948895, + "learning_rate": 1.9132350505670416e-05, + "loss": 0.0505, + "step": 2406 + }, + { + "epoch": 0.53, + "grad_norm": 0.35909978030619516, + "learning_rate": 1.9118135894179067e-05, + "loss": 0.0537, + "step": 2407 + }, + { + "epoch": 0.53, + "grad_norm": 0.3432187613770574, + "learning_rate": 1.910392172900455e-05, + "loss": 0.0558, + "step": 2408 + }, + { + "epoch": 0.53, + "grad_norm": 0.36632735372906744, + "learning_rate": 1.9089708017340733e-05, + "loss": 0.0451, + "step": 2409 + }, + { + "epoch": 0.53, + "grad_norm": 0.37284915737457436, + "learning_rate": 1.9075494766381263e-05, + "loss": 0.0422, + "step": 2410 + }, + { + "epoch": 0.53, + "grad_norm": 0.3631603901305983, + "learning_rate": 1.906128198331957e-05, + "loss": 0.0448, + "step": 2411 + }, + { + "epoch": 0.53, + "grad_norm": 0.4002529372751417, + "learning_rate": 1.9047069675348816e-05, + "loss": 0.0488, + "step": 2412 + }, + { + "epoch": 0.53, + "grad_norm": 0.34655052250355045, + "learning_rate": 1.9032857849661942e-05, + "loss": 0.0377, + "step": 2413 + }, + { + "epoch": 0.53, + "grad_norm": 0.392345625319532, + "learning_rate": 1.901864651345164e-05, + "loss": 0.0498, + "step": 2414 + }, + { + "epoch": 0.53, + "grad_norm": 0.37201830482951675, + "learning_rate": 1.9004435673910356e-05, + "loss": 0.0472, + "step": 2415 + }, + { + "epoch": 0.53, + "grad_norm": 0.34400427690331264, + "learning_rate": 1.8990225338230276e-05, + "loss": 0.0372, + "step": 2416 + }, + { + "epoch": 0.53, + "grad_norm": 0.34003686350944584, + "learning_rate": 1.8976015513603344e-05, + "loss": 0.0426, + "step": 2417 + }, + { + "epoch": 0.53, + "grad_norm": 0.4402325478679173, + "learning_rate": 1.8961806207221235e-05, + "loss": 0.0523, + "step": 2418 + }, + { + "epoch": 0.53, + "grad_norm": 0.377458855742547, + "learning_rate": 1.8947597426275368e-05, + "loss": 0.0394, + "step": 2419 + }, + { + "epoch": 0.53, + "grad_norm": 0.3755874006904432, + "learning_rate": 1.8933389177956896e-05, + "loss": 0.0408, + "step": 2420 + }, + { + "epoch": 0.53, + "grad_norm": 0.4364706771337179, + "learning_rate": 1.8919181469456703e-05, + "loss": 0.0583, + "step": 2421 + }, + { + "epoch": 0.53, + "grad_norm": 0.3461203132234935, + "learning_rate": 1.8904974307965393e-05, + "loss": 0.0426, + "step": 2422 + }, + { + "epoch": 0.53, + "grad_norm": 0.34278681584956155, + "learning_rate": 1.8890767700673296e-05, + "loss": 0.0375, + "step": 2423 + }, + { + "epoch": 0.53, + "grad_norm": 0.38381606533579277, + "learning_rate": 1.8876561654770466e-05, + "loss": 0.0504, + "step": 2424 + }, + { + "epoch": 0.53, + "grad_norm": 0.31069624805201673, + "learning_rate": 1.8862356177446667e-05, + "loss": 0.0406, + "step": 2425 + }, + { + "epoch": 0.53, + "grad_norm": 0.37474832434021943, + "learning_rate": 1.8848151275891383e-05, + "loss": 0.0469, + "step": 2426 + }, + { + "epoch": 0.53, + "grad_norm": 0.3399929508362034, + "learning_rate": 1.8833946957293796e-05, + "loss": 0.035, + "step": 2427 + }, + { + "epoch": 0.53, + "grad_norm": 0.29861208018557206, + "learning_rate": 1.8819743228842806e-05, + "loss": 0.0445, + "step": 2428 + }, + { + "epoch": 0.53, + "grad_norm": 0.3823212371359132, + "learning_rate": 1.8805540097727003e-05, + "loss": 0.0523, + "step": 2429 + }, + { + "epoch": 0.53, + "grad_norm": 0.3661808052382058, + "learning_rate": 1.8791337571134677e-05, + "loss": 0.0437, + "step": 2430 + }, + { + "epoch": 0.53, + "grad_norm": 0.43966059555670706, + "learning_rate": 1.877713565625382e-05, + "loss": 0.0717, + "step": 2431 + }, + { + "epoch": 0.53, + "grad_norm": 0.42020291527145803, + "learning_rate": 1.8762934360272097e-05, + "loss": 0.0399, + "step": 2432 + }, + { + "epoch": 0.53, + "grad_norm": 0.33925193469677256, + "learning_rate": 1.8748733690376883e-05, + "loss": 0.0396, + "step": 2433 + }, + { + "epoch": 0.53, + "grad_norm": 0.3561620344345107, + "learning_rate": 1.8734533653755216e-05, + "loss": 0.0548, + "step": 2434 + }, + { + "epoch": 0.53, + "grad_norm": 0.3905977048874348, + "learning_rate": 1.8720334257593826e-05, + "loss": 0.0555, + "step": 2435 + }, + { + "epoch": 0.54, + "grad_norm": 0.3188506005050462, + "learning_rate": 1.8706135509079103e-05, + "loss": 0.0429, + "step": 2436 + }, + { + "epoch": 0.54, + "grad_norm": 0.29873289198951014, + "learning_rate": 1.869193741539714e-05, + "loss": 0.0397, + "step": 2437 + }, + { + "epoch": 0.54, + "grad_norm": 0.33672384891056945, + "learning_rate": 1.8677739983733666e-05, + "loss": 0.0382, + "step": 2438 + }, + { + "epoch": 0.54, + "grad_norm": 0.38256970315878586, + "learning_rate": 1.8663543221274096e-05, + "loss": 0.0551, + "step": 2439 + }, + { + "epoch": 0.54, + "grad_norm": 0.357453475138913, + "learning_rate": 1.8649347135203494e-05, + "loss": 0.0482, + "step": 2440 + }, + { + "epoch": 0.54, + "grad_norm": 0.41836751650342313, + "learning_rate": 1.8635151732706586e-05, + "loss": 0.0528, + "step": 2441 + }, + { + "epoch": 0.54, + "grad_norm": 0.45123432602435154, + "learning_rate": 1.862095702096775e-05, + "loss": 0.0814, + "step": 2442 + }, + { + "epoch": 0.54, + "grad_norm": 0.3612207407800378, + "learning_rate": 1.860676300717102e-05, + "loss": 0.0443, + "step": 2443 + }, + { + "epoch": 0.54, + "grad_norm": 0.38921682215656894, + "learning_rate": 1.8592569698500076e-05, + "loss": 0.0546, + "step": 2444 + }, + { + "epoch": 0.54, + "grad_norm": 0.4375983876905605, + "learning_rate": 1.8578377102138223e-05, + "loss": 0.0591, + "step": 2445 + }, + { + "epoch": 0.54, + "grad_norm": 0.3719401094194151, + "learning_rate": 1.8564185225268446e-05, + "loss": 0.0482, + "step": 2446 + }, + { + "epoch": 0.54, + "grad_norm": 0.3118690319675185, + "learning_rate": 1.8549994075073327e-05, + "loss": 0.0492, + "step": 2447 + }, + { + "epoch": 0.54, + "grad_norm": 0.30169929010979746, + "learning_rate": 1.853580365873509e-05, + "loss": 0.035, + "step": 2448 + }, + { + "epoch": 0.54, + "grad_norm": 0.31670811464494275, + "learning_rate": 1.8521613983435604e-05, + "loss": 0.0383, + "step": 2449 + }, + { + "epoch": 0.54, + "grad_norm": 0.39067542704295666, + "learning_rate": 1.8507425056356338e-05, + "loss": 0.0519, + "step": 2450 + }, + { + "epoch": 0.54, + "grad_norm": 0.4427692334043251, + "learning_rate": 1.8493236884678405e-05, + "loss": 0.0681, + "step": 2451 + }, + { + "epoch": 0.54, + "grad_norm": 0.33943585877574, + "learning_rate": 1.847904947558252e-05, + "loss": 0.0401, + "step": 2452 + }, + { + "epoch": 0.54, + "grad_norm": 0.3351146467659251, + "learning_rate": 1.8464862836249014e-05, + "loss": 0.0492, + "step": 2453 + }, + { + "epoch": 0.54, + "grad_norm": 0.2713231355761402, + "learning_rate": 1.8450676973857842e-05, + "loss": 0.0354, + "step": 2454 + }, + { + "epoch": 0.54, + "grad_norm": 0.35652747056555917, + "learning_rate": 1.843649189558855e-05, + "loss": 0.0473, + "step": 2455 + }, + { + "epoch": 0.54, + "grad_norm": 0.37071698788929364, + "learning_rate": 1.8422307608620292e-05, + "loss": 0.047, + "step": 2456 + }, + { + "epoch": 0.54, + "grad_norm": 0.3180103963623487, + "learning_rate": 1.840812412013183e-05, + "loss": 0.0448, + "step": 2457 + }, + { + "epoch": 0.54, + "grad_norm": 0.371256983142258, + "learning_rate": 1.8393941437301507e-05, + "loss": 0.0463, + "step": 2458 + }, + { + "epoch": 0.54, + "grad_norm": 0.3359779843620849, + "learning_rate": 1.8379759567307266e-05, + "loss": 0.049, + "step": 2459 + }, + { + "epoch": 0.54, + "grad_norm": 0.3967052464630342, + "learning_rate": 1.8365578517326642e-05, + "loss": 0.0514, + "step": 2460 + }, + { + "epoch": 0.54, + "grad_norm": 0.35972747931859106, + "learning_rate": 1.8351398294536747e-05, + "loss": 0.0389, + "step": 2461 + }, + { + "epoch": 0.54, + "grad_norm": 0.4577699218323339, + "learning_rate": 1.833721890611428e-05, + "loss": 0.0621, + "step": 2462 + }, + { + "epoch": 0.54, + "grad_norm": 0.4291514086865876, + "learning_rate": 1.832304035923552e-05, + "loss": 0.0661, + "step": 2463 + }, + { + "epoch": 0.54, + "grad_norm": 0.43516989503352316, + "learning_rate": 1.8308862661076313e-05, + "loss": 0.0589, + "step": 2464 + }, + { + "epoch": 0.54, + "grad_norm": 0.38741815302530797, + "learning_rate": 1.829468581881208e-05, + "loss": 0.0564, + "step": 2465 + }, + { + "epoch": 0.54, + "grad_norm": 0.37852651700468604, + "learning_rate": 1.8280509839617814e-05, + "loss": 0.0466, + "step": 2466 + }, + { + "epoch": 0.54, + "grad_norm": 0.32719829805658424, + "learning_rate": 1.8266334730668054e-05, + "loss": 0.0385, + "step": 2467 + }, + { + "epoch": 0.54, + "grad_norm": 0.40737122086407546, + "learning_rate": 1.8252160499136914e-05, + "loss": 0.051, + "step": 2468 + }, + { + "epoch": 0.54, + "grad_norm": 0.36836061366207723, + "learning_rate": 1.8237987152198063e-05, + "loss": 0.0511, + "step": 2469 + }, + { + "epoch": 0.54, + "grad_norm": 0.4453552292518177, + "learning_rate": 1.822381469702471e-05, + "loss": 0.0661, + "step": 2470 + }, + { + "epoch": 0.54, + "grad_norm": 0.32987103935269346, + "learning_rate": 1.8209643140789622e-05, + "loss": 0.0463, + "step": 2471 + }, + { + "epoch": 0.54, + "grad_norm": 0.4150262987754753, + "learning_rate": 1.8195472490665125e-05, + "loss": 0.0531, + "step": 2472 + }, + { + "epoch": 0.54, + "grad_norm": 0.4306926761039924, + "learning_rate": 1.8181302753823064e-05, + "loss": 0.064, + "step": 2473 + }, + { + "epoch": 0.54, + "grad_norm": 0.39084008970839546, + "learning_rate": 1.8167133937434823e-05, + "loss": 0.0502, + "step": 2474 + }, + { + "epoch": 0.54, + "grad_norm": 0.5909688967670853, + "learning_rate": 1.8152966048671334e-05, + "loss": 0.0722, + "step": 2475 + }, + { + "epoch": 0.54, + "grad_norm": 0.38884734980182967, + "learning_rate": 1.813879909470305e-05, + "loss": 0.0555, + "step": 2476 + }, + { + "epoch": 0.54, + "grad_norm": 0.37595676518621624, + "learning_rate": 1.8124633082699956e-05, + "loss": 0.0502, + "step": 2477 + }, + { + "epoch": 0.54, + "grad_norm": 0.474914844935941, + "learning_rate": 1.8110468019831553e-05, + "loss": 0.0721, + "step": 2478 + }, + { + "epoch": 0.54, + "grad_norm": 0.3843573907369612, + "learning_rate": 1.8096303913266864e-05, + "loss": 0.0756, + "step": 2479 + }, + { + "epoch": 0.54, + "grad_norm": 0.40612778292424834, + "learning_rate": 1.808214077017444e-05, + "loss": 0.0581, + "step": 2480 + }, + { + "epoch": 0.54, + "grad_norm": 0.3720001394825335, + "learning_rate": 1.8067978597722325e-05, + "loss": 0.0549, + "step": 2481 + }, + { + "epoch": 0.55, + "grad_norm": 0.3975497809613756, + "learning_rate": 1.8053817403078087e-05, + "loss": 0.0665, + "step": 2482 + }, + { + "epoch": 0.55, + "grad_norm": 0.3122717977815116, + "learning_rate": 1.8039657193408788e-05, + "loss": 0.043, + "step": 2483 + }, + { + "epoch": 0.55, + "grad_norm": 0.40600406098096514, + "learning_rate": 1.8025497975881004e-05, + "loss": 0.0543, + "step": 2484 + }, + { + "epoch": 0.55, + "grad_norm": 0.300010101327112, + "learning_rate": 1.8011339757660798e-05, + "loss": 0.0365, + "step": 2485 + }, + { + "epoch": 0.55, + "grad_norm": 0.3967849268788347, + "learning_rate": 1.7997182545913732e-05, + "loss": 0.0409, + "step": 2486 + }, + { + "epoch": 0.55, + "grad_norm": 0.29865259737449684, + "learning_rate": 1.798302634780486e-05, + "loss": 0.0439, + "step": 2487 + }, + { + "epoch": 0.55, + "grad_norm": 0.3211444888899818, + "learning_rate": 1.796887117049871e-05, + "loss": 0.0387, + "step": 2488 + }, + { + "epoch": 0.55, + "grad_norm": 0.3812861472331901, + "learning_rate": 1.7954717021159316e-05, + "loss": 0.055, + "step": 2489 + }, + { + "epoch": 0.55, + "grad_norm": 0.3748673760814513, + "learning_rate": 1.7940563906950175e-05, + "loss": 0.0447, + "step": 2490 + }, + { + "epoch": 0.55, + "grad_norm": 0.3460907783122389, + "learning_rate": 1.7926411835034267e-05, + "loss": 0.0545, + "step": 2491 + }, + { + "epoch": 0.55, + "grad_norm": 0.41721838147453766, + "learning_rate": 1.791226081257404e-05, + "loss": 0.0578, + "step": 2492 + }, + { + "epoch": 0.55, + "grad_norm": 0.3697827924639646, + "learning_rate": 1.7898110846731415e-05, + "loss": 0.0587, + "step": 2493 + }, + { + "epoch": 0.55, + "grad_norm": 0.4420947385348038, + "learning_rate": 1.7883961944667772e-05, + "loss": 0.0588, + "step": 2494 + }, + { + "epoch": 0.55, + "grad_norm": 0.33188920067903177, + "learning_rate": 1.786981411354396e-05, + "loss": 0.0486, + "step": 2495 + }, + { + "epoch": 0.55, + "grad_norm": 0.40870752154700785, + "learning_rate": 1.7855667360520277e-05, + "loss": 0.0512, + "step": 2496 + }, + { + "epoch": 0.55, + "grad_norm": 0.35365349625987297, + "learning_rate": 1.7841521692756497e-05, + "loss": 0.05, + "step": 2497 + }, + { + "epoch": 0.55, + "grad_norm": 0.3862683148655958, + "learning_rate": 1.782737711741182e-05, + "loss": 0.0457, + "step": 2498 + }, + { + "epoch": 0.55, + "grad_norm": 0.34301279953855635, + "learning_rate": 1.7813233641644904e-05, + "loss": 0.0419, + "step": 2499 + }, + { + "epoch": 0.55, + "grad_norm": 0.326006270529698, + "learning_rate": 1.7799091272613843e-05, + "loss": 0.0404, + "step": 2500 + }, + { + "epoch": 0.55, + "grad_norm": 0.38251507162182874, + "learning_rate": 1.778495001747618e-05, + "loss": 0.0534, + "step": 2501 + }, + { + "epoch": 0.55, + "grad_norm": 0.3813334717597466, + "learning_rate": 1.7770809883388896e-05, + "loss": 0.044, + "step": 2502 + }, + { + "epoch": 0.55, + "grad_norm": 0.39002479217662644, + "learning_rate": 1.775667087750839e-05, + "loss": 0.0435, + "step": 2503 + }, + { + "epoch": 0.55, + "grad_norm": 0.3337172281948741, + "learning_rate": 1.774253300699051e-05, + "loss": 0.0428, + "step": 2504 + }, + { + "epoch": 0.55, + "grad_norm": 0.36579148748737944, + "learning_rate": 1.77283962789905e-05, + "loss": 0.0432, + "step": 2505 + }, + { + "epoch": 0.55, + "grad_norm": 0.3800404866196099, + "learning_rate": 1.771426070066307e-05, + "loss": 0.0534, + "step": 2506 + }, + { + "epoch": 0.55, + "grad_norm": 0.35188721389983824, + "learning_rate": 1.770012627916231e-05, + "loss": 0.0443, + "step": 2507 + }, + { + "epoch": 0.55, + "grad_norm": 0.35454603054259937, + "learning_rate": 1.768599302164174e-05, + "loss": 0.0421, + "step": 2508 + }, + { + "epoch": 0.55, + "grad_norm": 0.37317982646238296, + "learning_rate": 1.7671860935254285e-05, + "loss": 0.0587, + "step": 2509 + }, + { + "epoch": 0.55, + "grad_norm": 0.2835275173347284, + "learning_rate": 1.7657730027152286e-05, + "loss": 0.036, + "step": 2510 + }, + { + "epoch": 0.55, + "grad_norm": 0.35067588481115647, + "learning_rate": 1.7643600304487475e-05, + "loss": 0.042, + "step": 2511 + }, + { + "epoch": 0.55, + "grad_norm": 0.3442711239562008, + "learning_rate": 1.7629471774410997e-05, + "loss": 0.0367, + "step": 2512 + }, + { + "epoch": 0.55, + "grad_norm": 0.3680934484137127, + "learning_rate": 1.7615344444073385e-05, + "loss": 0.0415, + "step": 2513 + }, + { + "epoch": 0.55, + "grad_norm": 0.35494281301469954, + "learning_rate": 1.7601218320624562e-05, + "loss": 0.047, + "step": 2514 + }, + { + "epoch": 0.55, + "grad_norm": 0.41312054788957775, + "learning_rate": 1.7587093411213856e-05, + "loss": 0.0511, + "step": 2515 + }, + { + "epoch": 0.55, + "grad_norm": 0.32683657684756035, + "learning_rate": 1.7572969722989967e-05, + "loss": 0.0441, + "step": 2516 + }, + { + "epoch": 0.55, + "grad_norm": 0.4399027579856521, + "learning_rate": 1.755884726310098e-05, + "loss": 0.062, + "step": 2517 + }, + { + "epoch": 0.55, + "grad_norm": 0.4171422826714955, + "learning_rate": 1.754472603869436e-05, + "loss": 0.0502, + "step": 2518 + }, + { + "epoch": 0.55, + "grad_norm": 0.38591179318083907, + "learning_rate": 1.7530606056916935e-05, + "loss": 0.0452, + "step": 2519 + }, + { + "epoch": 0.55, + "grad_norm": 0.32631634465151743, + "learning_rate": 1.751648732491493e-05, + "loss": 0.046, + "step": 2520 + }, + { + "epoch": 0.55, + "grad_norm": 0.3409260255227103, + "learning_rate": 1.7502369849833908e-05, + "loss": 0.0461, + "step": 2521 + }, + { + "epoch": 0.55, + "grad_norm": 0.3786171075383073, + "learning_rate": 1.748825363881881e-05, + "loss": 0.046, + "step": 2522 + }, + { + "epoch": 0.55, + "grad_norm": 0.35722992125319664, + "learning_rate": 1.7474138699013953e-05, + "loss": 0.0563, + "step": 2523 + }, + { + "epoch": 0.55, + "grad_norm": 0.298713450151254, + "learning_rate": 1.746002503756298e-05, + "loss": 0.0374, + "step": 2524 + }, + { + "epoch": 0.55, + "grad_norm": 0.30910242384037684, + "learning_rate": 1.7445912661608912e-05, + "loss": 0.0407, + "step": 2525 + }, + { + "epoch": 0.55, + "grad_norm": 0.46861530593996237, + "learning_rate": 1.7431801578294097e-05, + "loss": 0.0517, + "step": 2526 + }, + { + "epoch": 0.56, + "grad_norm": 0.394621111767584, + "learning_rate": 1.7417691794760247e-05, + "loss": 0.0524, + "step": 2527 + }, + { + "epoch": 0.56, + "grad_norm": 0.3686938129873323, + "learning_rate": 1.740358331814841e-05, + "loss": 0.0452, + "step": 2528 + }, + { + "epoch": 0.56, + "grad_norm": 0.4074832829749992, + "learning_rate": 1.7389476155598974e-05, + "loss": 0.0514, + "step": 2529 + }, + { + "epoch": 0.56, + "grad_norm": 0.342856394410026, + "learning_rate": 1.7375370314251657e-05, + "loss": 0.0429, + "step": 2530 + }, + { + "epoch": 0.56, + "grad_norm": 0.3690525333308805, + "learning_rate": 1.7361265801245504e-05, + "loss": 0.0548, + "step": 2531 + }, + { + "epoch": 0.56, + "grad_norm": 0.37886233911028505, + "learning_rate": 1.7347162623718913e-05, + "loss": 0.0615, + "step": 2532 + }, + { + "epoch": 0.56, + "grad_norm": 0.35522498691184834, + "learning_rate": 1.7333060788809582e-05, + "loss": 0.0384, + "step": 2533 + }, + { + "epoch": 0.56, + "grad_norm": 0.35634166307916276, + "learning_rate": 1.7318960303654534e-05, + "loss": 0.0404, + "step": 2534 + }, + { + "epoch": 0.56, + "grad_norm": 0.4526984574928489, + "learning_rate": 1.7304861175390112e-05, + "loss": 0.0644, + "step": 2535 + }, + { + "epoch": 0.56, + "grad_norm": 0.3635237509931962, + "learning_rate": 1.729076341115197e-05, + "loss": 0.0689, + "step": 2536 + }, + { + "epoch": 0.56, + "grad_norm": 0.3394390414952252, + "learning_rate": 1.7276667018075073e-05, + "loss": 0.0521, + "step": 2537 + }, + { + "epoch": 0.56, + "grad_norm": 0.358893528049773, + "learning_rate": 1.726257200329369e-05, + "loss": 0.0439, + "step": 2538 + }, + { + "epoch": 0.56, + "grad_norm": 0.3475280696866417, + "learning_rate": 1.72484783739414e-05, + "loss": 0.0516, + "step": 2539 + }, + { + "epoch": 0.56, + "grad_norm": 0.31888040371514104, + "learning_rate": 1.7234386137151067e-05, + "loss": 0.0389, + "step": 2540 + }, + { + "epoch": 0.56, + "grad_norm": 0.3433524563974395, + "learning_rate": 1.7220295300054867e-05, + "loss": 0.0416, + "step": 2541 + }, + { + "epoch": 0.56, + "grad_norm": 0.3505328039852608, + "learning_rate": 1.7206205869784254e-05, + "loss": 0.0434, + "step": 2542 + }, + { + "epoch": 0.56, + "grad_norm": 0.31793390406469835, + "learning_rate": 1.719211785346998e-05, + "loss": 0.0378, + "step": 2543 + }, + { + "epoch": 0.56, + "grad_norm": 0.3443520648747606, + "learning_rate": 1.717803125824207e-05, + "loss": 0.0513, + "step": 2544 + }, + { + "epoch": 0.56, + "grad_norm": 0.328554734885512, + "learning_rate": 1.716394609122984e-05, + "loss": 0.0448, + "step": 2545 + }, + { + "epoch": 0.56, + "grad_norm": 0.2885518589813259, + "learning_rate": 1.714986235956188e-05, + "loss": 0.0401, + "step": 2546 + }, + { + "epoch": 0.56, + "grad_norm": 0.4041937728850449, + "learning_rate": 1.713578007036605e-05, + "loss": 0.0582, + "step": 2547 + }, + { + "epoch": 0.56, + "grad_norm": 0.39700011361334303, + "learning_rate": 1.712169923076948e-05, + "loss": 0.058, + "step": 2548 + }, + { + "epoch": 0.56, + "grad_norm": 0.29387935265903975, + "learning_rate": 1.710761984789858e-05, + "loss": 0.0382, + "step": 2549 + }, + { + "epoch": 0.56, + "grad_norm": 0.503105355110246, + "learning_rate": 1.7093541928879004e-05, + "loss": 0.0946, + "step": 2550 + }, + { + "epoch": 0.56, + "grad_norm": 0.33110208087717735, + "learning_rate": 1.7079465480835677e-05, + "loss": 0.0399, + "step": 2551 + }, + { + "epoch": 0.56, + "grad_norm": 0.26865055107203595, + "learning_rate": 1.7065390510892767e-05, + "loss": 0.0423, + "step": 2552 + }, + { + "epoch": 0.56, + "grad_norm": 0.4193389039629841, + "learning_rate": 1.7051317026173715e-05, + "loss": 0.0548, + "step": 2553 + }, + { + "epoch": 0.56, + "grad_norm": 0.3646464384051765, + "learning_rate": 1.703724503380119e-05, + "loss": 0.0496, + "step": 2554 + }, + { + "epoch": 0.56, + "grad_norm": 0.328030254672491, + "learning_rate": 1.7023174540897112e-05, + "loss": 0.0367, + "step": 2555 + }, + { + "epoch": 0.56, + "grad_norm": 0.37608506259623486, + "learning_rate": 1.7009105554582652e-05, + "loss": 0.0529, + "step": 2556 + }, + { + "epoch": 0.56, + "grad_norm": 0.3777206197480758, + "learning_rate": 1.6995038081978193e-05, + "loss": 0.0584, + "step": 2557 + }, + { + "epoch": 0.56, + "grad_norm": 0.30499260860226335, + "learning_rate": 1.6980972130203396e-05, + "loss": 0.0367, + "step": 2558 + }, + { + "epoch": 0.56, + "grad_norm": 0.3273897106889949, + "learning_rate": 1.6966907706377103e-05, + "loss": 0.0396, + "step": 2559 + }, + { + "epoch": 0.56, + "grad_norm": 0.3536332498520532, + "learning_rate": 1.695284481761742e-05, + "loss": 0.0504, + "step": 2560 + }, + { + "epoch": 0.56, + "grad_norm": 0.388388218718895, + "learning_rate": 1.6938783471041647e-05, + "loss": 0.0537, + "step": 2561 + }, + { + "epoch": 0.56, + "grad_norm": 0.2954232205110963, + "learning_rate": 1.692472367376633e-05, + "loss": 0.039, + "step": 2562 + }, + { + "epoch": 0.56, + "grad_norm": 0.464982372739429, + "learning_rate": 1.691066543290721e-05, + "loss": 0.0805, + "step": 2563 + }, + { + "epoch": 0.56, + "grad_norm": 0.27277565284516625, + "learning_rate": 1.6896608755579256e-05, + "loss": 0.0413, + "step": 2564 + }, + { + "epoch": 0.56, + "grad_norm": 0.35453117303355675, + "learning_rate": 1.6882553648896625e-05, + "loss": 0.0425, + "step": 2565 + }, + { + "epoch": 0.56, + "grad_norm": 0.27941979857681537, + "learning_rate": 1.686850011997271e-05, + "loss": 0.0342, + "step": 2566 + }, + { + "epoch": 0.56, + "grad_norm": 0.3715773793955695, + "learning_rate": 1.685444817592008e-05, + "loss": 0.0618, + "step": 2567 + }, + { + "epoch": 0.56, + "grad_norm": 0.373749675112682, + "learning_rate": 1.6840397823850513e-05, + "loss": 0.0532, + "step": 2568 + }, + { + "epoch": 0.56, + "grad_norm": 0.3291121008569529, + "learning_rate": 1.6826349070874973e-05, + "loss": 0.0406, + "step": 2569 + }, + { + "epoch": 0.56, + "grad_norm": 0.35252667772513235, + "learning_rate": 1.6812301924103626e-05, + "loss": 0.0529, + "step": 2570 + }, + { + "epoch": 0.56, + "grad_norm": 0.3799685501586278, + "learning_rate": 1.6798256390645816e-05, + "loss": 0.0451, + "step": 2571 + }, + { + "epoch": 0.56, + "grad_norm": 0.35622731688714665, + "learning_rate": 1.6784212477610075e-05, + "loss": 0.0442, + "step": 2572 + }, + { + "epoch": 0.57, + "grad_norm": 0.3117283831568278, + "learning_rate": 1.6770170192104107e-05, + "loss": 0.0331, + "step": 2573 + }, + { + "epoch": 0.57, + "grad_norm": 0.32447007790578225, + "learning_rate": 1.67561295412348e-05, + "loss": 0.0452, + "step": 2574 + }, + { + "epoch": 0.57, + "grad_norm": 0.3309813336600599, + "learning_rate": 1.6742090532108228e-05, + "loss": 0.0392, + "step": 2575 + }, + { + "epoch": 0.57, + "grad_norm": 0.40694373758744323, + "learning_rate": 1.6728053171829603e-05, + "loss": 0.0606, + "step": 2576 + }, + { + "epoch": 0.57, + "grad_norm": 0.5453939819669776, + "learning_rate": 1.6714017467503328e-05, + "loss": 0.1091, + "step": 2577 + }, + { + "epoch": 0.57, + "grad_norm": 0.4217280732488526, + "learning_rate": 1.6699983426232955e-05, + "loss": 0.048, + "step": 2578 + }, + { + "epoch": 0.57, + "grad_norm": 0.3787519396364005, + "learning_rate": 1.6685951055121203e-05, + "loss": 0.0446, + "step": 2579 + }, + { + "epoch": 0.57, + "grad_norm": 0.335666791384736, + "learning_rate": 1.667192036126993e-05, + "loss": 0.047, + "step": 2580 + }, + { + "epoch": 0.57, + "grad_norm": 0.26844197075243886, + "learning_rate": 1.665789135178017e-05, + "loss": 0.0343, + "step": 2581 + }, + { + "epoch": 0.57, + "grad_norm": 0.36392243519255474, + "learning_rate": 1.664386403375208e-05, + "loss": 0.0453, + "step": 2582 + }, + { + "epoch": 0.57, + "grad_norm": 0.4301809598543906, + "learning_rate": 1.6629838414284972e-05, + "loss": 0.0472, + "step": 2583 + }, + { + "epoch": 0.57, + "grad_norm": 0.3268916702396021, + "learning_rate": 1.6615814500477307e-05, + "loss": 0.0423, + "step": 2584 + }, + { + "epoch": 0.57, + "grad_norm": 0.3298473680154127, + "learning_rate": 1.6601792299426668e-05, + "loss": 0.0391, + "step": 2585 + }, + { + "epoch": 0.57, + "grad_norm": 0.3484045718761079, + "learning_rate": 1.658777181822978e-05, + "loss": 0.0436, + "step": 2586 + }, + { + "epoch": 0.57, + "grad_norm": 0.3389652780822943, + "learning_rate": 1.6573753063982492e-05, + "loss": 0.0448, + "step": 2587 + }, + { + "epoch": 0.57, + "grad_norm": 0.37909805712879996, + "learning_rate": 1.655973604377978e-05, + "loss": 0.06, + "step": 2588 + }, + { + "epoch": 0.57, + "grad_norm": 0.3678902536237636, + "learning_rate": 1.6545720764715746e-05, + "loss": 0.039, + "step": 2589 + }, + { + "epoch": 0.57, + "grad_norm": 0.3260284611076184, + "learning_rate": 1.6531707233883607e-05, + "loss": 0.0459, + "step": 2590 + }, + { + "epoch": 0.57, + "grad_norm": 0.40296197818244606, + "learning_rate": 1.651769545837569e-05, + "loss": 0.0643, + "step": 2591 + }, + { + "epoch": 0.57, + "grad_norm": 0.2994351050540486, + "learning_rate": 1.650368544528346e-05, + "loss": 0.03, + "step": 2592 + }, + { + "epoch": 0.57, + "grad_norm": 0.352678202806525, + "learning_rate": 1.6489677201697453e-05, + "loss": 0.0436, + "step": 2593 + }, + { + "epoch": 0.57, + "grad_norm": 0.35567778425230934, + "learning_rate": 1.6475670734707336e-05, + "loss": 0.0538, + "step": 2594 + }, + { + "epoch": 0.57, + "grad_norm": 0.31933516756017416, + "learning_rate": 1.6461666051401865e-05, + "loss": 0.0378, + "step": 2595 + }, + { + "epoch": 0.57, + "grad_norm": 0.3431270003998748, + "learning_rate": 1.6447663158868897e-05, + "loss": 0.0368, + "step": 2596 + }, + { + "epoch": 0.57, + "grad_norm": 0.3797904247919991, + "learning_rate": 1.6433662064195378e-05, + "loss": 0.0427, + "step": 2597 + }, + { + "epoch": 0.57, + "grad_norm": 0.40201406952923524, + "learning_rate": 1.641966277446735e-05, + "loss": 0.0773, + "step": 2598 + }, + { + "epoch": 0.57, + "grad_norm": 0.3433373423549848, + "learning_rate": 1.6405665296769942e-05, + "loss": 0.0334, + "step": 2599 + }, + { + "epoch": 0.57, + "grad_norm": 0.41390034361412825, + "learning_rate": 1.6391669638187355e-05, + "loss": 0.0634, + "step": 2600 + }, + { + "epoch": 0.57, + "grad_norm": 0.3183565198150102, + "learning_rate": 1.6377675805802882e-05, + "loss": 0.0384, + "step": 2601 + }, + { + "epoch": 0.57, + "grad_norm": 0.34967596602657924, + "learning_rate": 1.6363683806698896e-05, + "loss": 0.0466, + "step": 2602 + }, + { + "epoch": 0.57, + "grad_norm": 0.3991341204438977, + "learning_rate": 1.6349693647956824e-05, + "loss": 0.0667, + "step": 2603 + }, + { + "epoch": 0.57, + "grad_norm": 0.3746840417493953, + "learning_rate": 1.6335705336657176e-05, + "loss": 0.0417, + "step": 2604 + }, + { + "epoch": 0.57, + "grad_norm": 0.30447485423593174, + "learning_rate": 1.632171887987952e-05, + "loss": 0.0398, + "step": 2605 + }, + { + "epoch": 0.57, + "grad_norm": 0.27851170434104927, + "learning_rate": 1.6307734284702484e-05, + "loss": 0.0446, + "step": 2606 + }, + { + "epoch": 0.57, + "grad_norm": 0.3622261229518271, + "learning_rate": 1.6293751558203764e-05, + "loss": 0.0417, + "step": 2607 + }, + { + "epoch": 0.57, + "grad_norm": 0.3671122128835008, + "learning_rate": 1.6279770707460096e-05, + "loss": 0.0481, + "step": 2608 + }, + { + "epoch": 0.57, + "grad_norm": 0.29716355956407814, + "learning_rate": 1.6265791739547276e-05, + "loss": 0.0413, + "step": 2609 + }, + { + "epoch": 0.57, + "grad_norm": 0.3207904459828551, + "learning_rate": 1.625181466154015e-05, + "loss": 0.0335, + "step": 2610 + }, + { + "epoch": 0.57, + "grad_norm": 0.3842392433599277, + "learning_rate": 1.62378394805126e-05, + "loss": 0.0432, + "step": 2611 + }, + { + "epoch": 0.57, + "grad_norm": 0.3508987306493617, + "learning_rate": 1.6223866203537558e-05, + "loss": 0.0439, + "step": 2612 + }, + { + "epoch": 0.57, + "grad_norm": 0.3887107101284892, + "learning_rate": 1.6209894837686974e-05, + "loss": 0.0493, + "step": 2613 + }, + { + "epoch": 0.57, + "grad_norm": 0.34506361480253667, + "learning_rate": 1.6195925390031845e-05, + "loss": 0.0385, + "step": 2614 + }, + { + "epoch": 0.57, + "grad_norm": 0.3061896731845957, + "learning_rate": 1.61819578676422e-05, + "loss": 0.0326, + "step": 2615 + }, + { + "epoch": 0.57, + "grad_norm": 0.32517755591683667, + "learning_rate": 1.616799227758708e-05, + "loss": 0.0453, + "step": 2616 + }, + { + "epoch": 0.57, + "grad_norm": 0.3009063658647218, + "learning_rate": 1.6154028626934548e-05, + "loss": 0.0353, + "step": 2617 + }, + { + "epoch": 0.58, + "grad_norm": 0.34151820130246274, + "learning_rate": 1.6140066922751715e-05, + "loss": 0.0423, + "step": 2618 + }, + { + "epoch": 0.58, + "grad_norm": 0.3868738884362936, + "learning_rate": 1.612610717210467e-05, + "loss": 0.0526, + "step": 2619 + }, + { + "epoch": 0.58, + "grad_norm": 0.29858877532184536, + "learning_rate": 1.611214938205854e-05, + "loss": 0.0415, + "step": 2620 + }, + { + "epoch": 0.58, + "grad_norm": 0.30091310063498156, + "learning_rate": 1.609819355967744e-05, + "loss": 0.0368, + "step": 2621 + }, + { + "epoch": 0.58, + "grad_norm": 0.43227457296134053, + "learning_rate": 1.6084239712024492e-05, + "loss": 0.0533, + "step": 2622 + }, + { + "epoch": 0.58, + "grad_norm": 0.36195687927499137, + "learning_rate": 1.6070287846161834e-05, + "loss": 0.041, + "step": 2623 + }, + { + "epoch": 0.58, + "grad_norm": 0.2939982699218436, + "learning_rate": 1.6056337969150584e-05, + "loss": 0.0298, + "step": 2624 + }, + { + "epoch": 0.58, + "grad_norm": 0.33352795025660326, + "learning_rate": 1.6042390088050864e-05, + "loss": 0.0448, + "step": 2625 + }, + { + "epoch": 0.58, + "grad_norm": 0.49970352800821244, + "learning_rate": 1.6028444209921775e-05, + "loss": 0.0493, + "step": 2626 + }, + { + "epoch": 0.58, + "grad_norm": 0.3028238717966314, + "learning_rate": 1.601450034182142e-05, + "loss": 0.0352, + "step": 2627 + }, + { + "epoch": 0.58, + "grad_norm": 0.33227471793707164, + "learning_rate": 1.6000558490806877e-05, + "loss": 0.0548, + "step": 2628 + }, + { + "epoch": 0.58, + "grad_norm": 0.2640987190823617, + "learning_rate": 1.59866186639342e-05, + "loss": 0.0317, + "step": 2629 + }, + { + "epoch": 0.58, + "grad_norm": 0.3609816875079501, + "learning_rate": 1.597268086825842e-05, + "loss": 0.0461, + "step": 2630 + }, + { + "epoch": 0.58, + "grad_norm": 0.3688559266875854, + "learning_rate": 1.5958745110833536e-05, + "loss": 0.0521, + "step": 2631 + }, + { + "epoch": 0.58, + "grad_norm": 0.366729450630457, + "learning_rate": 1.5944811398712527e-05, + "loss": 0.052, + "step": 2632 + }, + { + "epoch": 0.58, + "grad_norm": 0.35198386689963646, + "learning_rate": 1.5930879738947328e-05, + "loss": 0.0561, + "step": 2633 + }, + { + "epoch": 0.58, + "grad_norm": 0.32985824394095403, + "learning_rate": 1.5916950138588834e-05, + "loss": 0.0519, + "step": 2634 + }, + { + "epoch": 0.58, + "grad_norm": 0.3273032320388355, + "learning_rate": 1.5903022604686908e-05, + "loss": 0.047, + "step": 2635 + }, + { + "epoch": 0.58, + "grad_norm": 0.2991084361099232, + "learning_rate": 1.5889097144290357e-05, + "loss": 0.0357, + "step": 2636 + }, + { + "epoch": 0.58, + "grad_norm": 0.3294866894962564, + "learning_rate": 1.587517376444694e-05, + "loss": 0.0484, + "step": 2637 + }, + { + "epoch": 0.58, + "grad_norm": 0.34033152773148057, + "learning_rate": 1.5861252472203367e-05, + "loss": 0.0433, + "step": 2638 + }, + { + "epoch": 0.58, + "grad_norm": 0.32795379350134185, + "learning_rate": 1.5847333274605286e-05, + "loss": 0.0445, + "step": 2639 + }, + { + "epoch": 0.58, + "grad_norm": 0.44996915420142114, + "learning_rate": 1.5833416178697298e-05, + "loss": 0.082, + "step": 2640 + }, + { + "epoch": 0.58, + "grad_norm": 0.310305219470742, + "learning_rate": 1.5819501191522917e-05, + "loss": 0.0355, + "step": 2641 + }, + { + "epoch": 0.58, + "grad_norm": 0.2931443006346977, + "learning_rate": 1.5805588320124607e-05, + "loss": 0.0338, + "step": 2642 + }, + { + "epoch": 0.58, + "grad_norm": 0.3440901958568532, + "learning_rate": 1.5791677571543762e-05, + "loss": 0.0477, + "step": 2643 + }, + { + "epoch": 0.58, + "grad_norm": 0.43826924543888984, + "learning_rate": 1.5777768952820697e-05, + "loss": 0.0781, + "step": 2644 + }, + { + "epoch": 0.58, + "grad_norm": 0.38238867828954753, + "learning_rate": 1.576386247099465e-05, + "loss": 0.0561, + "step": 2645 + }, + { + "epoch": 0.58, + "grad_norm": 0.3758116521453048, + "learning_rate": 1.5749958133103772e-05, + "loss": 0.0616, + "step": 2646 + }, + { + "epoch": 0.58, + "grad_norm": 0.2914145634669556, + "learning_rate": 1.5736055946185137e-05, + "loss": 0.0326, + "step": 2647 + }, + { + "epoch": 0.58, + "grad_norm": 0.3146572398322556, + "learning_rate": 1.572215591727473e-05, + "loss": 0.0393, + "step": 2648 + }, + { + "epoch": 0.58, + "grad_norm": 0.3275491907399743, + "learning_rate": 1.570825805340743e-05, + "loss": 0.0462, + "step": 2649 + }, + { + "epoch": 0.58, + "grad_norm": 0.3821830245198099, + "learning_rate": 1.5694362361617043e-05, + "loss": 0.0479, + "step": 2650 + }, + { + "epoch": 0.58, + "grad_norm": 0.38612555273676535, + "learning_rate": 1.568046884893626e-05, + "loss": 0.0585, + "step": 2651 + }, + { + "epoch": 0.58, + "grad_norm": 0.35829701664699615, + "learning_rate": 1.5666577522396658e-05, + "loss": 0.0501, + "step": 2652 + }, + { + "epoch": 0.58, + "grad_norm": 0.3198429364267204, + "learning_rate": 1.565268838902875e-05, + "loss": 0.039, + "step": 2653 + }, + { + "epoch": 0.58, + "grad_norm": 0.33457842637181534, + "learning_rate": 1.5638801455861893e-05, + "loss": 0.0462, + "step": 2654 + }, + { + "epoch": 0.58, + "grad_norm": 0.34628602363815075, + "learning_rate": 1.5624916729924354e-05, + "loss": 0.0505, + "step": 2655 + }, + { + "epoch": 0.58, + "grad_norm": 0.28053922378784835, + "learning_rate": 1.561103421824328e-05, + "loss": 0.0305, + "step": 2656 + }, + { + "epoch": 0.58, + "grad_norm": 0.3882918488804002, + "learning_rate": 1.5597153927844693e-05, + "loss": 0.0448, + "step": 2657 + }, + { + "epoch": 0.58, + "grad_norm": 0.4070758311921904, + "learning_rate": 1.5583275865753492e-05, + "loss": 0.0483, + "step": 2658 + }, + { + "epoch": 0.58, + "grad_norm": 0.3473070941618986, + "learning_rate": 1.556940003899345e-05, + "loss": 0.0444, + "step": 2659 + }, + { + "epoch": 0.58, + "grad_norm": 0.4384647699142405, + "learning_rate": 1.55555264545872e-05, + "loss": 0.0759, + "step": 2660 + }, + { + "epoch": 0.58, + "grad_norm": 0.321066527129186, + "learning_rate": 1.5541655119556262e-05, + "loss": 0.0393, + "step": 2661 + }, + { + "epoch": 0.58, + "grad_norm": 0.32812148046876627, + "learning_rate": 1.5527786040921e-05, + "loss": 0.0379, + "step": 2662 + }, + { + "epoch": 0.58, + "grad_norm": 0.2679898790309503, + "learning_rate": 1.551391922570064e-05, + "loss": 0.0298, + "step": 2663 + }, + { + "epoch": 0.59, + "grad_norm": 0.4259319213815324, + "learning_rate": 1.550005468091326e-05, + "loss": 0.0726, + "step": 2664 + }, + { + "epoch": 0.59, + "grad_norm": 0.2978625936497866, + "learning_rate": 1.548619241357579e-05, + "loss": 0.0356, + "step": 2665 + }, + { + "epoch": 0.59, + "grad_norm": 0.4194003392848951, + "learning_rate": 1.5472332430704007e-05, + "loss": 0.0549, + "step": 2666 + }, + { + "epoch": 0.59, + "grad_norm": 0.30088993684977516, + "learning_rate": 1.545847473931254e-05, + "loss": 0.0426, + "step": 2667 + }, + { + "epoch": 0.59, + "grad_norm": 0.44909686217909617, + "learning_rate": 1.5444619346414845e-05, + "loss": 0.0627, + "step": 2668 + }, + { + "epoch": 0.59, + "grad_norm": 0.3775639896284503, + "learning_rate": 1.543076625902322e-05, + "loss": 0.0522, + "step": 2669 + }, + { + "epoch": 0.59, + "grad_norm": 0.3054246299862562, + "learning_rate": 1.5416915484148805e-05, + "loss": 0.0386, + "step": 2670 + }, + { + "epoch": 0.59, + "grad_norm": 0.39041315056117604, + "learning_rate": 1.5403067028801558e-05, + "loss": 0.0507, + "step": 2671 + }, + { + "epoch": 0.59, + "grad_norm": 0.31897717776320117, + "learning_rate": 1.5389220899990267e-05, + "loss": 0.0503, + "step": 2672 + }, + { + "epoch": 0.59, + "grad_norm": 0.307557708802915, + "learning_rate": 1.5375377104722545e-05, + "loss": 0.043, + "step": 2673 + }, + { + "epoch": 0.59, + "grad_norm": 0.3631592617515664, + "learning_rate": 1.5361535650004818e-05, + "loss": 0.0489, + "step": 2674 + }, + { + "epoch": 0.59, + "grad_norm": 0.33431035022896627, + "learning_rate": 1.5347696542842333e-05, + "loss": 0.0384, + "step": 2675 + }, + { + "epoch": 0.59, + "grad_norm": 0.3262609257458062, + "learning_rate": 1.5333859790239148e-05, + "loss": 0.042, + "step": 2676 + }, + { + "epoch": 0.59, + "grad_norm": 0.3015285843470328, + "learning_rate": 1.5320025399198125e-05, + "loss": 0.0466, + "step": 2677 + }, + { + "epoch": 0.59, + "grad_norm": 0.4392304937366175, + "learning_rate": 1.530619337672093e-05, + "loss": 0.0576, + "step": 2678 + }, + { + "epoch": 0.59, + "grad_norm": 0.3149699353448315, + "learning_rate": 1.5292363729808048e-05, + "loss": 0.0371, + "step": 2679 + }, + { + "epoch": 0.59, + "grad_norm": 0.3775983831905447, + "learning_rate": 1.5278536465458738e-05, + "loss": 0.0641, + "step": 2680 + }, + { + "epoch": 0.59, + "grad_norm": 0.3950621366217049, + "learning_rate": 1.5264711590671067e-05, + "loss": 0.0524, + "step": 2681 + }, + { + "epoch": 0.59, + "grad_norm": 0.3429047857260409, + "learning_rate": 1.5250889112441889e-05, + "loss": 0.0512, + "step": 2682 + }, + { + "epoch": 0.59, + "grad_norm": 0.2749659869443981, + "learning_rate": 1.5237069037766843e-05, + "loss": 0.0361, + "step": 2683 + }, + { + "epoch": 0.59, + "grad_norm": 0.3164070859356422, + "learning_rate": 1.5223251373640354e-05, + "loss": 0.0386, + "step": 2684 + }, + { + "epoch": 0.59, + "grad_norm": 0.28880691828118865, + "learning_rate": 1.5209436127055627e-05, + "loss": 0.0301, + "step": 2685 + }, + { + "epoch": 0.59, + "grad_norm": 0.3426884602907715, + "learning_rate": 1.5195623305004637e-05, + "loss": 0.037, + "step": 2686 + }, + { + "epoch": 0.59, + "grad_norm": 0.32675668975035227, + "learning_rate": 1.5181812914478146e-05, + "loss": 0.039, + "step": 2687 + }, + { + "epoch": 0.59, + "grad_norm": 0.36544928118454606, + "learning_rate": 1.5168004962465681e-05, + "loss": 0.0533, + "step": 2688 + }, + { + "epoch": 0.59, + "grad_norm": 0.27877802104980726, + "learning_rate": 1.5154199455955523e-05, + "loss": 0.0308, + "step": 2689 + }, + { + "epoch": 0.59, + "grad_norm": 0.3260941327124316, + "learning_rate": 1.5140396401934725e-05, + "loss": 0.0497, + "step": 2690 + }, + { + "epoch": 0.59, + "grad_norm": 0.29156792479249044, + "learning_rate": 1.5126595807389098e-05, + "loss": 0.0418, + "step": 2691 + }, + { + "epoch": 0.59, + "grad_norm": 0.3199316994412665, + "learning_rate": 1.5112797679303206e-05, + "loss": 0.0384, + "step": 2692 + }, + { + "epoch": 0.59, + "grad_norm": 0.35845826192788094, + "learning_rate": 1.5099002024660368e-05, + "loss": 0.0579, + "step": 2693 + }, + { + "epoch": 0.59, + "grad_norm": 0.3281439367007041, + "learning_rate": 1.5085208850442649e-05, + "loss": 0.0431, + "step": 2694 + }, + { + "epoch": 0.59, + "grad_norm": 0.38845412705942295, + "learning_rate": 1.5071418163630855e-05, + "loss": 0.0475, + "step": 2695 + }, + { + "epoch": 0.59, + "grad_norm": 0.38170965111248956, + "learning_rate": 1.5057629971204546e-05, + "loss": 0.0535, + "step": 2696 + }, + { + "epoch": 0.59, + "grad_norm": 0.3093982061679945, + "learning_rate": 1.5043844280142005e-05, + "loss": 0.0325, + "step": 2697 + }, + { + "epoch": 0.59, + "grad_norm": 0.28981425502855296, + "learning_rate": 1.5030061097420255e-05, + "loss": 0.0433, + "step": 2698 + }, + { + "epoch": 0.59, + "grad_norm": 0.31525880655108296, + "learning_rate": 1.5016280430015052e-05, + "loss": 0.0359, + "step": 2699 + }, + { + "epoch": 0.59, + "grad_norm": 0.3033155093331658, + "learning_rate": 1.5002502284900871e-05, + "loss": 0.0319, + "step": 2700 + }, + { + "epoch": 0.59, + "grad_norm": 0.3376865101202634, + "learning_rate": 1.4988726669050917e-05, + "loss": 0.0498, + "step": 2701 + }, + { + "epoch": 0.59, + "grad_norm": 0.41858342686618616, + "learning_rate": 1.4974953589437117e-05, + "loss": 0.0576, + "step": 2702 + }, + { + "epoch": 0.59, + "grad_norm": 0.3182429118187405, + "learning_rate": 1.4961183053030106e-05, + "loss": 0.0403, + "step": 2703 + }, + { + "epoch": 0.59, + "grad_norm": 0.3506493152510276, + "learning_rate": 1.4947415066799247e-05, + "loss": 0.0492, + "step": 2704 + }, + { + "epoch": 0.59, + "grad_norm": 0.2502868020514377, + "learning_rate": 1.4933649637712593e-05, + "loss": 0.03, + "step": 2705 + }, + { + "epoch": 0.59, + "grad_norm": 0.33765236497062373, + "learning_rate": 1.4919886772736915e-05, + "loss": 0.0325, + "step": 2706 + }, + { + "epoch": 0.59, + "grad_norm": 0.29693614179525, + "learning_rate": 1.4906126478837683e-05, + "loss": 0.0356, + "step": 2707 + }, + { + "epoch": 0.59, + "grad_norm": 0.3101886626634362, + "learning_rate": 1.4892368762979067e-05, + "loss": 0.0302, + "step": 2708 + }, + { + "epoch": 0.59, + "grad_norm": 0.3130714198962901, + "learning_rate": 1.4878613632123928e-05, + "loss": 0.0318, + "step": 2709 + }, + { + "epoch": 0.6, + "grad_norm": 0.3793925281604337, + "learning_rate": 1.4864861093233827e-05, + "loss": 0.0504, + "step": 2710 + }, + { + "epoch": 0.6, + "grad_norm": 0.3023329996568408, + "learning_rate": 1.4851111153269005e-05, + "loss": 0.0421, + "step": 2711 + }, + { + "epoch": 0.6, + "grad_norm": 0.3567602360211933, + "learning_rate": 1.4837363819188379e-05, + "loss": 0.0402, + "step": 2712 + }, + { + "epoch": 0.6, + "grad_norm": 0.33821174488992095, + "learning_rate": 1.4823619097949584e-05, + "loss": 0.0482, + "step": 2713 + }, + { + "epoch": 0.6, + "grad_norm": 0.3185716751967196, + "learning_rate": 1.4809876996508897e-05, + "loss": 0.0481, + "step": 2714 + }, + { + "epoch": 0.6, + "grad_norm": 0.3710762421342836, + "learning_rate": 1.4796137521821274e-05, + "loss": 0.0469, + "step": 2715 + }, + { + "epoch": 0.6, + "grad_norm": 0.2994532560113849, + "learning_rate": 1.4782400680840352e-05, + "loss": 0.0335, + "step": 2716 + }, + { + "epoch": 0.6, + "grad_norm": 0.32841658125444484, + "learning_rate": 1.4768666480518432e-05, + "loss": 0.0407, + "step": 2717 + }, + { + "epoch": 0.6, + "grad_norm": 0.3524634367329856, + "learning_rate": 1.4754934927806473e-05, + "loss": 0.0466, + "step": 2718 + }, + { + "epoch": 0.6, + "grad_norm": 0.3246530817861148, + "learning_rate": 1.4741206029654098e-05, + "loss": 0.0337, + "step": 2719 + }, + { + "epoch": 0.6, + "grad_norm": 0.3791839722292734, + "learning_rate": 1.472747979300959e-05, + "loss": 0.0434, + "step": 2720 + }, + { + "epoch": 0.6, + "grad_norm": 0.336279440002278, + "learning_rate": 1.4713756224819872e-05, + "loss": 0.0417, + "step": 2721 + }, + { + "epoch": 0.6, + "grad_norm": 0.2882385625480761, + "learning_rate": 1.4700035332030545e-05, + "loss": 0.0374, + "step": 2722 + }, + { + "epoch": 0.6, + "grad_norm": 0.28726647419164664, + "learning_rate": 1.468631712158582e-05, + "loss": 0.044, + "step": 2723 + }, + { + "epoch": 0.6, + "grad_norm": 0.49525735590255704, + "learning_rate": 1.4672601600428578e-05, + "loss": 0.0589, + "step": 2724 + }, + { + "epoch": 0.6, + "grad_norm": 0.3239146725506776, + "learning_rate": 1.465888877550032e-05, + "loss": 0.0361, + "step": 2725 + }, + { + "epoch": 0.6, + "grad_norm": 0.31194783543268184, + "learning_rate": 1.4645178653741194e-05, + "loss": 0.0417, + "step": 2726 + }, + { + "epoch": 0.6, + "grad_norm": 0.39087053211734085, + "learning_rate": 1.4631471242089978e-05, + "loss": 0.0435, + "step": 2727 + }, + { + "epoch": 0.6, + "grad_norm": 0.34705907989168727, + "learning_rate": 1.4617766547484075e-05, + "loss": 0.0473, + "step": 2728 + }, + { + "epoch": 0.6, + "grad_norm": 0.32899760973128733, + "learning_rate": 1.4604064576859513e-05, + "loss": 0.0309, + "step": 2729 + }, + { + "epoch": 0.6, + "grad_norm": 0.4117188795693934, + "learning_rate": 1.459036533715095e-05, + "loss": 0.0578, + "step": 2730 + }, + { + "epoch": 0.6, + "grad_norm": 0.37993141000758085, + "learning_rate": 1.4576668835291654e-05, + "loss": 0.0381, + "step": 2731 + }, + { + "epoch": 0.6, + "grad_norm": 0.35835120282232763, + "learning_rate": 1.4562975078213504e-05, + "loss": 0.0458, + "step": 2732 + }, + { + "epoch": 0.6, + "grad_norm": 0.4180931726097148, + "learning_rate": 1.4549284072846996e-05, + "loss": 0.0523, + "step": 2733 + }, + { + "epoch": 0.6, + "grad_norm": 0.368766958848891, + "learning_rate": 1.4535595826121233e-05, + "loss": 0.0461, + "step": 2734 + }, + { + "epoch": 0.6, + "grad_norm": 0.37347475448631995, + "learning_rate": 1.4521910344963918e-05, + "loss": 0.0523, + "step": 2735 + }, + { + "epoch": 0.6, + "grad_norm": 0.2811875141454974, + "learning_rate": 1.450822763630136e-05, + "loss": 0.0296, + "step": 2736 + }, + { + "epoch": 0.6, + "grad_norm": 0.3818052430755057, + "learning_rate": 1.4494547707058459e-05, + "loss": 0.0525, + "step": 2737 + }, + { + "epoch": 0.6, + "grad_norm": 0.2714308816365113, + "learning_rate": 1.4480870564158704e-05, + "loss": 0.0256, + "step": 2738 + }, + { + "epoch": 0.6, + "grad_norm": 0.3279851547325715, + "learning_rate": 1.44671962145242e-05, + "loss": 0.0396, + "step": 2739 + }, + { + "epoch": 0.6, + "grad_norm": 0.3168864080154981, + "learning_rate": 1.4453524665075607e-05, + "loss": 0.0375, + "step": 2740 + }, + { + "epoch": 0.6, + "grad_norm": 0.2895756110984442, + "learning_rate": 1.4439855922732182e-05, + "loss": 0.0352, + "step": 2741 + }, + { + "epoch": 0.6, + "grad_norm": 0.3121535049197099, + "learning_rate": 1.4426189994411756e-05, + "loss": 0.0447, + "step": 2742 + }, + { + "epoch": 0.6, + "grad_norm": 0.3011568251237467, + "learning_rate": 1.4412526887030745e-05, + "loss": 0.0447, + "step": 2743 + }, + { + "epoch": 0.6, + "grad_norm": 0.34098788844481187, + "learning_rate": 1.4398866607504128e-05, + "loss": 0.0515, + "step": 2744 + }, + { + "epoch": 0.6, + "grad_norm": 0.3039500285882022, + "learning_rate": 1.4385209162745453e-05, + "loss": 0.035, + "step": 2745 + }, + { + "epoch": 0.6, + "grad_norm": 0.36113540046057563, + "learning_rate": 1.4371554559666843e-05, + "loss": 0.0576, + "step": 2746 + }, + { + "epoch": 0.6, + "grad_norm": 0.4289938442926769, + "learning_rate": 1.4357902805178965e-05, + "loss": 0.0448, + "step": 2747 + }, + { + "epoch": 0.6, + "grad_norm": 0.39897293689915525, + "learning_rate": 1.434425390619107e-05, + "loss": 0.0571, + "step": 2748 + }, + { + "epoch": 0.6, + "grad_norm": 0.3216986321138276, + "learning_rate": 1.4330607869610945e-05, + "loss": 0.0371, + "step": 2749 + }, + { + "epoch": 0.6, + "grad_norm": 0.4036541127822171, + "learning_rate": 1.431696470234493e-05, + "loss": 0.051, + "step": 2750 + }, + { + "epoch": 0.6, + "grad_norm": 0.31114819926250076, + "learning_rate": 1.4303324411297918e-05, + "loss": 0.0492, + "step": 2751 + }, + { + "epoch": 0.6, + "grad_norm": 0.3387100998933343, + "learning_rate": 1.4289687003373342e-05, + "loss": 0.0468, + "step": 2752 + }, + { + "epoch": 0.6, + "grad_norm": 0.2328871549688841, + "learning_rate": 1.4276052485473177e-05, + "loss": 0.0239, + "step": 2753 + }, + { + "epoch": 0.6, + "grad_norm": 0.3202228653331459, + "learning_rate": 1.4262420864497939e-05, + "loss": 0.0432, + "step": 2754 + }, + { + "epoch": 0.61, + "grad_norm": 0.2696762524200947, + "learning_rate": 1.4248792147346668e-05, + "loss": 0.0317, + "step": 2755 + }, + { + "epoch": 0.61, + "grad_norm": 0.31967655879116696, + "learning_rate": 1.4235166340916955e-05, + "loss": 0.045, + "step": 2756 + }, + { + "epoch": 0.61, + "grad_norm": 0.2970760568117321, + "learning_rate": 1.4221543452104891e-05, + "loss": 0.0308, + "step": 2757 + }, + { + "epoch": 0.61, + "grad_norm": 0.42458892253619795, + "learning_rate": 1.4207923487805108e-05, + "loss": 0.0672, + "step": 2758 + }, + { + "epoch": 0.61, + "grad_norm": 0.33230513328162714, + "learning_rate": 1.4194306454910757e-05, + "loss": 0.0397, + "step": 2759 + }, + { + "epoch": 0.61, + "grad_norm": 0.42762986739777514, + "learning_rate": 1.4180692360313494e-05, + "loss": 0.0554, + "step": 2760 + }, + { + "epoch": 0.61, + "grad_norm": 0.27931077295684037, + "learning_rate": 1.4167081210903501e-05, + "loss": 0.04, + "step": 2761 + }, + { + "epoch": 0.61, + "grad_norm": 0.41547331587630704, + "learning_rate": 1.4153473013569468e-05, + "loss": 0.0458, + "step": 2762 + }, + { + "epoch": 0.61, + "grad_norm": 0.3026604592932078, + "learning_rate": 1.413986777519858e-05, + "loss": 0.0389, + "step": 2763 + }, + { + "epoch": 0.61, + "grad_norm": 0.40036944821808274, + "learning_rate": 1.412626550267653e-05, + "loss": 0.0751, + "step": 2764 + }, + { + "epoch": 0.61, + "grad_norm": 0.47896027439194794, + "learning_rate": 1.4112666202887522e-05, + "loss": 0.0722, + "step": 2765 + }, + { + "epoch": 0.61, + "grad_norm": 0.27886565471312885, + "learning_rate": 1.4099069882714236e-05, + "loss": 0.038, + "step": 2766 + }, + { + "epoch": 0.61, + "grad_norm": 0.30368425797912313, + "learning_rate": 1.4085476549037856e-05, + "loss": 0.0446, + "step": 2767 + }, + { + "epoch": 0.61, + "grad_norm": 0.2981647908149259, + "learning_rate": 1.4071886208738053e-05, + "loss": 0.0388, + "step": 2768 + }, + { + "epoch": 0.61, + "grad_norm": 0.33278599396225034, + "learning_rate": 1.4058298868692979e-05, + "loss": 0.0512, + "step": 2769 + }, + { + "epoch": 0.61, + "grad_norm": 0.33682190007711665, + "learning_rate": 1.4044714535779269e-05, + "loss": 0.0424, + "step": 2770 + }, + { + "epoch": 0.61, + "grad_norm": 0.30438516973860114, + "learning_rate": 1.403113321687204e-05, + "loss": 0.0483, + "step": 2771 + }, + { + "epoch": 0.61, + "grad_norm": 0.33630166165650016, + "learning_rate": 1.4017554918844872e-05, + "loss": 0.0401, + "step": 2772 + }, + { + "epoch": 0.61, + "grad_norm": 0.340924135150723, + "learning_rate": 1.4003979648569839e-05, + "loss": 0.0489, + "step": 2773 + }, + { + "epoch": 0.61, + "grad_norm": 0.28300769245153057, + "learning_rate": 1.3990407412917462e-05, + "loss": 0.0283, + "step": 2774 + }, + { + "epoch": 0.61, + "grad_norm": 0.3466167661491617, + "learning_rate": 1.3976838218756733e-05, + "loss": 0.0487, + "step": 2775 + }, + { + "epoch": 0.61, + "grad_norm": 0.3346830945121258, + "learning_rate": 1.3963272072955106e-05, + "loss": 0.0462, + "step": 2776 + }, + { + "epoch": 0.61, + "grad_norm": 0.29863913799445185, + "learning_rate": 1.3949708982378487e-05, + "loss": 0.0415, + "step": 2777 + }, + { + "epoch": 0.61, + "grad_norm": 0.253393359450197, + "learning_rate": 1.3936148953891242e-05, + "loss": 0.0321, + "step": 2778 + }, + { + "epoch": 0.61, + "grad_norm": 0.3637070570173247, + "learning_rate": 1.392259199435618e-05, + "loss": 0.0524, + "step": 2779 + }, + { + "epoch": 0.61, + "grad_norm": 0.3759873425808452, + "learning_rate": 1.3909038110634567e-05, + "loss": 0.0573, + "step": 2780 + }, + { + "epoch": 0.61, + "grad_norm": 0.28541006262766017, + "learning_rate": 1.3895487309586097e-05, + "loss": 0.0416, + "step": 2781 + }, + { + "epoch": 0.61, + "grad_norm": 0.40106802601906233, + "learning_rate": 1.388193959806893e-05, + "loss": 0.0618, + "step": 2782 + }, + { + "epoch": 0.61, + "grad_norm": 0.3662019652221195, + "learning_rate": 1.3868394982939636e-05, + "loss": 0.0489, + "step": 2783 + }, + { + "epoch": 0.61, + "grad_norm": 0.34370404716597247, + "learning_rate": 1.3854853471053225e-05, + "loss": 0.0477, + "step": 2784 + }, + { + "epoch": 0.61, + "grad_norm": 0.3543753733624572, + "learning_rate": 1.3841315069263146e-05, + "loss": 0.0482, + "step": 2785 + }, + { + "epoch": 0.61, + "grad_norm": 0.35525774719252445, + "learning_rate": 1.3827779784421262e-05, + "loss": 0.0406, + "step": 2786 + }, + { + "epoch": 0.61, + "grad_norm": 0.3808691561494414, + "learning_rate": 1.3814247623377868e-05, + "loss": 0.0578, + "step": 2787 + }, + { + "epoch": 0.61, + "grad_norm": 0.23788466046753176, + "learning_rate": 1.3800718592981668e-05, + "loss": 0.0244, + "step": 2788 + }, + { + "epoch": 0.61, + "grad_norm": 0.3964649815819659, + "learning_rate": 1.3787192700079792e-05, + "loss": 0.0508, + "step": 2789 + }, + { + "epoch": 0.61, + "grad_norm": 0.26043611771738767, + "learning_rate": 1.377366995151777e-05, + "loss": 0.0312, + "step": 2790 + }, + { + "epoch": 0.61, + "grad_norm": 0.37760732145197373, + "learning_rate": 1.3760150354139558e-05, + "loss": 0.0401, + "step": 2791 + }, + { + "epoch": 0.61, + "grad_norm": 0.2952347920625057, + "learning_rate": 1.3746633914787504e-05, + "loss": 0.036, + "step": 2792 + }, + { + "epoch": 0.61, + "grad_norm": 0.27096026759420505, + "learning_rate": 1.3733120640302358e-05, + "loss": 0.0276, + "step": 2793 + }, + { + "epoch": 0.61, + "grad_norm": 0.28976262851872026, + "learning_rate": 1.3719610537523274e-05, + "loss": 0.0368, + "step": 2794 + }, + { + "epoch": 0.61, + "grad_norm": 0.2947502773848208, + "learning_rate": 1.3706103613287796e-05, + "loss": 0.0366, + "step": 2795 + }, + { + "epoch": 0.61, + "grad_norm": 0.36182000985970075, + "learning_rate": 1.369259987443186e-05, + "loss": 0.0538, + "step": 2796 + }, + { + "epoch": 0.61, + "grad_norm": 0.2963917463149579, + "learning_rate": 1.3679099327789794e-05, + "loss": 0.0267, + "step": 2797 + }, + { + "epoch": 0.61, + "grad_norm": 0.3148398893701771, + "learning_rate": 1.3665601980194297e-05, + "loss": 0.0496, + "step": 2798 + }, + { + "epoch": 0.61, + "grad_norm": 0.3313389348355963, + "learning_rate": 1.3652107838476476e-05, + "loss": 0.0375, + "step": 2799 + }, + { + "epoch": 0.61, + "grad_norm": 0.2519413849217148, + "learning_rate": 1.3638616909465791e-05, + "loss": 0.0359, + "step": 2800 + }, + { + "epoch": 0.62, + "grad_norm": 0.32790310486681473, + "learning_rate": 1.3625129199990083e-05, + "loss": 0.0364, + "step": 2801 + }, + { + "epoch": 0.62, + "grad_norm": 0.37081440902313717, + "learning_rate": 1.3611644716875568e-05, + "loss": 0.061, + "step": 2802 + }, + { + "epoch": 0.62, + "grad_norm": 0.3476633542652982, + "learning_rate": 1.3598163466946823e-05, + "loss": 0.0408, + "step": 2803 + }, + { + "epoch": 0.62, + "grad_norm": 0.3780332911227297, + "learning_rate": 1.3584685457026789e-05, + "loss": 0.0504, + "step": 2804 + }, + { + "epoch": 0.62, + "grad_norm": 0.31770609896556845, + "learning_rate": 1.3571210693936774e-05, + "loss": 0.0388, + "step": 2805 + }, + { + "epoch": 0.62, + "grad_norm": 0.31575286182698364, + "learning_rate": 1.3557739184496435e-05, + "loss": 0.0437, + "step": 2806 + }, + { + "epoch": 0.62, + "grad_norm": 0.3594767177759614, + "learning_rate": 1.3544270935523778e-05, + "loss": 0.0447, + "step": 2807 + }, + { + "epoch": 0.62, + "grad_norm": 0.36754325109400565, + "learning_rate": 1.3530805953835182e-05, + "loss": 0.0515, + "step": 2808 + }, + { + "epoch": 0.62, + "grad_norm": 0.4071580934482072, + "learning_rate": 1.351734424624535e-05, + "loss": 0.0578, + "step": 2809 + }, + { + "epoch": 0.62, + "grad_norm": 0.3646757576366893, + "learning_rate": 1.3503885819567335e-05, + "loss": 0.0547, + "step": 2810 + }, + { + "epoch": 0.62, + "grad_norm": 0.2684636603321318, + "learning_rate": 1.3490430680612528e-05, + "loss": 0.0354, + "step": 2811 + }, + { + "epoch": 0.62, + "grad_norm": 0.28134491431539715, + "learning_rate": 1.3476978836190658e-05, + "loss": 0.0264, + "step": 2812 + }, + { + "epoch": 0.62, + "grad_norm": 0.33385682598409755, + "learning_rate": 1.3463530293109783e-05, + "loss": 0.0379, + "step": 2813 + }, + { + "epoch": 0.62, + "grad_norm": 0.32932744636579747, + "learning_rate": 1.34500850581763e-05, + "loss": 0.0367, + "step": 2814 + }, + { + "epoch": 0.62, + "grad_norm": 0.37019697552233266, + "learning_rate": 1.3436643138194918e-05, + "loss": 0.0495, + "step": 2815 + }, + { + "epoch": 0.62, + "grad_norm": 0.358547314897313, + "learning_rate": 1.3423204539968677e-05, + "loss": 0.0321, + "step": 2816 + }, + { + "epoch": 0.62, + "grad_norm": 0.2801113922919793, + "learning_rate": 1.3409769270298934e-05, + "loss": 0.029, + "step": 2817 + }, + { + "epoch": 0.62, + "grad_norm": 0.3508957261880292, + "learning_rate": 1.3396337335985361e-05, + "loss": 0.0336, + "step": 2818 + }, + { + "epoch": 0.62, + "grad_norm": 0.3909390712800989, + "learning_rate": 1.3382908743825947e-05, + "loss": 0.0522, + "step": 2819 + }, + { + "epoch": 0.62, + "grad_norm": 0.2928226106459401, + "learning_rate": 1.336948350061698e-05, + "loss": 0.0346, + "step": 2820 + }, + { + "epoch": 0.62, + "grad_norm": 0.2954126026903477, + "learning_rate": 1.335606161315306e-05, + "loss": 0.0261, + "step": 2821 + }, + { + "epoch": 0.62, + "grad_norm": 0.2717804015217529, + "learning_rate": 1.3342643088227085e-05, + "loss": 0.0398, + "step": 2822 + }, + { + "epoch": 0.62, + "grad_norm": 0.33073288121978467, + "learning_rate": 1.3329227932630255e-05, + "loss": 0.0388, + "step": 2823 + }, + { + "epoch": 0.62, + "grad_norm": 0.33370518308471864, + "learning_rate": 1.3315816153152055e-05, + "loss": 0.0439, + "step": 2824 + }, + { + "epoch": 0.62, + "grad_norm": 0.30608677504159165, + "learning_rate": 1.3302407756580278e-05, + "loss": 0.0387, + "step": 2825 + }, + { + "epoch": 0.62, + "grad_norm": 0.33089457778381837, + "learning_rate": 1.3289002749700992e-05, + "loss": 0.0325, + "step": 2826 + }, + { + "epoch": 0.62, + "grad_norm": 0.4467097997419248, + "learning_rate": 1.3275601139298556e-05, + "loss": 0.0457, + "step": 2827 + }, + { + "epoch": 0.62, + "grad_norm": 0.3706083636868158, + "learning_rate": 1.3262202932155602e-05, + "loss": 0.0484, + "step": 2828 + }, + { + "epoch": 0.62, + "grad_norm": 0.3404835345982751, + "learning_rate": 1.3248808135053048e-05, + "loss": 0.0498, + "step": 2829 + }, + { + "epoch": 0.62, + "grad_norm": 0.2949020555922916, + "learning_rate": 1.3235416754770082e-05, + "loss": 0.0475, + "step": 2830 + }, + { + "epoch": 0.62, + "grad_norm": 0.377446064790745, + "learning_rate": 1.3222028798084165e-05, + "loss": 0.0534, + "step": 2831 + }, + { + "epoch": 0.62, + "grad_norm": 0.32032310654361856, + "learning_rate": 1.3208644271771026e-05, + "loss": 0.0443, + "step": 2832 + }, + { + "epoch": 0.62, + "grad_norm": 0.37997190318260876, + "learning_rate": 1.3195263182604638e-05, + "loss": 0.0436, + "step": 2833 + }, + { + "epoch": 0.62, + "grad_norm": 0.3834620822777257, + "learning_rate": 1.3181885537357277e-05, + "loss": 0.0493, + "step": 2834 + }, + { + "epoch": 0.62, + "grad_norm": 0.2942071173338933, + "learning_rate": 1.3168511342799444e-05, + "loss": 0.0337, + "step": 2835 + }, + { + "epoch": 0.62, + "grad_norm": 0.30761294891828145, + "learning_rate": 1.3155140605699894e-05, + "loss": 0.0422, + "step": 2836 + }, + { + "epoch": 0.62, + "grad_norm": 0.3319158914665372, + "learning_rate": 1.3141773332825647e-05, + "loss": 0.0331, + "step": 2837 + }, + { + "epoch": 0.62, + "grad_norm": 0.44578035773700425, + "learning_rate": 1.3128409530941957e-05, + "loss": 0.0581, + "step": 2838 + }, + { + "epoch": 0.62, + "grad_norm": 0.32801149312387384, + "learning_rate": 1.3115049206812325e-05, + "loss": 0.0507, + "step": 2839 + }, + { + "epoch": 0.62, + "grad_norm": 0.382789377565198, + "learning_rate": 1.3101692367198498e-05, + "loss": 0.0921, + "step": 2840 + }, + { + "epoch": 0.62, + "grad_norm": 0.30446166447700507, + "learning_rate": 1.3088339018860439e-05, + "loss": 0.0435, + "step": 2841 + }, + { + "epoch": 0.62, + "grad_norm": 0.39309102804653384, + "learning_rate": 1.307498916855638e-05, + "loss": 0.0453, + "step": 2842 + }, + { + "epoch": 0.62, + "grad_norm": 0.2857678139199307, + "learning_rate": 1.3061642823042757e-05, + "loss": 0.039, + "step": 2843 + }, + { + "epoch": 0.62, + "grad_norm": 0.3226794735584289, + "learning_rate": 1.3048299989074234e-05, + "loss": 0.0423, + "step": 2844 + }, + { + "epoch": 0.62, + "grad_norm": 0.22917456866136676, + "learning_rate": 1.3034960673403699e-05, + "loss": 0.0292, + "step": 2845 + }, + { + "epoch": 0.63, + "grad_norm": 0.2897688766383443, + "learning_rate": 1.3021624882782262e-05, + "loss": 0.0356, + "step": 2846 + }, + { + "epoch": 0.63, + "grad_norm": 0.3024013533413087, + "learning_rate": 1.3008292623959253e-05, + "loss": 0.0429, + "step": 2847 + }, + { + "epoch": 0.63, + "grad_norm": 0.3064032443695261, + "learning_rate": 1.2994963903682205e-05, + "loss": 0.0392, + "step": 2848 + }, + { + "epoch": 0.63, + "grad_norm": 0.30547864610100867, + "learning_rate": 1.2981638728696868e-05, + "loss": 0.0475, + "step": 2849 + }, + { + "epoch": 0.63, + "grad_norm": 0.3186415609393602, + "learning_rate": 1.2968317105747189e-05, + "loss": 0.043, + "step": 2850 + }, + { + "epoch": 0.63, + "grad_norm": 0.32282427137613623, + "learning_rate": 1.2954999041575331e-05, + "loss": 0.0412, + "step": 2851 + }, + { + "epoch": 0.63, + "grad_norm": 0.3826593365122257, + "learning_rate": 1.2941684542921646e-05, + "loss": 0.0493, + "step": 2852 + }, + { + "epoch": 0.63, + "grad_norm": 0.2961884893065571, + "learning_rate": 1.2928373616524682e-05, + "loss": 0.0288, + "step": 2853 + }, + { + "epoch": 0.63, + "grad_norm": 0.3475935590133547, + "learning_rate": 1.291506626912118e-05, + "loss": 0.0475, + "step": 2854 + }, + { + "epoch": 0.63, + "grad_norm": 0.35536169179632393, + "learning_rate": 1.290176250744607e-05, + "loss": 0.0425, + "step": 2855 + }, + { + "epoch": 0.63, + "grad_norm": 0.3053531955003352, + "learning_rate": 1.2888462338232466e-05, + "loss": 0.0371, + "step": 2856 + }, + { + "epoch": 0.63, + "grad_norm": 0.2528767475118705, + "learning_rate": 1.287516576821167e-05, + "loss": 0.0245, + "step": 2857 + }, + { + "epoch": 0.63, + "grad_norm": 0.2888324182721138, + "learning_rate": 1.2861872804113154e-05, + "loss": 0.0376, + "step": 2858 + }, + { + "epoch": 0.63, + "grad_norm": 0.3111029151397963, + "learning_rate": 1.284858345266456e-05, + "loss": 0.0434, + "step": 2859 + }, + { + "epoch": 0.63, + "grad_norm": 0.42479953105033164, + "learning_rate": 1.2835297720591729e-05, + "loss": 0.0472, + "step": 2860 + }, + { + "epoch": 0.63, + "grad_norm": 0.2913783456363997, + "learning_rate": 1.282201561461864e-05, + "loss": 0.0347, + "step": 2861 + }, + { + "epoch": 0.63, + "grad_norm": 0.37374209985708284, + "learning_rate": 1.2808737141467451e-05, + "loss": 0.0481, + "step": 2862 + }, + { + "epoch": 0.63, + "grad_norm": 0.3775900958717164, + "learning_rate": 1.2795462307858478e-05, + "loss": 0.0506, + "step": 2863 + }, + { + "epoch": 0.63, + "grad_norm": 0.31289014304287066, + "learning_rate": 1.2782191120510196e-05, + "loss": 0.0361, + "step": 2864 + }, + { + "epoch": 0.63, + "grad_norm": 0.2958720733681559, + "learning_rate": 1.2768923586139232e-05, + "loss": 0.0393, + "step": 2865 + }, + { + "epoch": 0.63, + "grad_norm": 0.3559906869055891, + "learning_rate": 1.275565971146037e-05, + "loss": 0.048, + "step": 2866 + }, + { + "epoch": 0.63, + "grad_norm": 0.2940382066111115, + "learning_rate": 1.2742399503186528e-05, + "loss": 0.0433, + "step": 2867 + }, + { + "epoch": 0.63, + "grad_norm": 0.3047524162975901, + "learning_rate": 1.2729142968028793e-05, + "loss": 0.0368, + "step": 2868 + }, + { + "epoch": 0.63, + "grad_norm": 0.28501375112905175, + "learning_rate": 1.2715890112696379e-05, + "loss": 0.0292, + "step": 2869 + }, + { + "epoch": 0.63, + "grad_norm": 0.2598049162784958, + "learning_rate": 1.2702640943896625e-05, + "loss": 0.0312, + "step": 2870 + }, + { + "epoch": 0.63, + "grad_norm": 0.3311882312332492, + "learning_rate": 1.2689395468335027e-05, + "loss": 0.0424, + "step": 2871 + }, + { + "epoch": 0.63, + "grad_norm": 0.3100638922291191, + "learning_rate": 1.2676153692715195e-05, + "loss": 0.0322, + "step": 2872 + }, + { + "epoch": 0.63, + "grad_norm": 0.3158769536699534, + "learning_rate": 1.2662915623738874e-05, + "loss": 0.0401, + "step": 2873 + }, + { + "epoch": 0.63, + "grad_norm": 0.3905736317237826, + "learning_rate": 1.2649681268105933e-05, + "loss": 0.0494, + "step": 2874 + }, + { + "epoch": 0.63, + "grad_norm": 0.3623962492290295, + "learning_rate": 1.263645063251436e-05, + "loss": 0.0494, + "step": 2875 + }, + { + "epoch": 0.63, + "grad_norm": 0.3047706172742518, + "learning_rate": 1.2623223723660258e-05, + "loss": 0.0402, + "step": 2876 + }, + { + "epoch": 0.63, + "grad_norm": 0.29039474153923495, + "learning_rate": 1.2610000548237851e-05, + "loss": 0.0345, + "step": 2877 + }, + { + "epoch": 0.63, + "grad_norm": 0.42741917183463485, + "learning_rate": 1.259678111293947e-05, + "loss": 0.0456, + "step": 2878 + }, + { + "epoch": 0.63, + "grad_norm": 0.36327702509566523, + "learning_rate": 1.2583565424455552e-05, + "loss": 0.0591, + "step": 2879 + }, + { + "epoch": 0.63, + "grad_norm": 0.25609254768094003, + "learning_rate": 1.2570353489474637e-05, + "loss": 0.0385, + "step": 2880 + }, + { + "epoch": 0.63, + "grad_norm": 0.3580431515431179, + "learning_rate": 1.2557145314683364e-05, + "loss": 0.0451, + "step": 2881 + }, + { + "epoch": 0.63, + "grad_norm": 0.24723126894418496, + "learning_rate": 1.254394090676647e-05, + "loss": 0.0337, + "step": 2882 + }, + { + "epoch": 0.63, + "grad_norm": 0.24767297062551186, + "learning_rate": 1.2530740272406792e-05, + "loss": 0.0307, + "step": 2883 + }, + { + "epoch": 0.63, + "grad_norm": 0.2776789184659262, + "learning_rate": 1.2517543418285247e-05, + "loss": 0.0334, + "step": 2884 + }, + { + "epoch": 0.63, + "grad_norm": 0.32881618544241237, + "learning_rate": 1.2504350351080845e-05, + "loss": 0.0403, + "step": 2885 + }, + { + "epoch": 0.63, + "grad_norm": 0.3183871649098128, + "learning_rate": 1.2491161077470682e-05, + "loss": 0.0448, + "step": 2886 + }, + { + "epoch": 0.63, + "grad_norm": 0.2888848042097438, + "learning_rate": 1.2477975604129929e-05, + "loss": 0.0389, + "step": 2887 + }, + { + "epoch": 0.63, + "grad_norm": 0.3045754593476319, + "learning_rate": 1.2464793937731831e-05, + "loss": 0.0383, + "step": 2888 + }, + { + "epoch": 0.63, + "grad_norm": 0.42008218588417245, + "learning_rate": 1.2451616084947714e-05, + "loss": 0.0645, + "step": 2889 + }, + { + "epoch": 0.63, + "grad_norm": 0.2813345199320012, + "learning_rate": 1.243844205244697e-05, + "loss": 0.0334, + "step": 2890 + }, + { + "epoch": 0.63, + "grad_norm": 0.37685954459211185, + "learning_rate": 1.2425271846897053e-05, + "loss": 0.0558, + "step": 2891 + }, + { + "epoch": 0.64, + "grad_norm": 0.4008627736124671, + "learning_rate": 1.2412105474963491e-05, + "loss": 0.0533, + "step": 2892 + }, + { + "epoch": 0.64, + "grad_norm": 0.4143105789384026, + "learning_rate": 1.2398942943309855e-05, + "loss": 0.066, + "step": 2893 + }, + { + "epoch": 0.64, + "grad_norm": 0.34944816154299846, + "learning_rate": 1.2385784258597796e-05, + "loss": 0.0427, + "step": 2894 + }, + { + "epoch": 0.64, + "grad_norm": 0.3303130854129566, + "learning_rate": 1.2372629427487e-05, + "loss": 0.05, + "step": 2895 + }, + { + "epoch": 0.64, + "grad_norm": 0.2697725459850731, + "learning_rate": 1.235947845663521e-05, + "loss": 0.027, + "step": 2896 + }, + { + "epoch": 0.64, + "grad_norm": 0.3182834932663021, + "learning_rate": 1.2346331352698206e-05, + "loss": 0.0363, + "step": 2897 + }, + { + "epoch": 0.64, + "grad_norm": 0.24598310647552704, + "learning_rate": 1.2333188122329824e-05, + "loss": 0.0408, + "step": 2898 + }, + { + "epoch": 0.64, + "grad_norm": 0.29446130119261393, + "learning_rate": 1.2320048772181932e-05, + "loss": 0.0401, + "step": 2899 + }, + { + "epoch": 0.64, + "grad_norm": 0.27915071784669715, + "learning_rate": 1.2306913308904435e-05, + "loss": 0.0265, + "step": 2900 + }, + { + "epoch": 0.64, + "grad_norm": 0.31153922205430945, + "learning_rate": 1.2293781739145274e-05, + "loss": 0.0347, + "step": 2901 + }, + { + "epoch": 0.64, + "grad_norm": 0.2740717752120284, + "learning_rate": 1.2280654069550404e-05, + "loss": 0.0375, + "step": 2902 + }, + { + "epoch": 0.64, + "grad_norm": 0.32661056712366027, + "learning_rate": 1.2267530306763837e-05, + "loss": 0.0329, + "step": 2903 + }, + { + "epoch": 0.64, + "grad_norm": 0.25680366284318407, + "learning_rate": 1.2254410457427581e-05, + "loss": 0.0272, + "step": 2904 + }, + { + "epoch": 0.64, + "grad_norm": 0.40490566249909893, + "learning_rate": 1.2241294528181678e-05, + "loss": 0.0406, + "step": 2905 + }, + { + "epoch": 0.64, + "grad_norm": 0.24799753434259125, + "learning_rate": 1.2228182525664175e-05, + "loss": 0.0265, + "step": 2906 + }, + { + "epoch": 0.64, + "grad_norm": 0.32278382224724494, + "learning_rate": 1.2215074456511136e-05, + "loss": 0.0349, + "step": 2907 + }, + { + "epoch": 0.64, + "grad_norm": 0.2880573193834743, + "learning_rate": 1.2201970327356639e-05, + "loss": 0.0378, + "step": 2908 + }, + { + "epoch": 0.64, + "grad_norm": 0.3387098689255451, + "learning_rate": 1.2188870144832758e-05, + "loss": 0.0433, + "step": 2909 + }, + { + "epoch": 0.64, + "grad_norm": 0.3689953957324699, + "learning_rate": 1.217577391556958e-05, + "loss": 0.0433, + "step": 2910 + }, + { + "epoch": 0.64, + "grad_norm": 0.28615291381637514, + "learning_rate": 1.2162681646195187e-05, + "loss": 0.0269, + "step": 2911 + }, + { + "epoch": 0.64, + "grad_norm": 0.37840625375641673, + "learning_rate": 1.2149593343335658e-05, + "loss": 0.0417, + "step": 2912 + }, + { + "epoch": 0.64, + "grad_norm": 0.33880744429424287, + "learning_rate": 1.2136509013615063e-05, + "loss": 0.0418, + "step": 2913 + }, + { + "epoch": 0.64, + "grad_norm": 0.2916809260197858, + "learning_rate": 1.2123428663655457e-05, + "loss": 0.0345, + "step": 2914 + }, + { + "epoch": 0.64, + "grad_norm": 0.349445347161046, + "learning_rate": 1.211035230007689e-05, + "loss": 0.0398, + "step": 2915 + }, + { + "epoch": 0.64, + "grad_norm": 0.30144971408574184, + "learning_rate": 1.209727992949739e-05, + "loss": 0.0345, + "step": 2916 + }, + { + "epoch": 0.64, + "grad_norm": 0.28327437195210436, + "learning_rate": 1.2084211558532958e-05, + "loss": 0.0269, + "step": 2917 + }, + { + "epoch": 0.64, + "grad_norm": 0.35992758188097324, + "learning_rate": 1.2071147193797578e-05, + "loss": 0.0456, + "step": 2918 + }, + { + "epoch": 0.64, + "grad_norm": 0.3139806835397025, + "learning_rate": 1.2058086841903211e-05, + "loss": 0.0362, + "step": 2919 + }, + { + "epoch": 0.64, + "grad_norm": 0.33326614964467255, + "learning_rate": 1.204503050945978e-05, + "loss": 0.0426, + "step": 2920 + }, + { + "epoch": 0.64, + "grad_norm": 0.26053383893253446, + "learning_rate": 1.2031978203075172e-05, + "loss": 0.032, + "step": 2921 + }, + { + "epoch": 0.64, + "grad_norm": 0.2218677210621221, + "learning_rate": 1.2018929929355241e-05, + "loss": 0.0185, + "step": 2922 + }, + { + "epoch": 0.64, + "grad_norm": 0.4643167666122263, + "learning_rate": 1.2005885694903796e-05, + "loss": 0.0613, + "step": 2923 + }, + { + "epoch": 0.64, + "grad_norm": 0.3133421636518249, + "learning_rate": 1.1992845506322607e-05, + "loss": 0.0306, + "step": 2924 + }, + { + "epoch": 0.64, + "grad_norm": 0.3368456784936427, + "learning_rate": 1.1979809370211392e-05, + "loss": 0.0441, + "step": 2925 + }, + { + "epoch": 0.64, + "grad_norm": 0.3515826537395515, + "learning_rate": 1.196677729316782e-05, + "loss": 0.0424, + "step": 2926 + }, + { + "epoch": 0.64, + "grad_norm": 0.32891554060474937, + "learning_rate": 1.1953749281787502e-05, + "loss": 0.0388, + "step": 2927 + }, + { + "epoch": 0.64, + "grad_norm": 0.383314698567998, + "learning_rate": 1.194072534266399e-05, + "loss": 0.0481, + "step": 2928 + }, + { + "epoch": 0.64, + "grad_norm": 0.2887688253301843, + "learning_rate": 1.1927705482388794e-05, + "loss": 0.0403, + "step": 2929 + }, + { + "epoch": 0.64, + "grad_norm": 0.3175047889483473, + "learning_rate": 1.1914689707551337e-05, + "loss": 0.0392, + "step": 2930 + }, + { + "epoch": 0.64, + "grad_norm": 0.3006459960029491, + "learning_rate": 1.1901678024738983e-05, + "loss": 0.0337, + "step": 2931 + }, + { + "epoch": 0.64, + "grad_norm": 0.3369660599983346, + "learning_rate": 1.1888670440537025e-05, + "loss": 0.0335, + "step": 2932 + }, + { + "epoch": 0.64, + "grad_norm": 0.32840089325379707, + "learning_rate": 1.1875666961528679e-05, + "loss": 0.0336, + "step": 2933 + }, + { + "epoch": 0.64, + "grad_norm": 0.31666973116713637, + "learning_rate": 1.1862667594295086e-05, + "loss": 0.043, + "step": 2934 + }, + { + "epoch": 0.64, + "grad_norm": 0.28063185632481785, + "learning_rate": 1.1849672345415306e-05, + "loss": 0.0425, + "step": 2935 + }, + { + "epoch": 0.64, + "grad_norm": 0.34532325693352134, + "learning_rate": 1.1836681221466308e-05, + "loss": 0.0393, + "step": 2936 + }, + { + "epoch": 0.65, + "grad_norm": 0.30076662250967534, + "learning_rate": 1.1823694229022995e-05, + "loss": 0.0391, + "step": 2937 + }, + { + "epoch": 0.65, + "grad_norm": 0.3062019003833736, + "learning_rate": 1.181071137465815e-05, + "loss": 0.035, + "step": 2938 + }, + { + "epoch": 0.65, + "grad_norm": 0.37222397001806645, + "learning_rate": 1.1797732664942481e-05, + "loss": 0.0744, + "step": 2939 + }, + { + "epoch": 0.65, + "grad_norm": 0.2864970600075523, + "learning_rate": 1.1784758106444594e-05, + "loss": 0.039, + "step": 2940 + }, + { + "epoch": 0.65, + "grad_norm": 0.2879502575008263, + "learning_rate": 1.1771787705730983e-05, + "loss": 0.0304, + "step": 2941 + }, + { + "epoch": 0.65, + "grad_norm": 0.31231118697034904, + "learning_rate": 1.175882146936606e-05, + "loss": 0.0406, + "step": 2942 + }, + { + "epoch": 0.65, + "grad_norm": 0.3410455347601863, + "learning_rate": 1.1745859403912108e-05, + "loss": 0.0366, + "step": 2943 + }, + { + "epoch": 0.65, + "grad_norm": 0.30274079838983087, + "learning_rate": 1.1732901515929312e-05, + "loss": 0.0391, + "step": 2944 + }, + { + "epoch": 0.65, + "grad_norm": 0.3174696348718507, + "learning_rate": 1.1719947811975732e-05, + "loss": 0.0403, + "step": 2945 + }, + { + "epoch": 0.65, + "grad_norm": 0.31613914376879376, + "learning_rate": 1.1706998298607325e-05, + "loss": 0.0428, + "step": 2946 + }, + { + "epoch": 0.65, + "grad_norm": 0.43372456407309057, + "learning_rate": 1.1694052982377915e-05, + "loss": 0.0508, + "step": 2947 + }, + { + "epoch": 0.65, + "grad_norm": 0.27547772700698475, + "learning_rate": 1.1681111869839209e-05, + "loss": 0.0354, + "step": 2948 + }, + { + "epoch": 0.65, + "grad_norm": 0.23440111332696512, + "learning_rate": 1.166817496754078e-05, + "loss": 0.0248, + "step": 2949 + }, + { + "epoch": 0.65, + "grad_norm": 0.6419558309085079, + "learning_rate": 1.1655242282030068e-05, + "loss": 0.0934, + "step": 2950 + }, + { + "epoch": 0.65, + "grad_norm": 0.4207470437930287, + "learning_rate": 1.1642313819852405e-05, + "loss": 0.0574, + "step": 2951 + }, + { + "epoch": 0.65, + "grad_norm": 0.2677592757227379, + "learning_rate": 1.1629389587550939e-05, + "loss": 0.0438, + "step": 2952 + }, + { + "epoch": 0.65, + "grad_norm": 0.30039862038000953, + "learning_rate": 1.1616469591666725e-05, + "loss": 0.0329, + "step": 2953 + }, + { + "epoch": 0.65, + "grad_norm": 0.2554436429253239, + "learning_rate": 1.1603553838738635e-05, + "loss": 0.0307, + "step": 2954 + }, + { + "epoch": 0.65, + "grad_norm": 0.3277463772091227, + "learning_rate": 1.1590642335303417e-05, + "loss": 0.0348, + "step": 2955 + }, + { + "epoch": 0.65, + "grad_norm": 0.28809162716177045, + "learning_rate": 1.1577735087895664e-05, + "loss": 0.0267, + "step": 2956 + }, + { + "epoch": 0.65, + "grad_norm": 0.2916931356045432, + "learning_rate": 1.1564832103047818e-05, + "loss": 0.0361, + "step": 2957 + }, + { + "epoch": 0.65, + "grad_norm": 0.2786283695535751, + "learning_rate": 1.1551933387290149e-05, + "loss": 0.0338, + "step": 2958 + }, + { + "epoch": 0.65, + "grad_norm": 0.3756814169764887, + "learning_rate": 1.1539038947150783e-05, + "loss": 0.0556, + "step": 2959 + }, + { + "epoch": 0.65, + "grad_norm": 0.3044743158896093, + "learning_rate": 1.152614878915567e-05, + "loss": 0.0357, + "step": 2960 + }, + { + "epoch": 0.65, + "grad_norm": 0.2520853479294366, + "learning_rate": 1.1513262919828603e-05, + "loss": 0.0318, + "step": 2961 + }, + { + "epoch": 0.65, + "grad_norm": 0.30014479788558823, + "learning_rate": 1.1500381345691192e-05, + "loss": 0.0374, + "step": 2962 + }, + { + "epoch": 0.65, + "grad_norm": 0.37943443373256613, + "learning_rate": 1.1487504073262886e-05, + "loss": 0.0519, + "step": 2963 + }, + { + "epoch": 0.65, + "grad_norm": 0.26828896988529727, + "learning_rate": 1.1474631109060957e-05, + "loss": 0.0327, + "step": 2964 + }, + { + "epoch": 0.65, + "grad_norm": 0.3523671584378296, + "learning_rate": 1.1461762459600476e-05, + "loss": 0.0442, + "step": 2965 + }, + { + "epoch": 0.65, + "grad_norm": 0.26789693565200207, + "learning_rate": 1.1448898131394364e-05, + "loss": 0.0293, + "step": 2966 + }, + { + "epoch": 0.65, + "grad_norm": 0.2616347989618018, + "learning_rate": 1.1436038130953317e-05, + "loss": 0.0356, + "step": 2967 + }, + { + "epoch": 0.65, + "grad_norm": 0.23439360385810168, + "learning_rate": 1.142318246478588e-05, + "loss": 0.0297, + "step": 2968 + }, + { + "epoch": 0.65, + "grad_norm": 0.27644543837366153, + "learning_rate": 1.1410331139398365e-05, + "loss": 0.0425, + "step": 2969 + }, + { + "epoch": 0.65, + "grad_norm": 0.31879390713338973, + "learning_rate": 1.1397484161294924e-05, + "loss": 0.0387, + "step": 2970 + }, + { + "epoch": 0.65, + "grad_norm": 0.2862545054093661, + "learning_rate": 1.138464153697747e-05, + "loss": 0.0386, + "step": 2971 + }, + { + "epoch": 0.65, + "grad_norm": 0.3208737689501007, + "learning_rate": 1.1371803272945759e-05, + "loss": 0.0434, + "step": 2972 + }, + { + "epoch": 0.65, + "grad_norm": 0.3828787259796405, + "learning_rate": 1.1358969375697297e-05, + "loss": 0.0419, + "step": 2973 + }, + { + "epoch": 0.65, + "grad_norm": 0.34019136737319144, + "learning_rate": 1.1346139851727412e-05, + "loss": 0.0394, + "step": 2974 + }, + { + "epoch": 0.65, + "grad_norm": 0.28714105859439243, + "learning_rate": 1.1333314707529188e-05, + "loss": 0.0325, + "step": 2975 + }, + { + "epoch": 0.65, + "grad_norm": 0.24338669906714824, + "learning_rate": 1.1320493949593528e-05, + "loss": 0.0251, + "step": 2976 + }, + { + "epoch": 0.65, + "grad_norm": 0.23301690332500383, + "learning_rate": 1.1307677584409076e-05, + "loss": 0.0299, + "step": 2977 + }, + { + "epoch": 0.65, + "grad_norm": 0.3014458336750867, + "learning_rate": 1.1294865618462294e-05, + "loss": 0.0355, + "step": 2978 + }, + { + "epoch": 0.65, + "grad_norm": 0.2848679116820537, + "learning_rate": 1.128205805823737e-05, + "loss": 0.0279, + "step": 2979 + }, + { + "epoch": 0.65, + "grad_norm": 0.3149477107873396, + "learning_rate": 1.1269254910216316e-05, + "loss": 0.0406, + "step": 2980 + }, + { + "epoch": 0.65, + "grad_norm": 0.3981554491624712, + "learning_rate": 1.1256456180878867e-05, + "loss": 0.0541, + "step": 2981 + }, + { + "epoch": 0.65, + "grad_norm": 0.3531659752357583, + "learning_rate": 1.1243661876702552e-05, + "loss": 0.0536, + "step": 2982 + }, + { + "epoch": 0.66, + "grad_norm": 0.3250805534889989, + "learning_rate": 1.1230872004162631e-05, + "loss": 0.0537, + "step": 2983 + }, + { + "epoch": 0.66, + "grad_norm": 0.2604980611756454, + "learning_rate": 1.1218086569732152e-05, + "loss": 0.0323, + "step": 2984 + }, + { + "epoch": 0.66, + "grad_norm": 0.26375732450122863, + "learning_rate": 1.1205305579881883e-05, + "loss": 0.0361, + "step": 2985 + }, + { + "epoch": 0.66, + "grad_norm": 0.2597553996891595, + "learning_rate": 1.1192529041080382e-05, + "loss": 0.0243, + "step": 2986 + }, + { + "epoch": 0.66, + "grad_norm": 0.30060744527767147, + "learning_rate": 1.1179756959793918e-05, + "loss": 0.0306, + "step": 2987 + }, + { + "epoch": 0.66, + "grad_norm": 0.35249120117934213, + "learning_rate": 1.1166989342486524e-05, + "loss": 0.0486, + "step": 2988 + }, + { + "epoch": 0.66, + "grad_norm": 0.26966911052098275, + "learning_rate": 1.1154226195619979e-05, + "loss": 0.0249, + "step": 2989 + }, + { + "epoch": 0.66, + "grad_norm": 0.3111691508172709, + "learning_rate": 1.1141467525653773e-05, + "loss": 0.0332, + "step": 2990 + }, + { + "epoch": 0.66, + "grad_norm": 0.2694851241517991, + "learning_rate": 1.1128713339045162e-05, + "loss": 0.0269, + "step": 2991 + }, + { + "epoch": 0.66, + "grad_norm": 0.4213832001964649, + "learning_rate": 1.1115963642249107e-05, + "loss": 0.0444, + "step": 2992 + }, + { + "epoch": 0.66, + "grad_norm": 0.2772025299513124, + "learning_rate": 1.110321844171832e-05, + "loss": 0.0401, + "step": 2993 + }, + { + "epoch": 0.66, + "grad_norm": 0.3647805520649611, + "learning_rate": 1.1090477743903212e-05, + "loss": 0.0505, + "step": 2994 + }, + { + "epoch": 0.66, + "grad_norm": 0.26937476051921777, + "learning_rate": 1.1077741555251938e-05, + "loss": 0.0311, + "step": 2995 + }, + { + "epoch": 0.66, + "grad_norm": 0.2771986729672004, + "learning_rate": 1.1065009882210352e-05, + "loss": 0.0404, + "step": 2996 + }, + { + "epoch": 0.66, + "grad_norm": 0.2800261839050899, + "learning_rate": 1.1052282731222035e-05, + "loss": 0.0341, + "step": 2997 + }, + { + "epoch": 0.66, + "grad_norm": 0.24857849312844407, + "learning_rate": 1.1039560108728277e-05, + "loss": 0.0315, + "step": 2998 + }, + { + "epoch": 0.66, + "grad_norm": 0.3018654689903294, + "learning_rate": 1.1026842021168088e-05, + "loss": 0.032, + "step": 2999 + }, + { + "epoch": 0.66, + "grad_norm": 0.3464088564074809, + "learning_rate": 1.101412847497815e-05, + "loss": 0.0424, + "step": 3000 + }, + { + "epoch": 0.66, + "grad_norm": 0.3691593750727993, + "learning_rate": 1.100141947659288e-05, + "loss": 0.0518, + "step": 3001 + }, + { + "epoch": 0.66, + "grad_norm": 0.3224610978512565, + "learning_rate": 1.0988715032444369e-05, + "loss": 0.0417, + "step": 3002 + }, + { + "epoch": 0.66, + "grad_norm": 0.2211300173153049, + "learning_rate": 1.0976015148962427e-05, + "loss": 0.0245, + "step": 3003 + }, + { + "epoch": 0.66, + "grad_norm": 0.3657974757425489, + "learning_rate": 1.0963319832574528e-05, + "loss": 0.0489, + "step": 3004 + }, + { + "epoch": 0.66, + "grad_norm": 0.2633445827705902, + "learning_rate": 1.0950629089705857e-05, + "loss": 0.028, + "step": 3005 + }, + { + "epoch": 0.66, + "grad_norm": 0.24596810539431577, + "learning_rate": 1.0937942926779279e-05, + "loss": 0.0301, + "step": 3006 + }, + { + "epoch": 0.66, + "grad_norm": 0.266475549896758, + "learning_rate": 1.0925261350215344e-05, + "loss": 0.0342, + "step": 3007 + }, + { + "epoch": 0.66, + "grad_norm": 0.31072261324263634, + "learning_rate": 1.091258436643226e-05, + "loss": 0.0427, + "step": 3008 + }, + { + "epoch": 0.66, + "grad_norm": 0.2857607873897929, + "learning_rate": 1.0899911981845946e-05, + "loss": 0.038, + "step": 3009 + }, + { + "epoch": 0.66, + "grad_norm": 0.24947990020506072, + "learning_rate": 1.0887244202869951e-05, + "loss": 0.0364, + "step": 3010 + }, + { + "epoch": 0.66, + "grad_norm": 0.27201095119687674, + "learning_rate": 1.0874581035915534e-05, + "loss": 0.0297, + "step": 3011 + }, + { + "epoch": 0.66, + "grad_norm": 0.2753170131482302, + "learning_rate": 1.0861922487391588e-05, + "loss": 0.0312, + "step": 3012 + }, + { + "epoch": 0.66, + "grad_norm": 0.29611178219925743, + "learning_rate": 1.0849268563704696e-05, + "loss": 0.034, + "step": 3013 + }, + { + "epoch": 0.66, + "grad_norm": 0.3435229722174912, + "learning_rate": 1.0836619271259072e-05, + "loss": 0.0441, + "step": 3014 + }, + { + "epoch": 0.66, + "grad_norm": 0.2658495654982007, + "learning_rate": 1.0823974616456607e-05, + "loss": 0.0338, + "step": 3015 + }, + { + "epoch": 0.66, + "grad_norm": 0.29094652981374264, + "learning_rate": 1.0811334605696837e-05, + "loss": 0.0371, + "step": 3016 + }, + { + "epoch": 0.66, + "grad_norm": 0.252277608889838, + "learning_rate": 1.0798699245376959e-05, + "loss": 0.0273, + "step": 3017 + }, + { + "epoch": 0.66, + "grad_norm": 0.2975109341359133, + "learning_rate": 1.078606854189179e-05, + "loss": 0.0383, + "step": 3018 + }, + { + "epoch": 0.66, + "grad_norm": 0.36072595267535207, + "learning_rate": 1.0773442501633822e-05, + "loss": 0.0433, + "step": 3019 + }, + { + "epoch": 0.66, + "grad_norm": 0.3628036271553774, + "learning_rate": 1.0760821130993157e-05, + "loss": 0.0625, + "step": 3020 + }, + { + "epoch": 0.66, + "grad_norm": 0.3527759379658487, + "learning_rate": 1.0748204436357562e-05, + "loss": 0.0415, + "step": 3021 + }, + { + "epoch": 0.66, + "grad_norm": 0.37273677699622976, + "learning_rate": 1.0735592424112404e-05, + "loss": 0.0464, + "step": 3022 + }, + { + "epoch": 0.66, + "grad_norm": 0.3031672797261527, + "learning_rate": 1.0722985100640717e-05, + "loss": 0.0437, + "step": 3023 + }, + { + "epoch": 0.66, + "grad_norm": 0.4999697348256345, + "learning_rate": 1.0710382472323145e-05, + "loss": 0.099, + "step": 3024 + }, + { + "epoch": 0.66, + "grad_norm": 0.29739183060223257, + "learning_rate": 1.0697784545537943e-05, + "loss": 0.0391, + "step": 3025 + }, + { + "epoch": 0.66, + "grad_norm": 0.3408796045707868, + "learning_rate": 1.0685191326661015e-05, + "loss": 0.0495, + "step": 3026 + }, + { + "epoch": 0.66, + "grad_norm": 0.3168906283615832, + "learning_rate": 1.0672602822065845e-05, + "loss": 0.0412, + "step": 3027 + }, + { + "epoch": 0.67, + "grad_norm": 0.22778330125223972, + "learning_rate": 1.0660019038123577e-05, + "loss": 0.0265, + "step": 3028 + }, + { + "epoch": 0.67, + "grad_norm": 0.31655710650685553, + "learning_rate": 1.0647439981202918e-05, + "loss": 0.0323, + "step": 3029 + }, + { + "epoch": 0.67, + "grad_norm": 0.2903265250999385, + "learning_rate": 1.0634865657670227e-05, + "loss": 0.0455, + "step": 3030 + }, + { + "epoch": 0.67, + "grad_norm": 0.2576086654583902, + "learning_rate": 1.0622296073889417e-05, + "loss": 0.0364, + "step": 3031 + }, + { + "epoch": 0.67, + "grad_norm": 0.2611284573747479, + "learning_rate": 1.0609731236222069e-05, + "loss": 0.0346, + "step": 3032 + }, + { + "epoch": 0.67, + "grad_norm": 0.2360144101401728, + "learning_rate": 1.0597171151027297e-05, + "loss": 0.0349, + "step": 3033 + }, + { + "epoch": 0.67, + "grad_norm": 0.2767069046268152, + "learning_rate": 1.058461582466185e-05, + "loss": 0.036, + "step": 3034 + }, + { + "epoch": 0.67, + "grad_norm": 0.29520680387895365, + "learning_rate": 1.0572065263480046e-05, + "loss": 0.0328, + "step": 3035 + }, + { + "epoch": 0.67, + "grad_norm": 0.2730848217792552, + "learning_rate": 1.0559519473833815e-05, + "loss": 0.0235, + "step": 3036 + }, + { + "epoch": 0.67, + "grad_norm": 0.3385652741735248, + "learning_rate": 1.0546978462072642e-05, + "loss": 0.0386, + "step": 3037 + }, + { + "epoch": 0.67, + "grad_norm": 0.2996355629707367, + "learning_rate": 1.0534442234543623e-05, + "loss": 0.0436, + "step": 3038 + }, + { + "epoch": 0.67, + "grad_norm": 0.3374086702363696, + "learning_rate": 1.0521910797591408e-05, + "loss": 0.0393, + "step": 3039 + }, + { + "epoch": 0.67, + "grad_norm": 0.34725073405930423, + "learning_rate": 1.0509384157558236e-05, + "loss": 0.0497, + "step": 3040 + }, + { + "epoch": 0.67, + "grad_norm": 0.3113323809669054, + "learning_rate": 1.0496862320783926e-05, + "loss": 0.043, + "step": 3041 + }, + { + "epoch": 0.67, + "grad_norm": 0.32109315459110344, + "learning_rate": 1.0484345293605853e-05, + "loss": 0.0342, + "step": 3042 + }, + { + "epoch": 0.67, + "grad_norm": 0.2767139456583379, + "learning_rate": 1.0471833082358954e-05, + "loss": 0.0296, + "step": 3043 + }, + { + "epoch": 0.67, + "grad_norm": 0.3111162424314804, + "learning_rate": 1.0459325693375746e-05, + "loss": 0.042, + "step": 3044 + }, + { + "epoch": 0.67, + "grad_norm": 0.3189332388597514, + "learning_rate": 1.0446823132986283e-05, + "loss": 0.0361, + "step": 3045 + }, + { + "epoch": 0.67, + "grad_norm": 0.31497175676644285, + "learning_rate": 1.0434325407518204e-05, + "loss": 0.0347, + "step": 3046 + }, + { + "epoch": 0.67, + "grad_norm": 0.2779880749191047, + "learning_rate": 1.0421832523296665e-05, + "loss": 0.0261, + "step": 3047 + }, + { + "epoch": 0.67, + "grad_norm": 0.31046564011343325, + "learning_rate": 1.04093444866444e-05, + "loss": 0.0403, + "step": 3048 + }, + { + "epoch": 0.67, + "grad_norm": 0.2831393860747274, + "learning_rate": 1.0396861303881691e-05, + "loss": 0.0286, + "step": 3049 + }, + { + "epoch": 0.67, + "grad_norm": 0.2727575294371441, + "learning_rate": 1.0384382981326336e-05, + "loss": 0.0359, + "step": 3050 + }, + { + "epoch": 0.67, + "grad_norm": 0.3663008312520523, + "learning_rate": 1.0371909525293709e-05, + "loss": 0.0523, + "step": 3051 + }, + { + "epoch": 0.67, + "grad_norm": 0.30116656010510984, + "learning_rate": 1.0359440942096682e-05, + "loss": 0.0309, + "step": 3052 + }, + { + "epoch": 0.67, + "grad_norm": 0.3353678619920413, + "learning_rate": 1.0346977238045699e-05, + "loss": 0.0397, + "step": 3053 + }, + { + "epoch": 0.67, + "grad_norm": 0.3399476298151822, + "learning_rate": 1.0334518419448703e-05, + "loss": 0.0396, + "step": 3054 + }, + { + "epoch": 0.67, + "grad_norm": 0.23916380456187727, + "learning_rate": 1.0322064492611195e-05, + "loss": 0.0299, + "step": 3055 + }, + { + "epoch": 0.67, + "grad_norm": 0.25943841618555563, + "learning_rate": 1.0309615463836162e-05, + "loss": 0.0241, + "step": 3056 + }, + { + "epoch": 0.67, + "grad_norm": 0.3557983798400247, + "learning_rate": 1.0297171339424148e-05, + "loss": 0.049, + "step": 3057 + }, + { + "epoch": 0.67, + "grad_norm": 0.324009885498815, + "learning_rate": 1.0284732125673198e-05, + "loss": 0.0409, + "step": 3058 + }, + { + "epoch": 0.67, + "grad_norm": 0.20160362060773568, + "learning_rate": 1.0272297828878881e-05, + "loss": 0.0209, + "step": 3059 + }, + { + "epoch": 0.67, + "grad_norm": 0.26653484585635645, + "learning_rate": 1.0259868455334259e-05, + "loss": 0.0284, + "step": 3060 + }, + { + "epoch": 0.67, + "grad_norm": 0.2883275328288837, + "learning_rate": 1.0247444011329928e-05, + "loss": 0.0343, + "step": 3061 + }, + { + "epoch": 0.67, + "grad_norm": 0.30784072020467396, + "learning_rate": 1.0235024503153956e-05, + "loss": 0.0337, + "step": 3062 + }, + { + "epoch": 0.67, + "grad_norm": 0.39213557944528954, + "learning_rate": 1.0222609937091952e-05, + "loss": 0.0571, + "step": 3063 + }, + { + "epoch": 0.67, + "grad_norm": 0.35248830540852255, + "learning_rate": 1.0210200319426988e-05, + "loss": 0.0289, + "step": 3064 + }, + { + "epoch": 0.67, + "grad_norm": 0.24872786988966386, + "learning_rate": 1.0197795656439662e-05, + "loss": 0.0281, + "step": 3065 + }, + { + "epoch": 0.67, + "grad_norm": 0.3033114439975192, + "learning_rate": 1.0185395954408031e-05, + "loss": 0.0372, + "step": 3066 + }, + { + "epoch": 0.67, + "grad_norm": 0.28971036273483797, + "learning_rate": 1.0173001219607683e-05, + "loss": 0.0481, + "step": 3067 + }, + { + "epoch": 0.67, + "grad_norm": 0.3235645357995982, + "learning_rate": 1.0160611458311651e-05, + "loss": 0.0369, + "step": 3068 + }, + { + "epoch": 0.67, + "grad_norm": 0.405914241250726, + "learning_rate": 1.0148226676790482e-05, + "loss": 0.047, + "step": 3069 + }, + { + "epoch": 0.67, + "grad_norm": 0.33128543565256136, + "learning_rate": 1.013584688131218e-05, + "loss": 0.0559, + "step": 3070 + }, + { + "epoch": 0.67, + "grad_norm": 0.2835106988154603, + "learning_rate": 1.0123472078142248e-05, + "loss": 0.0301, + "step": 3071 + }, + { + "epoch": 0.67, + "grad_norm": 0.3415746784045918, + "learning_rate": 1.011110227354363e-05, + "loss": 0.047, + "step": 3072 + }, + { + "epoch": 0.67, + "grad_norm": 0.2912476657806638, + "learning_rate": 1.0098737473776781e-05, + "loss": 0.0398, + "step": 3073 + }, + { + "epoch": 0.68, + "grad_norm": 0.24532364133573092, + "learning_rate": 1.0086377685099578e-05, + "loss": 0.0244, + "step": 3074 + }, + { + "epoch": 0.68, + "grad_norm": 0.32993961565654917, + "learning_rate": 1.0074022913767411e-05, + "loss": 0.0449, + "step": 3075 + }, + { + "epoch": 0.68, + "grad_norm": 0.3160682568379425, + "learning_rate": 1.006167316603309e-05, + "loss": 0.042, + "step": 3076 + }, + { + "epoch": 0.68, + "grad_norm": 0.36620874022778316, + "learning_rate": 1.0049328448146908e-05, + "loss": 0.0427, + "step": 3077 + }, + { + "epoch": 0.68, + "grad_norm": 0.2819488901785845, + "learning_rate": 1.0036988766356592e-05, + "loss": 0.0289, + "step": 3078 + }, + { + "epoch": 0.68, + "grad_norm": 0.3035661039266114, + "learning_rate": 1.0024654126907343e-05, + "loss": 0.0415, + "step": 3079 + }, + { + "epoch": 0.68, + "grad_norm": 0.2880209639389726, + "learning_rate": 1.0012324536041781e-05, + "loss": 0.0335, + "step": 3080 + }, + { + "epoch": 0.68, + "grad_norm": 0.2384137582520422, + "learning_rate": 1.0000000000000006e-05, + "loss": 0.0306, + "step": 3081 + }, + { + "epoch": 0.68, + "grad_norm": 0.3243717499020729, + "learning_rate": 9.987680525019521e-06, + "loss": 0.0373, + "step": 3082 + }, + { + "epoch": 0.68, + "grad_norm": 0.37508876067967445, + "learning_rate": 9.975366117335301e-06, + "loss": 0.0556, + "step": 3083 + }, + { + "epoch": 0.68, + "grad_norm": 0.25705657300879875, + "learning_rate": 9.96305678317975e-06, + "loss": 0.0302, + "step": 3084 + }, + { + "epoch": 0.68, + "grad_norm": 0.2486956514718051, + "learning_rate": 9.950752528782679e-06, + "loss": 0.0277, + "step": 3085 + }, + { + "epoch": 0.68, + "grad_norm": 0.31257510474818884, + "learning_rate": 9.938453360371363e-06, + "loss": 0.0405, + "step": 3086 + }, + { + "epoch": 0.68, + "grad_norm": 0.2958070972835022, + "learning_rate": 9.926159284170471e-06, + "loss": 0.0378, + "step": 3087 + }, + { + "epoch": 0.68, + "grad_norm": 0.2978037311107582, + "learning_rate": 9.913870306402129e-06, + "loss": 0.0359, + "step": 3088 + }, + { + "epoch": 0.68, + "grad_norm": 0.24685634047575178, + "learning_rate": 9.901586433285845e-06, + "loss": 0.0246, + "step": 3089 + }, + { + "epoch": 0.68, + "grad_norm": 0.29914513770600243, + "learning_rate": 9.889307671038579e-06, + "loss": 0.0297, + "step": 3090 + }, + { + "epoch": 0.68, + "grad_norm": 0.25373522060085896, + "learning_rate": 9.877034025874675e-06, + "loss": 0.0279, + "step": 3091 + }, + { + "epoch": 0.68, + "grad_norm": 0.28816754753091417, + "learning_rate": 9.864765504005901e-06, + "loss": 0.0401, + "step": 3092 + }, + { + "epoch": 0.68, + "grad_norm": 0.2882007434774494, + "learning_rate": 9.852502111641438e-06, + "loss": 0.0426, + "step": 3093 + }, + { + "epoch": 0.68, + "grad_norm": 0.25232177201688216, + "learning_rate": 9.840243854987868e-06, + "loss": 0.0345, + "step": 3094 + }, + { + "epoch": 0.68, + "grad_norm": 0.24840152348033523, + "learning_rate": 9.827990740249156e-06, + "loss": 0.03, + "step": 3095 + }, + { + "epoch": 0.68, + "grad_norm": 0.2506674410256598, + "learning_rate": 9.815742773626693e-06, + "loss": 0.0342, + "step": 3096 + }, + { + "epoch": 0.68, + "grad_norm": 0.23513415096530796, + "learning_rate": 9.803499961319234e-06, + "loss": 0.0199, + "step": 3097 + }, + { + "epoch": 0.68, + "grad_norm": 0.3594340193010657, + "learning_rate": 9.791262309522959e-06, + "loss": 0.0534, + "step": 3098 + }, + { + "epoch": 0.68, + "grad_norm": 0.26262614250660854, + "learning_rate": 9.779029824431403e-06, + "loss": 0.0328, + "step": 3099 + }, + { + "epoch": 0.68, + "grad_norm": 0.3819389106850532, + "learning_rate": 9.766802512235507e-06, + "loss": 0.0734, + "step": 3100 + }, + { + "epoch": 0.68, + "grad_norm": 0.3023854983385063, + "learning_rate": 9.75458037912359e-06, + "loss": 0.0339, + "step": 3101 + }, + { + "epoch": 0.68, + "grad_norm": 0.3121884819870665, + "learning_rate": 9.742363431281356e-06, + "loss": 0.0437, + "step": 3102 + }, + { + "epoch": 0.68, + "grad_norm": 0.25550331208570076, + "learning_rate": 9.73015167489186e-06, + "loss": 0.0253, + "step": 3103 + }, + { + "epoch": 0.68, + "grad_norm": 0.3430055880911585, + "learning_rate": 9.717945116135568e-06, + "loss": 0.0373, + "step": 3104 + }, + { + "epoch": 0.68, + "grad_norm": 0.31624401313030304, + "learning_rate": 9.705743761190273e-06, + "loss": 0.0406, + "step": 3105 + }, + { + "epoch": 0.68, + "grad_norm": 0.2834614200655747, + "learning_rate": 9.693547616231173e-06, + "loss": 0.0361, + "step": 3106 + }, + { + "epoch": 0.68, + "grad_norm": 0.3513387179755282, + "learning_rate": 9.681356687430798e-06, + "loss": 0.0434, + "step": 3107 + }, + { + "epoch": 0.68, + "grad_norm": 0.28196666062468867, + "learning_rate": 9.669170980959063e-06, + "loss": 0.0335, + "step": 3108 + }, + { + "epoch": 0.68, + "grad_norm": 0.33908513780252647, + "learning_rate": 9.656990502983216e-06, + "loss": 0.0317, + "step": 3109 + }, + { + "epoch": 0.68, + "grad_norm": 0.313221265991117, + "learning_rate": 9.644815259667881e-06, + "loss": 0.035, + "step": 3110 + }, + { + "epoch": 0.68, + "grad_norm": 0.26410877623606116, + "learning_rate": 9.632645257175027e-06, + "loss": 0.0354, + "step": 3111 + }, + { + "epoch": 0.68, + "grad_norm": 0.28774523227496873, + "learning_rate": 9.620480501663954e-06, + "loss": 0.0274, + "step": 3112 + }, + { + "epoch": 0.68, + "grad_norm": 0.3178335188120151, + "learning_rate": 9.608320999291333e-06, + "loss": 0.0353, + "step": 3113 + }, + { + "epoch": 0.68, + "grad_norm": 0.3229114765344905, + "learning_rate": 9.59616675621115e-06, + "loss": 0.0352, + "step": 3114 + }, + { + "epoch": 0.68, + "grad_norm": 0.27446791096771567, + "learning_rate": 9.58401777857475e-06, + "loss": 0.026, + "step": 3115 + }, + { + "epoch": 0.68, + "grad_norm": 0.28892722701999873, + "learning_rate": 9.571874072530809e-06, + "loss": 0.0355, + "step": 3116 + }, + { + "epoch": 0.68, + "grad_norm": 0.31541426299089276, + "learning_rate": 9.559735644225316e-06, + "loss": 0.0263, + "step": 3117 + }, + { + "epoch": 0.68, + "grad_norm": 0.25088864578936493, + "learning_rate": 9.547602499801616e-06, + "loss": 0.026, + "step": 3118 + }, + { + "epoch": 0.69, + "grad_norm": 0.30759405624372854, + "learning_rate": 9.53547464540037e-06, + "loss": 0.0459, + "step": 3119 + }, + { + "epoch": 0.69, + "grad_norm": 0.2569584862348946, + "learning_rate": 9.523352087159548e-06, + "loss": 0.0328, + "step": 3120 + }, + { + "epoch": 0.69, + "grad_norm": 0.2631748601423334, + "learning_rate": 9.511234831214464e-06, + "loss": 0.0301, + "step": 3121 + }, + { + "epoch": 0.69, + "grad_norm": 0.22610547568516606, + "learning_rate": 9.499122883697724e-06, + "loss": 0.031, + "step": 3122 + }, + { + "epoch": 0.69, + "grad_norm": 0.30749758862891285, + "learning_rate": 9.487016250739269e-06, + "loss": 0.0391, + "step": 3123 + }, + { + "epoch": 0.69, + "grad_norm": 0.2865122859893621, + "learning_rate": 9.474914938466328e-06, + "loss": 0.0266, + "step": 3124 + }, + { + "epoch": 0.69, + "grad_norm": 0.3161344066840338, + "learning_rate": 9.462818953003465e-06, + "loss": 0.0372, + "step": 3125 + }, + { + "epoch": 0.69, + "grad_norm": 0.3230785751011246, + "learning_rate": 9.45072830047251e-06, + "loss": 0.0471, + "step": 3126 + }, + { + "epoch": 0.69, + "grad_norm": 0.3371732737427641, + "learning_rate": 9.438642986992641e-06, + "loss": 0.0389, + "step": 3127 + }, + { + "epoch": 0.69, + "grad_norm": 0.25125590412357773, + "learning_rate": 9.426563018680293e-06, + "loss": 0.0219, + "step": 3128 + }, + { + "epoch": 0.69, + "grad_norm": 0.3338988345174679, + "learning_rate": 9.414488401649227e-06, + "loss": 0.0361, + "step": 3129 + }, + { + "epoch": 0.69, + "grad_norm": 0.3217851467083712, + "learning_rate": 9.40241914201046e-06, + "loss": 0.0336, + "step": 3130 + }, + { + "epoch": 0.69, + "grad_norm": 0.35707314994576855, + "learning_rate": 9.390355245872337e-06, + "loss": 0.0482, + "step": 3131 + }, + { + "epoch": 0.69, + "grad_norm": 0.2898732748699959, + "learning_rate": 9.378296719340459e-06, + "loss": 0.0295, + "step": 3132 + }, + { + "epoch": 0.69, + "grad_norm": 0.2828287214822192, + "learning_rate": 9.366243568517726e-06, + "loss": 0.0373, + "step": 3133 + }, + { + "epoch": 0.69, + "grad_norm": 0.2516824465950768, + "learning_rate": 9.354195799504305e-06, + "loss": 0.0264, + "step": 3134 + }, + { + "epoch": 0.69, + "grad_norm": 0.2505504389748185, + "learning_rate": 9.342153418397647e-06, + "loss": 0.0372, + "step": 3135 + }, + { + "epoch": 0.69, + "grad_norm": 0.29056852673906025, + "learning_rate": 9.330116431292478e-06, + "loss": 0.0407, + "step": 3136 + }, + { + "epoch": 0.69, + "grad_norm": 0.26675860352481984, + "learning_rate": 9.318084844280798e-06, + "loss": 0.0393, + "step": 3137 + }, + { + "epoch": 0.69, + "grad_norm": 0.26130883298179075, + "learning_rate": 9.306058663451852e-06, + "loss": 0.0258, + "step": 3138 + }, + { + "epoch": 0.69, + "grad_norm": 0.2881059025457402, + "learning_rate": 9.294037894892178e-06, + "loss": 0.0365, + "step": 3139 + }, + { + "epoch": 0.69, + "grad_norm": 0.3193162684551438, + "learning_rate": 9.28202254468555e-06, + "loss": 0.0387, + "step": 3140 + }, + { + "epoch": 0.69, + "grad_norm": 0.31524669429337604, + "learning_rate": 9.270012618913018e-06, + "loss": 0.0431, + "step": 3141 + }, + { + "epoch": 0.69, + "grad_norm": 0.29463283952180785, + "learning_rate": 9.258008123652868e-06, + "loss": 0.0413, + "step": 3142 + }, + { + "epoch": 0.69, + "grad_norm": 0.3625294895344755, + "learning_rate": 9.246009064980657e-06, + "loss": 0.0479, + "step": 3143 + }, + { + "epoch": 0.69, + "grad_norm": 0.29733082010017053, + "learning_rate": 9.23401544896919e-06, + "loss": 0.0338, + "step": 3144 + }, + { + "epoch": 0.69, + "grad_norm": 0.2953035650970452, + "learning_rate": 9.22202728168849e-06, + "loss": 0.0384, + "step": 3145 + }, + { + "epoch": 0.69, + "grad_norm": 0.26778874148341164, + "learning_rate": 9.210044569205863e-06, + "loss": 0.0262, + "step": 3146 + }, + { + "epoch": 0.69, + "grad_norm": 0.2789062088611049, + "learning_rate": 9.198067317585816e-06, + "loss": 0.0368, + "step": 3147 + }, + { + "epoch": 0.69, + "grad_norm": 0.27855798043463303, + "learning_rate": 9.186095532890121e-06, + "loss": 0.0423, + "step": 3148 + }, + { + "epoch": 0.69, + "grad_norm": 0.24564879928196806, + "learning_rate": 9.174129221177762e-06, + "loss": 0.034, + "step": 3149 + }, + { + "epoch": 0.69, + "grad_norm": 0.2798131159847416, + "learning_rate": 9.162168388504972e-06, + "loss": 0.0291, + "step": 3150 + }, + { + "epoch": 0.69, + "grad_norm": 0.29632821676191, + "learning_rate": 9.150213040925193e-06, + "loss": 0.0328, + "step": 3151 + }, + { + "epoch": 0.69, + "grad_norm": 0.42087156559602495, + "learning_rate": 9.138263184489104e-06, + "loss": 0.0482, + "step": 3152 + }, + { + "epoch": 0.69, + "grad_norm": 0.30010754823131197, + "learning_rate": 9.1263188252446e-06, + "loss": 0.0341, + "step": 3153 + }, + { + "epoch": 0.69, + "grad_norm": 0.3008349234511671, + "learning_rate": 9.114379969236802e-06, + "loss": 0.0331, + "step": 3154 + }, + { + "epoch": 0.69, + "grad_norm": 0.2628119822938154, + "learning_rate": 9.102446622508025e-06, + "loss": 0.0341, + "step": 3155 + }, + { + "epoch": 0.69, + "grad_norm": 0.2439599627576946, + "learning_rate": 9.090518791097822e-06, + "loss": 0.0304, + "step": 3156 + }, + { + "epoch": 0.69, + "grad_norm": 0.3738203169466667, + "learning_rate": 9.078596481042927e-06, + "loss": 0.0526, + "step": 3157 + }, + { + "epoch": 0.69, + "grad_norm": 0.28116721683341717, + "learning_rate": 9.066679698377311e-06, + "loss": 0.0219, + "step": 3158 + }, + { + "epoch": 0.69, + "grad_norm": 0.26721846640871477, + "learning_rate": 9.054768449132115e-06, + "loss": 0.0354, + "step": 3159 + }, + { + "epoch": 0.69, + "grad_norm": 0.30702480620011646, + "learning_rate": 9.042862739335707e-06, + "loss": 0.0344, + "step": 3160 + }, + { + "epoch": 0.69, + "grad_norm": 0.26577193740057614, + "learning_rate": 9.030962575013622e-06, + "loss": 0.0343, + "step": 3161 + }, + { + "epoch": 0.69, + "grad_norm": 0.27656957541860233, + "learning_rate": 9.019067962188634e-06, + "loss": 0.0274, + "step": 3162 + }, + { + "epoch": 0.69, + "grad_norm": 0.24374774601701155, + "learning_rate": 9.007178906880655e-06, + "loss": 0.0234, + "step": 3163 + }, + { + "epoch": 0.69, + "grad_norm": 0.2787367800070968, + "learning_rate": 8.995295415106829e-06, + "loss": 0.0299, + "step": 3164 + }, + { + "epoch": 0.7, + "grad_norm": 0.2559452349619959, + "learning_rate": 8.983417492881443e-06, + "loss": 0.0324, + "step": 3165 + }, + { + "epoch": 0.7, + "grad_norm": 0.2008699540297915, + "learning_rate": 8.971545146216005e-06, + "loss": 0.0235, + "step": 3166 + }, + { + "epoch": 0.7, + "grad_norm": 0.23118591258238033, + "learning_rate": 8.959678381119166e-06, + "loss": 0.0305, + "step": 3167 + }, + { + "epoch": 0.7, + "grad_norm": 0.32931410946889816, + "learning_rate": 8.947817203596785e-06, + "loss": 0.0272, + "step": 3168 + }, + { + "epoch": 0.7, + "grad_norm": 0.27844357354404126, + "learning_rate": 8.935961619651859e-06, + "loss": 0.0347, + "step": 3169 + }, + { + "epoch": 0.7, + "grad_norm": 0.3423909501339809, + "learning_rate": 8.924111635284582e-06, + "loss": 0.0372, + "step": 3170 + }, + { + "epoch": 0.7, + "grad_norm": 0.36290249512540756, + "learning_rate": 8.91226725649231e-06, + "loss": 0.0532, + "step": 3171 + }, + { + "epoch": 0.7, + "grad_norm": 0.2738942418836486, + "learning_rate": 8.900428489269541e-06, + "loss": 0.0275, + "step": 3172 + }, + { + "epoch": 0.7, + "grad_norm": 0.3044333253627893, + "learning_rate": 8.888595339607961e-06, + "loss": 0.0354, + "step": 3173 + }, + { + "epoch": 0.7, + "grad_norm": 0.22113366457544695, + "learning_rate": 8.876767813496388e-06, + "loss": 0.0244, + "step": 3174 + }, + { + "epoch": 0.7, + "grad_norm": 0.33908047958026905, + "learning_rate": 8.86494591692081e-06, + "loss": 0.0385, + "step": 3175 + }, + { + "epoch": 0.7, + "grad_norm": 0.2877423063537248, + "learning_rate": 8.85312965586437e-06, + "loss": 0.0375, + "step": 3176 + }, + { + "epoch": 0.7, + "grad_norm": 0.29744477556872667, + "learning_rate": 8.841319036307334e-06, + "loss": 0.0412, + "step": 3177 + }, + { + "epoch": 0.7, + "grad_norm": 0.39751737601611026, + "learning_rate": 8.829514064227138e-06, + "loss": 0.0539, + "step": 3178 + }, + { + "epoch": 0.7, + "grad_norm": 0.28259364348374433, + "learning_rate": 8.817714745598358e-06, + "loss": 0.0314, + "step": 3179 + }, + { + "epoch": 0.7, + "grad_norm": 0.25615170210053695, + "learning_rate": 8.805921086392686e-06, + "loss": 0.0305, + "step": 3180 + }, + { + "epoch": 0.7, + "grad_norm": 0.27951022982259094, + "learning_rate": 8.79413309257898e-06, + "loss": 0.0299, + "step": 3181 + }, + { + "epoch": 0.7, + "grad_norm": 0.27529645892553, + "learning_rate": 8.782350770123202e-06, + "loss": 0.033, + "step": 3182 + }, + { + "epoch": 0.7, + "grad_norm": 0.27557445369722716, + "learning_rate": 8.770574124988474e-06, + "loss": 0.0348, + "step": 3183 + }, + { + "epoch": 0.7, + "grad_norm": 0.2881043250471307, + "learning_rate": 8.758803163135008e-06, + "loss": 0.0392, + "step": 3184 + }, + { + "epoch": 0.7, + "grad_norm": 0.25932623335834537, + "learning_rate": 8.74703789052018e-06, + "loss": 0.0322, + "step": 3185 + }, + { + "epoch": 0.7, + "grad_norm": 0.28605773253215566, + "learning_rate": 8.73527831309844e-06, + "loss": 0.0305, + "step": 3186 + }, + { + "epoch": 0.7, + "grad_norm": 0.3632188353707027, + "learning_rate": 8.723524436821418e-06, + "loss": 0.0359, + "step": 3187 + }, + { + "epoch": 0.7, + "grad_norm": 0.30155488632157246, + "learning_rate": 8.711776267637794e-06, + "loss": 0.0304, + "step": 3188 + }, + { + "epoch": 0.7, + "grad_norm": 0.29998057173364495, + "learning_rate": 8.700033811493407e-06, + "loss": 0.0343, + "step": 3189 + }, + { + "epoch": 0.7, + "grad_norm": 0.3343495163164025, + "learning_rate": 8.688297074331171e-06, + "loss": 0.0513, + "step": 3190 + }, + { + "epoch": 0.7, + "grad_norm": 0.2502815627983121, + "learning_rate": 8.676566062091135e-06, + "loss": 0.0284, + "step": 3191 + }, + { + "epoch": 0.7, + "grad_norm": 0.27547491420485215, + "learning_rate": 8.66484078071042e-06, + "loss": 0.0343, + "step": 3192 + }, + { + "epoch": 0.7, + "grad_norm": 0.3151790461619625, + "learning_rate": 8.653121236123278e-06, + "loss": 0.0319, + "step": 3193 + }, + { + "epoch": 0.7, + "grad_norm": 0.2732800984344735, + "learning_rate": 8.641407434261031e-06, + "loss": 0.0316, + "step": 3194 + }, + { + "epoch": 0.7, + "grad_norm": 0.3315842255890883, + "learning_rate": 8.62969938105211e-06, + "loss": 0.0378, + "step": 3195 + }, + { + "epoch": 0.7, + "grad_norm": 0.26288651665797536, + "learning_rate": 8.617997082422031e-06, + "loss": 0.0311, + "step": 3196 + }, + { + "epoch": 0.7, + "grad_norm": 0.2804867268488633, + "learning_rate": 8.606300544293412e-06, + "loss": 0.0364, + "step": 3197 + }, + { + "epoch": 0.7, + "grad_norm": 0.34573039793205407, + "learning_rate": 8.594609772585922e-06, + "loss": 0.0449, + "step": 3198 + }, + { + "epoch": 0.7, + "grad_norm": 0.24380722932954038, + "learning_rate": 8.582924773216353e-06, + "loss": 0.0244, + "step": 3199 + }, + { + "epoch": 0.7, + "grad_norm": 0.27146298166414884, + "learning_rate": 8.571245552098533e-06, + "loss": 0.0351, + "step": 3200 + }, + { + "epoch": 0.7, + "grad_norm": 0.291641356300972, + "learning_rate": 8.559572115143406e-06, + "loss": 0.0251, + "step": 3201 + }, + { + "epoch": 0.7, + "grad_norm": 0.2927027066573506, + "learning_rate": 8.547904468258957e-06, + "loss": 0.0287, + "step": 3202 + }, + { + "epoch": 0.7, + "grad_norm": 0.26510552491525224, + "learning_rate": 8.536242617350265e-06, + "loss": 0.0352, + "step": 3203 + }, + { + "epoch": 0.7, + "grad_norm": 0.286560637094647, + "learning_rate": 8.524586568319451e-06, + "loss": 0.041, + "step": 3204 + }, + { + "epoch": 0.7, + "grad_norm": 0.2858820001489877, + "learning_rate": 8.51293632706572e-06, + "loss": 0.0278, + "step": 3205 + }, + { + "epoch": 0.7, + "grad_norm": 0.38231843088964895, + "learning_rate": 8.501291899485337e-06, + "loss": 0.0433, + "step": 3206 + }, + { + "epoch": 0.7, + "grad_norm": 0.4060582662170565, + "learning_rate": 8.489653291471607e-06, + "loss": 0.0549, + "step": 3207 + }, + { + "epoch": 0.7, + "grad_norm": 0.28511768237017565, + "learning_rate": 8.47802050891491e-06, + "loss": 0.035, + "step": 3208 + }, + { + "epoch": 0.7, + "grad_norm": 0.2831822880939062, + "learning_rate": 8.466393557702659e-06, + "loss": 0.0385, + "step": 3209 + }, + { + "epoch": 0.71, + "grad_norm": 0.3482690938759456, + "learning_rate": 8.454772443719339e-06, + "loss": 0.0435, + "step": 3210 + }, + { + "epoch": 0.71, + "grad_norm": 0.2907088281881407, + "learning_rate": 8.443157172846448e-06, + "loss": 0.0307, + "step": 3211 + }, + { + "epoch": 0.71, + "grad_norm": 0.26492070343199503, + "learning_rate": 8.43154775096256e-06, + "loss": 0.0347, + "step": 3212 + }, + { + "epoch": 0.71, + "grad_norm": 0.3079662703787326, + "learning_rate": 8.419944183943266e-06, + "loss": 0.0357, + "step": 3213 + }, + { + "epoch": 0.71, + "grad_norm": 0.2970429372118634, + "learning_rate": 8.408346477661218e-06, + "loss": 0.029, + "step": 3214 + }, + { + "epoch": 0.71, + "grad_norm": 0.295109165165925, + "learning_rate": 8.39675463798607e-06, + "loss": 0.0315, + "step": 3215 + }, + { + "epoch": 0.71, + "grad_norm": 0.26527102109089074, + "learning_rate": 8.385168670784532e-06, + "loss": 0.0337, + "step": 3216 + }, + { + "epoch": 0.71, + "grad_norm": 0.3220551699472881, + "learning_rate": 8.373588581920325e-06, + "loss": 0.0315, + "step": 3217 + }, + { + "epoch": 0.71, + "grad_norm": 0.2518849539387254, + "learning_rate": 8.362014377254213e-06, + "loss": 0.0275, + "step": 3218 + }, + { + "epoch": 0.71, + "grad_norm": 0.2867164128387573, + "learning_rate": 8.35044606264396e-06, + "loss": 0.035, + "step": 3219 + }, + { + "epoch": 0.71, + "grad_norm": 0.3426814485669959, + "learning_rate": 8.338883643944375e-06, + "loss": 0.0445, + "step": 3220 + }, + { + "epoch": 0.71, + "grad_norm": 0.22337205335160346, + "learning_rate": 8.327327127007247e-06, + "loss": 0.0261, + "step": 3221 + }, + { + "epoch": 0.71, + "grad_norm": 0.31098813071247333, + "learning_rate": 8.315776517681428e-06, + "loss": 0.0372, + "step": 3222 + }, + { + "epoch": 0.71, + "grad_norm": 0.27028504987487467, + "learning_rate": 8.304231821812733e-06, + "loss": 0.0302, + "step": 3223 + }, + { + "epoch": 0.71, + "grad_norm": 0.3178525060794622, + "learning_rate": 8.292693045244016e-06, + "loss": 0.0364, + "step": 3224 + }, + { + "epoch": 0.71, + "grad_norm": 0.3628950219355067, + "learning_rate": 8.281160193815108e-06, + "loss": 0.045, + "step": 3225 + }, + { + "epoch": 0.71, + "grad_norm": 0.44063406048921605, + "learning_rate": 8.269633273362872e-06, + "loss": 0.0511, + "step": 3226 + }, + { + "epoch": 0.71, + "grad_norm": 0.2285235476954415, + "learning_rate": 8.258112289721134e-06, + "loss": 0.021, + "step": 3227 + }, + { + "epoch": 0.71, + "grad_norm": 0.3376012235303488, + "learning_rate": 8.246597248720756e-06, + "loss": 0.0408, + "step": 3228 + }, + { + "epoch": 0.71, + "grad_norm": 0.2039304225668567, + "learning_rate": 8.23508815618955e-06, + "loss": 0.0212, + "step": 3229 + }, + { + "epoch": 0.71, + "grad_norm": 0.357302119303284, + "learning_rate": 8.22358501795235e-06, + "loss": 0.0481, + "step": 3230 + }, + { + "epoch": 0.71, + "grad_norm": 0.31132384935073115, + "learning_rate": 8.212087839830968e-06, + "loss": 0.0267, + "step": 3231 + }, + { + "epoch": 0.71, + "grad_norm": 0.2606308660144364, + "learning_rate": 8.200596627644187e-06, + "loss": 0.0324, + "step": 3232 + }, + { + "epoch": 0.71, + "grad_norm": 0.30183494841210046, + "learning_rate": 8.189111387207782e-06, + "loss": 0.0378, + "step": 3233 + }, + { + "epoch": 0.71, + "grad_norm": 0.2965052083452161, + "learning_rate": 8.177632124334513e-06, + "loss": 0.05, + "step": 3234 + }, + { + "epoch": 0.71, + "grad_norm": 0.3487436348153068, + "learning_rate": 8.16615884483409e-06, + "loss": 0.0393, + "step": 3235 + }, + { + "epoch": 0.71, + "grad_norm": 0.24122902188482195, + "learning_rate": 8.154691554513228e-06, + "loss": 0.0295, + "step": 3236 + }, + { + "epoch": 0.71, + "grad_norm": 0.3403639155904013, + "learning_rate": 8.143230259175574e-06, + "loss": 0.0389, + "step": 3237 + }, + { + "epoch": 0.71, + "grad_norm": 0.2303562859744233, + "learning_rate": 8.13177496462177e-06, + "loss": 0.0251, + "step": 3238 + }, + { + "epoch": 0.71, + "grad_norm": 0.28828770958504507, + "learning_rate": 8.120325676649416e-06, + "loss": 0.0324, + "step": 3239 + }, + { + "epoch": 0.71, + "grad_norm": 0.3063426422594093, + "learning_rate": 8.108882401053055e-06, + "loss": 0.0243, + "step": 3240 + }, + { + "epoch": 0.71, + "grad_norm": 0.31969377026789886, + "learning_rate": 8.09744514362421e-06, + "loss": 0.0453, + "step": 3241 + }, + { + "epoch": 0.71, + "grad_norm": 0.28191090126797036, + "learning_rate": 8.086013910151334e-06, + "loss": 0.0288, + "step": 3242 + }, + { + "epoch": 0.71, + "grad_norm": 0.2359073376480803, + "learning_rate": 8.07458870641986e-06, + "loss": 0.0261, + "step": 3243 + }, + { + "epoch": 0.71, + "grad_norm": 0.31954006938375107, + "learning_rate": 8.063169538212139e-06, + "loss": 0.0395, + "step": 3244 + }, + { + "epoch": 0.71, + "grad_norm": 0.2573348880995303, + "learning_rate": 8.051756411307494e-06, + "loss": 0.0297, + "step": 3245 + }, + { + "epoch": 0.71, + "grad_norm": 0.22732547837337838, + "learning_rate": 8.040349331482167e-06, + "loss": 0.0315, + "step": 3246 + }, + { + "epoch": 0.71, + "grad_norm": 0.2483432531029029, + "learning_rate": 8.028948304509356e-06, + "loss": 0.0351, + "step": 3247 + }, + { + "epoch": 0.71, + "grad_norm": 0.36573764989291757, + "learning_rate": 8.017553336159192e-06, + "loss": 0.0394, + "step": 3248 + }, + { + "epoch": 0.71, + "grad_norm": 0.2565588816574629, + "learning_rate": 8.006164432198747e-06, + "loss": 0.0366, + "step": 3249 + }, + { + "epoch": 0.71, + "grad_norm": 0.28825887887882923, + "learning_rate": 7.994781598391995e-06, + "loss": 0.0302, + "step": 3250 + }, + { + "epoch": 0.71, + "grad_norm": 0.3507524907614378, + "learning_rate": 7.983404840499882e-06, + "loss": 0.0483, + "step": 3251 + }, + { + "epoch": 0.71, + "grad_norm": 0.2725297711015889, + "learning_rate": 7.972034164280231e-06, + "loss": 0.0336, + "step": 3252 + }, + { + "epoch": 0.71, + "grad_norm": 0.2762138156797559, + "learning_rate": 7.96066957548783e-06, + "loss": 0.0378, + "step": 3253 + }, + { + "epoch": 0.71, + "grad_norm": 0.2705830355830464, + "learning_rate": 7.949311079874352e-06, + "loss": 0.0311, + "step": 3254 + }, + { + "epoch": 0.71, + "grad_norm": 0.3045217601223277, + "learning_rate": 7.937958683188407e-06, + "loss": 0.0322, + "step": 3255 + }, + { + "epoch": 0.72, + "grad_norm": 0.26781638181581713, + "learning_rate": 7.926612391175516e-06, + "loss": 0.0326, + "step": 3256 + }, + { + "epoch": 0.72, + "grad_norm": 0.3282592589589826, + "learning_rate": 7.915272209578112e-06, + "loss": 0.0357, + "step": 3257 + }, + { + "epoch": 0.72, + "grad_norm": 0.25635829358510154, + "learning_rate": 7.903938144135515e-06, + "loss": 0.0265, + "step": 3258 + }, + { + "epoch": 0.72, + "grad_norm": 0.2292502081203452, + "learning_rate": 7.892610200583979e-06, + "loss": 0.0252, + "step": 3259 + }, + { + "epoch": 0.72, + "grad_norm": 0.2563374980548708, + "learning_rate": 7.881288384656634e-06, + "loss": 0.0226, + "step": 3260 + }, + { + "epoch": 0.72, + "grad_norm": 0.3228283910172808, + "learning_rate": 7.869972702083532e-06, + "loss": 0.0306, + "step": 3261 + }, + { + "epoch": 0.72, + "grad_norm": 0.2937913662617634, + "learning_rate": 7.8586631585916e-06, + "loss": 0.0297, + "step": 3262 + }, + { + "epoch": 0.72, + "grad_norm": 0.27880336936959554, + "learning_rate": 7.847359759904675e-06, + "loss": 0.0303, + "step": 3263 + }, + { + "epoch": 0.72, + "grad_norm": 0.2654205404899196, + "learning_rate": 7.836062511743468e-06, + "loss": 0.0291, + "step": 3264 + }, + { + "epoch": 0.72, + "grad_norm": 0.35197320895624984, + "learning_rate": 7.824771419825588e-06, + "loss": 0.0528, + "step": 3265 + }, + { + "epoch": 0.72, + "grad_norm": 0.3555739110978662, + "learning_rate": 7.813486489865534e-06, + "loss": 0.0394, + "step": 3266 + }, + { + "epoch": 0.72, + "grad_norm": 0.27169094871366306, + "learning_rate": 7.802207727574665e-06, + "loss": 0.029, + "step": 3267 + }, + { + "epoch": 0.72, + "grad_norm": 0.24856026153573255, + "learning_rate": 7.790935138661246e-06, + "loss": 0.0354, + "step": 3268 + }, + { + "epoch": 0.72, + "grad_norm": 0.2589370304520512, + "learning_rate": 7.779668728830389e-06, + "loss": 0.0287, + "step": 3269 + }, + { + "epoch": 0.72, + "grad_norm": 0.2636273240492979, + "learning_rate": 7.768408503784108e-06, + "loss": 0.0259, + "step": 3270 + }, + { + "epoch": 0.72, + "grad_norm": 0.27505692087975914, + "learning_rate": 7.757154469221257e-06, + "loss": 0.035, + "step": 3271 + }, + { + "epoch": 0.72, + "grad_norm": 0.26944460374515017, + "learning_rate": 7.745906630837586e-06, + "loss": 0.0297, + "step": 3272 + }, + { + "epoch": 0.72, + "grad_norm": 0.2780305927222107, + "learning_rate": 7.734664994325672e-06, + "loss": 0.0289, + "step": 3273 + }, + { + "epoch": 0.72, + "grad_norm": 0.33649667510576525, + "learning_rate": 7.723429565375006e-06, + "loss": 0.0413, + "step": 3274 + }, + { + "epoch": 0.72, + "grad_norm": 0.3260934244253876, + "learning_rate": 7.71220034967189e-06, + "loss": 0.0342, + "step": 3275 + }, + { + "epoch": 0.72, + "grad_norm": 0.34872161845458016, + "learning_rate": 7.700977352899506e-06, + "loss": 0.0428, + "step": 3276 + }, + { + "epoch": 0.72, + "grad_norm": 0.29277538276010856, + "learning_rate": 7.68976058073787e-06, + "loss": 0.0297, + "step": 3277 + }, + { + "epoch": 0.72, + "grad_norm": 0.27543608648700496, + "learning_rate": 7.678550038863877e-06, + "loss": 0.0332, + "step": 3278 + }, + { + "epoch": 0.72, + "grad_norm": 0.2683624967035358, + "learning_rate": 7.667345732951233e-06, + "loss": 0.0341, + "step": 3279 + }, + { + "epoch": 0.72, + "grad_norm": 0.29395466532988246, + "learning_rate": 7.656147668670519e-06, + "loss": 0.0345, + "step": 3280 + }, + { + "epoch": 0.72, + "grad_norm": 0.2438548131709034, + "learning_rate": 7.644955851689129e-06, + "loss": 0.0278, + "step": 3281 + }, + { + "epoch": 0.72, + "grad_norm": 0.23469584280035205, + "learning_rate": 7.63377028767133e-06, + "loss": 0.0252, + "step": 3282 + }, + { + "epoch": 0.72, + "grad_norm": 0.26294336342626873, + "learning_rate": 7.622590982278189e-06, + "loss": 0.0297, + "step": 3283 + }, + { + "epoch": 0.72, + "grad_norm": 0.2637056117197377, + "learning_rate": 7.611417941167634e-06, + "loss": 0.0414, + "step": 3284 + }, + { + "epoch": 0.72, + "grad_norm": 0.24425409350483882, + "learning_rate": 7.600251169994392e-06, + "loss": 0.0275, + "step": 3285 + }, + { + "epoch": 0.72, + "grad_norm": 0.3440145969738492, + "learning_rate": 7.589090674410056e-06, + "loss": 0.0407, + "step": 3286 + }, + { + "epoch": 0.72, + "grad_norm": 0.32265773384916846, + "learning_rate": 7.577936460063e-06, + "loss": 0.0457, + "step": 3287 + }, + { + "epoch": 0.72, + "grad_norm": 0.21486533663414784, + "learning_rate": 7.566788532598457e-06, + "loss": 0.0242, + "step": 3288 + }, + { + "epoch": 0.72, + "grad_norm": 0.3426672398743925, + "learning_rate": 7.555646897658448e-06, + "loss": 0.0418, + "step": 3289 + }, + { + "epoch": 0.72, + "grad_norm": 0.2978583663413471, + "learning_rate": 7.544511560881829e-06, + "loss": 0.0507, + "step": 3290 + }, + { + "epoch": 0.72, + "grad_norm": 0.29657699033028645, + "learning_rate": 7.533382527904263e-06, + "loss": 0.0372, + "step": 3291 + }, + { + "epoch": 0.72, + "grad_norm": 0.30881177467626963, + "learning_rate": 7.5222598043582274e-06, + "loss": 0.0374, + "step": 3292 + }, + { + "epoch": 0.72, + "grad_norm": 0.26934463841005973, + "learning_rate": 7.511143395872986e-06, + "loss": 0.0333, + "step": 3293 + }, + { + "epoch": 0.72, + "grad_norm": 0.28768593124767466, + "learning_rate": 7.500033308074639e-06, + "loss": 0.0252, + "step": 3294 + }, + { + "epoch": 0.72, + "grad_norm": 0.3364082852313981, + "learning_rate": 7.488929546586053e-06, + "loss": 0.037, + "step": 3295 + }, + { + "epoch": 0.72, + "grad_norm": 0.22541449997196128, + "learning_rate": 7.477832117026924e-06, + "loss": 0.0245, + "step": 3296 + }, + { + "epoch": 0.72, + "grad_norm": 0.24872789984440977, + "learning_rate": 7.466741025013715e-06, + "loss": 0.0285, + "step": 3297 + }, + { + "epoch": 0.72, + "grad_norm": 0.22554073458836682, + "learning_rate": 7.455656276159713e-06, + "loss": 0.025, + "step": 3298 + }, + { + "epoch": 0.72, + "grad_norm": 0.29784291989654366, + "learning_rate": 7.444577876074956e-06, + "loss": 0.0274, + "step": 3299 + }, + { + "epoch": 0.72, + "grad_norm": 0.2899906362057722, + "learning_rate": 7.4335058303663056e-06, + "loss": 0.0322, + "step": 3300 + }, + { + "epoch": 0.73, + "grad_norm": 0.29269440839549676, + "learning_rate": 7.422440144637395e-06, + "loss": 0.0427, + "step": 3301 + }, + { + "epoch": 0.73, + "grad_norm": 0.3068639585835522, + "learning_rate": 7.411380824488621e-06, + "loss": 0.0359, + "step": 3302 + }, + { + "epoch": 0.73, + "grad_norm": 0.25979565324964654, + "learning_rate": 7.400327875517188e-06, + "loss": 0.03, + "step": 3303 + }, + { + "epoch": 0.73, + "grad_norm": 0.31275196646339726, + "learning_rate": 7.389281303317046e-06, + "loss": 0.0399, + "step": 3304 + }, + { + "epoch": 0.73, + "grad_norm": 0.247167598282481, + "learning_rate": 7.37824111347895e-06, + "loss": 0.0331, + "step": 3305 + }, + { + "epoch": 0.73, + "grad_norm": 0.25559863425551843, + "learning_rate": 7.367207311590392e-06, + "loss": 0.0323, + "step": 3306 + }, + { + "epoch": 0.73, + "grad_norm": 0.2560772877837243, + "learning_rate": 7.356179903235654e-06, + "loss": 0.0357, + "step": 3307 + }, + { + "epoch": 0.73, + "grad_norm": 0.25963678248029315, + "learning_rate": 7.345158893995774e-06, + "loss": 0.0271, + "step": 3308 + }, + { + "epoch": 0.73, + "grad_norm": 0.30030508869803, + "learning_rate": 7.33414428944856e-06, + "loss": 0.0376, + "step": 3309 + }, + { + "epoch": 0.73, + "grad_norm": 0.3277322712263529, + "learning_rate": 7.3231360951685574e-06, + "loss": 0.0448, + "step": 3310 + }, + { + "epoch": 0.73, + "grad_norm": 0.23406026688899606, + "learning_rate": 7.312134316727093e-06, + "loss": 0.0227, + "step": 3311 + }, + { + "epoch": 0.73, + "grad_norm": 0.27825149220668854, + "learning_rate": 7.301138959692225e-06, + "loss": 0.0339, + "step": 3312 + }, + { + "epoch": 0.73, + "grad_norm": 0.2623883975162797, + "learning_rate": 7.290150029628777e-06, + "loss": 0.0306, + "step": 3313 + }, + { + "epoch": 0.73, + "grad_norm": 0.31569686298159066, + "learning_rate": 7.2791675320983076e-06, + "loss": 0.0406, + "step": 3314 + }, + { + "epoch": 0.73, + "grad_norm": 0.2641515959588397, + "learning_rate": 7.268191472659136e-06, + "loss": 0.0256, + "step": 3315 + }, + { + "epoch": 0.73, + "grad_norm": 0.26511555792736424, + "learning_rate": 7.257221856866295e-06, + "loss": 0.0291, + "step": 3316 + }, + { + "epoch": 0.73, + "grad_norm": 0.25792506679317007, + "learning_rate": 7.246258690271599e-06, + "loss": 0.0268, + "step": 3317 + }, + { + "epoch": 0.73, + "grad_norm": 0.5339755656834957, + "learning_rate": 7.235301978423555e-06, + "loss": 0.0309, + "step": 3318 + }, + { + "epoch": 0.73, + "grad_norm": 0.2740130090122586, + "learning_rate": 7.224351726867433e-06, + "loss": 0.032, + "step": 3319 + }, + { + "epoch": 0.73, + "grad_norm": 0.22585862262855183, + "learning_rate": 7.213407941145214e-06, + "loss": 0.0195, + "step": 3320 + }, + { + "epoch": 0.73, + "grad_norm": 0.2225228462116293, + "learning_rate": 7.202470626795626e-06, + "loss": 0.0243, + "step": 3321 + }, + { + "epoch": 0.73, + "grad_norm": 0.2747098746967936, + "learning_rate": 7.191539789354096e-06, + "loss": 0.0319, + "step": 3322 + }, + { + "epoch": 0.73, + "grad_norm": 0.24156826594726816, + "learning_rate": 7.180615434352802e-06, + "loss": 0.0217, + "step": 3323 + }, + { + "epoch": 0.73, + "grad_norm": 0.2107368857465454, + "learning_rate": 7.1696975673206125e-06, + "loss": 0.0284, + "step": 3324 + }, + { + "epoch": 0.73, + "grad_norm": 0.2335866384795216, + "learning_rate": 7.158786193783138e-06, + "loss": 0.0234, + "step": 3325 + }, + { + "epoch": 0.73, + "grad_norm": 0.2748073054489228, + "learning_rate": 7.147881319262695e-06, + "loss": 0.0379, + "step": 3326 + }, + { + "epoch": 0.73, + "grad_norm": 0.2523174461142641, + "learning_rate": 7.136982949278293e-06, + "loss": 0.0279, + "step": 3327 + }, + { + "epoch": 0.73, + "grad_norm": 0.25476723458425904, + "learning_rate": 7.126091089345679e-06, + "loss": 0.0313, + "step": 3328 + }, + { + "epoch": 0.73, + "grad_norm": 0.26992223549141114, + "learning_rate": 7.115205744977276e-06, + "loss": 0.0309, + "step": 3329 + }, + { + "epoch": 0.73, + "grad_norm": 0.28147717414007484, + "learning_rate": 7.104326921682236e-06, + "loss": 0.0256, + "step": 3330 + }, + { + "epoch": 0.73, + "grad_norm": 0.2750101911217013, + "learning_rate": 7.093454624966387e-06, + "loss": 0.0323, + "step": 3331 + }, + { + "epoch": 0.73, + "grad_norm": 0.2562045469910545, + "learning_rate": 7.082588860332271e-06, + "loss": 0.0309, + "step": 3332 + }, + { + "epoch": 0.73, + "grad_norm": 0.27352463832409785, + "learning_rate": 7.071729633279118e-06, + "loss": 0.0288, + "step": 3333 + }, + { + "epoch": 0.73, + "grad_norm": 0.2512556075898497, + "learning_rate": 7.060876949302855e-06, + "loss": 0.0236, + "step": 3334 + }, + { + "epoch": 0.73, + "grad_norm": 0.2677385726256808, + "learning_rate": 7.050030813896078e-06, + "loss": 0.0278, + "step": 3335 + }, + { + "epoch": 0.73, + "grad_norm": 0.28504892184507447, + "learning_rate": 7.0391912325481e-06, + "loss": 0.0325, + "step": 3336 + }, + { + "epoch": 0.73, + "grad_norm": 0.36006169432920915, + "learning_rate": 7.028358210744881e-06, + "loss": 0.0477, + "step": 3337 + }, + { + "epoch": 0.73, + "grad_norm": 0.21469547183354776, + "learning_rate": 7.017531753969098e-06, + "loss": 0.024, + "step": 3338 + }, + { + "epoch": 0.73, + "grad_norm": 0.259636954657241, + "learning_rate": 7.006711867700069e-06, + "loss": 0.0307, + "step": 3339 + }, + { + "epoch": 0.73, + "grad_norm": 0.24688085446478022, + "learning_rate": 6.995898557413823e-06, + "loss": 0.0296, + "step": 3340 + }, + { + "epoch": 0.73, + "grad_norm": 0.2831645150009331, + "learning_rate": 6.985091828583024e-06, + "loss": 0.0363, + "step": 3341 + }, + { + "epoch": 0.73, + "grad_norm": 0.278823985728811, + "learning_rate": 6.974291686677035e-06, + "loss": 0.0413, + "step": 3342 + }, + { + "epoch": 0.73, + "grad_norm": 0.2746523572511557, + "learning_rate": 6.96349813716187e-06, + "loss": 0.0298, + "step": 3343 + }, + { + "epoch": 0.73, + "grad_norm": 0.2719467808247288, + "learning_rate": 6.952711185500223e-06, + "loss": 0.0312, + "step": 3344 + }, + { + "epoch": 0.73, + "grad_norm": 0.23624465193321353, + "learning_rate": 6.941930837151416e-06, + "loss": 0.0232, + "step": 3345 + }, + { + "epoch": 0.73, + "grad_norm": 0.21903564332030248, + "learning_rate": 6.931157097571468e-06, + "loss": 0.0294, + "step": 3346 + }, + { + "epoch": 0.74, + "grad_norm": 0.2691619388462333, + "learning_rate": 6.920389972213017e-06, + "loss": 0.0357, + "step": 3347 + }, + { + "epoch": 0.74, + "grad_norm": 0.28274667722063346, + "learning_rate": 6.909629466525389e-06, + "loss": 0.0351, + "step": 3348 + }, + { + "epoch": 0.74, + "grad_norm": 0.30060128612135306, + "learning_rate": 6.898875585954527e-06, + "loss": 0.0315, + "step": 3349 + }, + { + "epoch": 0.74, + "grad_norm": 0.3506542341179284, + "learning_rate": 6.888128335943041e-06, + "loss": 0.046, + "step": 3350 + }, + { + "epoch": 0.74, + "grad_norm": 0.3748067914417686, + "learning_rate": 6.877387721930182e-06, + "loss": 0.0425, + "step": 3351 + }, + { + "epoch": 0.74, + "grad_norm": 0.3375190279153201, + "learning_rate": 6.866653749351846e-06, + "loss": 0.0399, + "step": 3352 + }, + { + "epoch": 0.74, + "grad_norm": 0.29486608925001806, + "learning_rate": 6.855926423640549e-06, + "loss": 0.0347, + "step": 3353 + }, + { + "epoch": 0.74, + "grad_norm": 0.280396105674734, + "learning_rate": 6.84520575022547e-06, + "loss": 0.0352, + "step": 3354 + }, + { + "epoch": 0.74, + "grad_norm": 0.28398588460864693, + "learning_rate": 6.8344917345323935e-06, + "loss": 0.0439, + "step": 3355 + }, + { + "epoch": 0.74, + "grad_norm": 0.31521055564088574, + "learning_rate": 6.823784381983764e-06, + "loss": 0.0381, + "step": 3356 + }, + { + "epoch": 0.74, + "grad_norm": 0.2925692186413323, + "learning_rate": 6.8130836979986236e-06, + "loss": 0.0437, + "step": 3357 + }, + { + "epoch": 0.74, + "grad_norm": 0.2670446214626016, + "learning_rate": 6.802389687992666e-06, + "loss": 0.0302, + "step": 3358 + }, + { + "epoch": 0.74, + "grad_norm": 0.3065052462551992, + "learning_rate": 6.791702357378185e-06, + "loss": 0.0364, + "step": 3359 + }, + { + "epoch": 0.74, + "grad_norm": 0.30816710228955385, + "learning_rate": 6.781021711564107e-06, + "loss": 0.0401, + "step": 3360 + }, + { + "epoch": 0.74, + "grad_norm": 0.26630850813961854, + "learning_rate": 6.770347755955982e-06, + "loss": 0.0307, + "step": 3361 + }, + { + "epoch": 0.74, + "grad_norm": 0.26040227214448936, + "learning_rate": 6.7596804959559494e-06, + "loss": 0.0314, + "step": 3362 + }, + { + "epoch": 0.74, + "grad_norm": 0.23877831503346528, + "learning_rate": 6.749019936962791e-06, + "loss": 0.0354, + "step": 3363 + }, + { + "epoch": 0.74, + "grad_norm": 0.2416854392082955, + "learning_rate": 6.7383660843718635e-06, + "loss": 0.0322, + "step": 3364 + }, + { + "epoch": 0.74, + "grad_norm": 0.3053072470737272, + "learning_rate": 6.727718943575161e-06, + "loss": 0.0345, + "step": 3365 + }, + { + "epoch": 0.74, + "grad_norm": 0.2022255644424565, + "learning_rate": 6.717078519961257e-06, + "loss": 0.0174, + "step": 3366 + }, + { + "epoch": 0.74, + "grad_norm": 0.2389068286781463, + "learning_rate": 6.706444818915345e-06, + "loss": 0.024, + "step": 3367 + }, + { + "epoch": 0.74, + "grad_norm": 0.30114718879034874, + "learning_rate": 6.695817845819188e-06, + "loss": 0.0386, + "step": 3368 + }, + { + "epoch": 0.74, + "grad_norm": 0.2640765581395411, + "learning_rate": 6.68519760605119e-06, + "loss": 0.0272, + "step": 3369 + }, + { + "epoch": 0.74, + "grad_norm": 0.254462659503167, + "learning_rate": 6.674584104986295e-06, + "loss": 0.0303, + "step": 3370 + }, + { + "epoch": 0.74, + "grad_norm": 0.353073391476766, + "learning_rate": 6.66397734799608e-06, + "loss": 0.0402, + "step": 3371 + }, + { + "epoch": 0.74, + "grad_norm": 0.2137880832225216, + "learning_rate": 6.653377340448673e-06, + "loss": 0.0215, + "step": 3372 + }, + { + "epoch": 0.74, + "grad_norm": 0.2648998909920871, + "learning_rate": 6.642784087708814e-06, + "loss": 0.0256, + "step": 3373 + }, + { + "epoch": 0.74, + "grad_norm": 0.2506035433595592, + "learning_rate": 6.6321975951378034e-06, + "loss": 0.0279, + "step": 3374 + }, + { + "epoch": 0.74, + "grad_norm": 0.22480728717961235, + "learning_rate": 6.621617868093544e-06, + "loss": 0.0218, + "step": 3375 + }, + { + "epoch": 0.74, + "grad_norm": 0.2735224591828485, + "learning_rate": 6.611044911930477e-06, + "loss": 0.0343, + "step": 3376 + }, + { + "epoch": 0.74, + "grad_norm": 0.23876396906016184, + "learning_rate": 6.6004787319996714e-06, + "loss": 0.0291, + "step": 3377 + }, + { + "epoch": 0.74, + "grad_norm": 0.3272250071635811, + "learning_rate": 6.589919333648711e-06, + "loss": 0.0406, + "step": 3378 + }, + { + "epoch": 0.74, + "grad_norm": 0.2897342145880577, + "learning_rate": 6.579366722221789e-06, + "loss": 0.0368, + "step": 3379 + }, + { + "epoch": 0.74, + "grad_norm": 0.3224209613385021, + "learning_rate": 6.568820903059632e-06, + "loss": 0.0395, + "step": 3380 + }, + { + "epoch": 0.74, + "grad_norm": 0.26667334739442633, + "learning_rate": 6.558281881499556e-06, + "loss": 0.0408, + "step": 3381 + }, + { + "epoch": 0.74, + "grad_norm": 0.2810547733903214, + "learning_rate": 6.547749662875411e-06, + "loss": 0.0356, + "step": 3382 + }, + { + "epoch": 0.74, + "grad_norm": 0.27188158136930185, + "learning_rate": 6.537224252517633e-06, + "loss": 0.0232, + "step": 3383 + }, + { + "epoch": 0.74, + "grad_norm": 0.24156934543400135, + "learning_rate": 6.526705655753183e-06, + "loss": 0.034, + "step": 3384 + }, + { + "epoch": 0.74, + "grad_norm": 0.23455296753242358, + "learning_rate": 6.516193877905592e-06, + "loss": 0.029, + "step": 3385 + }, + { + "epoch": 0.74, + "grad_norm": 0.3370798117890245, + "learning_rate": 6.505688924294944e-06, + "loss": 0.0356, + "step": 3386 + }, + { + "epoch": 0.74, + "grad_norm": 0.2723097159370534, + "learning_rate": 6.495190800237845e-06, + "loss": 0.0322, + "step": 3387 + }, + { + "epoch": 0.74, + "grad_norm": 0.23418612817301912, + "learning_rate": 6.484699511047474e-06, + "loss": 0.0225, + "step": 3388 + }, + { + "epoch": 0.74, + "grad_norm": 0.2419803256251913, + "learning_rate": 6.474215062033527e-06, + "loss": 0.0231, + "step": 3389 + }, + { + "epoch": 0.74, + "grad_norm": 0.30069783527957566, + "learning_rate": 6.463737458502255e-06, + "loss": 0.0358, + "step": 3390 + }, + { + "epoch": 0.74, + "grad_norm": 0.26570232051378745, + "learning_rate": 6.453266705756427e-06, + "loss": 0.03, + "step": 3391 + }, + { + "epoch": 0.75, + "grad_norm": 0.2755429272846124, + "learning_rate": 6.442802809095363e-06, + "loss": 0.0344, + "step": 3392 + }, + { + "epoch": 0.75, + "grad_norm": 0.2552849093532269, + "learning_rate": 6.4323457738149034e-06, + "loss": 0.0264, + "step": 3393 + }, + { + "epoch": 0.75, + "grad_norm": 0.3070690782068707, + "learning_rate": 6.421895605207427e-06, + "loss": 0.0228, + "step": 3394 + }, + { + "epoch": 0.75, + "grad_norm": 0.21608959849101006, + "learning_rate": 6.41145230856181e-06, + "loss": 0.0219, + "step": 3395 + }, + { + "epoch": 0.75, + "grad_norm": 0.3256684718703453, + "learning_rate": 6.401015889163489e-06, + "loss": 0.0457, + "step": 3396 + }, + { + "epoch": 0.75, + "grad_norm": 0.27969183039376333, + "learning_rate": 6.3905863522943786e-06, + "loss": 0.0294, + "step": 3397 + }, + { + "epoch": 0.75, + "grad_norm": 0.32161444562324737, + "learning_rate": 6.380163703232953e-06, + "loss": 0.0523, + "step": 3398 + }, + { + "epoch": 0.75, + "grad_norm": 0.30481554666388455, + "learning_rate": 6.369747947254159e-06, + "loss": 0.038, + "step": 3399 + }, + { + "epoch": 0.75, + "grad_norm": 0.2658260510208518, + "learning_rate": 6.35933908962949e-06, + "loss": 0.0303, + "step": 3400 + }, + { + "epoch": 0.75, + "grad_norm": 0.2595659506770729, + "learning_rate": 6.348937135626922e-06, + "loss": 0.0311, + "step": 3401 + }, + { + "epoch": 0.75, + "grad_norm": 0.20705685367249238, + "learning_rate": 6.338542090510951e-06, + "loss": 0.022, + "step": 3402 + }, + { + "epoch": 0.75, + "grad_norm": 0.28922769620260047, + "learning_rate": 6.328153959542573e-06, + "loss": 0.0294, + "step": 3403 + }, + { + "epoch": 0.75, + "grad_norm": 0.32985597391757204, + "learning_rate": 6.3177727479792914e-06, + "loss": 0.0394, + "step": 3404 + }, + { + "epoch": 0.75, + "grad_norm": 0.23242230775937095, + "learning_rate": 6.307398461075091e-06, + "loss": 0.0285, + "step": 3405 + }, + { + "epoch": 0.75, + "grad_norm": 0.27369074674458943, + "learning_rate": 6.297031104080471e-06, + "loss": 0.0238, + "step": 3406 + }, + { + "epoch": 0.75, + "grad_norm": 0.262031994012767, + "learning_rate": 6.286670682242404e-06, + "loss": 0.0308, + "step": 3407 + }, + { + "epoch": 0.75, + "grad_norm": 0.26526526326030525, + "learning_rate": 6.276317200804376e-06, + "loss": 0.0274, + "step": 3408 + }, + { + "epoch": 0.75, + "grad_norm": 0.3540060319507838, + "learning_rate": 6.265970665006334e-06, + "loss": 0.0469, + "step": 3409 + }, + { + "epoch": 0.75, + "grad_norm": 0.2258842114700864, + "learning_rate": 6.255631080084735e-06, + "loss": 0.027, + "step": 3410 + }, + { + "epoch": 0.75, + "grad_norm": 0.31748298824722165, + "learning_rate": 6.245298451272486e-06, + "loss": 0.0358, + "step": 3411 + }, + { + "epoch": 0.75, + "grad_norm": 0.2888240733028864, + "learning_rate": 6.234972783799023e-06, + "loss": 0.0362, + "step": 3412 + }, + { + "epoch": 0.75, + "grad_norm": 0.2723556916884701, + "learning_rate": 6.224654082890207e-06, + "loss": 0.03, + "step": 3413 + }, + { + "epoch": 0.75, + "grad_norm": 0.2978367536021447, + "learning_rate": 6.214342353768412e-06, + "loss": 0.0324, + "step": 3414 + }, + { + "epoch": 0.75, + "grad_norm": 0.29954612094059346, + "learning_rate": 6.2040376016524506e-06, + "loss": 0.0492, + "step": 3415 + }, + { + "epoch": 0.75, + "grad_norm": 0.30504933904023807, + "learning_rate": 6.193739831757637e-06, + "loss": 0.0362, + "step": 3416 + }, + { + "epoch": 0.75, + "grad_norm": 0.3184118403165737, + "learning_rate": 6.183449049295722e-06, + "loss": 0.0334, + "step": 3417 + }, + { + "epoch": 0.75, + "grad_norm": 0.2485101608567878, + "learning_rate": 6.1731652594749465e-06, + "loss": 0.0288, + "step": 3418 + }, + { + "epoch": 0.75, + "grad_norm": 0.23786661938343767, + "learning_rate": 6.162888467499988e-06, + "loss": 0.0282, + "step": 3419 + }, + { + "epoch": 0.75, + "grad_norm": 0.29388923997343264, + "learning_rate": 6.152618678571996e-06, + "loss": 0.0257, + "step": 3420 + }, + { + "epoch": 0.75, + "grad_norm": 0.414291264659481, + "learning_rate": 6.1423558978885836e-06, + "loss": 0.0706, + "step": 3421 + }, + { + "epoch": 0.75, + "grad_norm": 0.24517493308692564, + "learning_rate": 6.1321001306437946e-06, + "loss": 0.0268, + "step": 3422 + }, + { + "epoch": 0.75, + "grad_norm": 0.27522385058717846, + "learning_rate": 6.121851382028146e-06, + "loss": 0.0316, + "step": 3423 + }, + { + "epoch": 0.75, + "grad_norm": 0.2731825730117969, + "learning_rate": 6.111609657228581e-06, + "loss": 0.04, + "step": 3424 + }, + { + "epoch": 0.75, + "grad_norm": 0.3421778054801919, + "learning_rate": 6.101374961428512e-06, + "loss": 0.0295, + "step": 3425 + }, + { + "epoch": 0.75, + "grad_norm": 0.2716664314464027, + "learning_rate": 6.091147299807769e-06, + "loss": 0.0325, + "step": 3426 + }, + { + "epoch": 0.75, + "grad_norm": 0.37625543729839095, + "learning_rate": 6.080926677542646e-06, + "loss": 0.0461, + "step": 3427 + }, + { + "epoch": 0.75, + "grad_norm": 0.2710786161128589, + "learning_rate": 6.070713099805845e-06, + "loss": 0.0327, + "step": 3428 + }, + { + "epoch": 0.75, + "grad_norm": 0.2750153250108817, + "learning_rate": 6.0605065717665445e-06, + "loss": 0.0272, + "step": 3429 + }, + { + "epoch": 0.75, + "grad_norm": 0.27125907715758363, + "learning_rate": 6.050307098590311e-06, + "loss": 0.0327, + "step": 3430 + }, + { + "epoch": 0.75, + "grad_norm": 0.24568234978981157, + "learning_rate": 6.040114685439175e-06, + "loss": 0.0278, + "step": 3431 + }, + { + "epoch": 0.75, + "grad_norm": 0.26473579513348117, + "learning_rate": 6.029929337471565e-06, + "loss": 0.0251, + "step": 3432 + }, + { + "epoch": 0.75, + "grad_norm": 0.2821258180316564, + "learning_rate": 6.019751059842362e-06, + "loss": 0.0373, + "step": 3433 + }, + { + "epoch": 0.75, + "grad_norm": 0.2629471744972731, + "learning_rate": 6.009579857702843e-06, + "loss": 0.0273, + "step": 3434 + }, + { + "epoch": 0.75, + "grad_norm": 0.2585106845912787, + "learning_rate": 5.999415736200724e-06, + "loss": 0.0311, + "step": 3435 + }, + { + "epoch": 0.75, + "grad_norm": 0.3633553009034456, + "learning_rate": 5.98925870048012e-06, + "loss": 0.0366, + "step": 3436 + }, + { + "epoch": 0.75, + "grad_norm": 0.35135252298030384, + "learning_rate": 5.979108755681575e-06, + "loss": 0.0429, + "step": 3437 + }, + { + "epoch": 0.76, + "grad_norm": 0.2768956715291986, + "learning_rate": 5.968965906942039e-06, + "loss": 0.0386, + "step": 3438 + }, + { + "epoch": 0.76, + "grad_norm": 0.2373179757750094, + "learning_rate": 5.958830159394875e-06, + "loss": 0.0292, + "step": 3439 + }, + { + "epoch": 0.76, + "grad_norm": 0.27913695872408384, + "learning_rate": 5.948701518169835e-06, + "loss": 0.0353, + "step": 3440 + }, + { + "epoch": 0.76, + "grad_norm": 0.26605618765585465, + "learning_rate": 5.938579988393099e-06, + "loss": 0.0366, + "step": 3441 + }, + { + "epoch": 0.76, + "grad_norm": 0.23421451319195605, + "learning_rate": 5.928465575187221e-06, + "loss": 0.0304, + "step": 3442 + }, + { + "epoch": 0.76, + "grad_norm": 0.2615844894609264, + "learning_rate": 5.918358283671182e-06, + "loss": 0.0386, + "step": 3443 + }, + { + "epoch": 0.76, + "grad_norm": 0.27336979436518566, + "learning_rate": 5.90825811896033e-06, + "loss": 0.0379, + "step": 3444 + }, + { + "epoch": 0.76, + "grad_norm": 0.365486384468251, + "learning_rate": 5.89816508616643e-06, + "loss": 0.0427, + "step": 3445 + }, + { + "epoch": 0.76, + "grad_norm": 0.29642779145819365, + "learning_rate": 5.888079190397628e-06, + "loss": 0.0477, + "step": 3446 + }, + { + "epoch": 0.76, + "grad_norm": 0.22215723703476092, + "learning_rate": 5.878000436758453e-06, + "loss": 0.0203, + "step": 3447 + }, + { + "epoch": 0.76, + "grad_norm": 0.21405507026085965, + "learning_rate": 5.86792883034983e-06, + "loss": 0.0232, + "step": 3448 + }, + { + "epoch": 0.76, + "grad_norm": 0.2628546164396591, + "learning_rate": 5.857864376269051e-06, + "loss": 0.0271, + "step": 3449 + }, + { + "epoch": 0.76, + "grad_norm": 0.2734873453757123, + "learning_rate": 5.847807079609804e-06, + "loss": 0.0305, + "step": 3450 + }, + { + "epoch": 0.76, + "grad_norm": 0.22837991841975297, + "learning_rate": 5.837756945462154e-06, + "loss": 0.0294, + "step": 3451 + }, + { + "epoch": 0.76, + "grad_norm": 0.2840929849505443, + "learning_rate": 5.827713978912524e-06, + "loss": 0.0252, + "step": 3452 + }, + { + "epoch": 0.76, + "grad_norm": 0.21994348626473945, + "learning_rate": 5.817678185043733e-06, + "loss": 0.0356, + "step": 3453 + }, + { + "epoch": 0.76, + "grad_norm": 0.2650177690558583, + "learning_rate": 5.807649568934945e-06, + "loss": 0.0319, + "step": 3454 + }, + { + "epoch": 0.76, + "grad_norm": 0.2915132735135509, + "learning_rate": 5.79762813566171e-06, + "loss": 0.0342, + "step": 3455 + }, + { + "epoch": 0.76, + "grad_norm": 0.2420591891239867, + "learning_rate": 5.7876138902959445e-06, + "loss": 0.0258, + "step": 3456 + }, + { + "epoch": 0.76, + "grad_norm": 0.272814886513111, + "learning_rate": 5.777606837905905e-06, + "loss": 0.0363, + "step": 3457 + }, + { + "epoch": 0.76, + "grad_norm": 0.22757156696554626, + "learning_rate": 5.767606983556237e-06, + "loss": 0.0327, + "step": 3458 + }, + { + "epoch": 0.76, + "grad_norm": 0.25811792835034714, + "learning_rate": 5.757614332307912e-06, + "loss": 0.025, + "step": 3459 + }, + { + "epoch": 0.76, + "grad_norm": 0.2748675417510592, + "learning_rate": 5.7476288892182905e-06, + "loss": 0.0406, + "step": 3460 + }, + { + "epoch": 0.76, + "grad_norm": 0.29713312020558075, + "learning_rate": 5.737650659341048e-06, + "loss": 0.0299, + "step": 3461 + }, + { + "epoch": 0.76, + "grad_norm": 0.27745443472639314, + "learning_rate": 5.7276796477262365e-06, + "loss": 0.0344, + "step": 3462 + }, + { + "epoch": 0.76, + "grad_norm": 0.17985481267953204, + "learning_rate": 5.717715859420246e-06, + "loss": 0.018, + "step": 3463 + }, + { + "epoch": 0.76, + "grad_norm": 0.19989098468629776, + "learning_rate": 5.707759299465816e-06, + "loss": 0.0179, + "step": 3464 + }, + { + "epoch": 0.76, + "grad_norm": 0.2669343513181597, + "learning_rate": 5.6978099729020105e-06, + "loss": 0.0304, + "step": 3465 + }, + { + "epoch": 0.76, + "grad_norm": 0.21945043379538026, + "learning_rate": 5.68786788476426e-06, + "loss": 0.0249, + "step": 3466 + }, + { + "epoch": 0.76, + "grad_norm": 0.27184858522676725, + "learning_rate": 5.6779330400843e-06, + "loss": 0.0297, + "step": 3467 + }, + { + "epoch": 0.76, + "grad_norm": 0.26262306439441285, + "learning_rate": 5.66800544389023e-06, + "loss": 0.023, + "step": 3468 + }, + { + "epoch": 0.76, + "grad_norm": 0.32370174504924254, + "learning_rate": 5.658085101206456e-06, + "loss": 0.0523, + "step": 3469 + }, + { + "epoch": 0.76, + "grad_norm": 0.27778446749075747, + "learning_rate": 5.648172017053737e-06, + "loss": 0.0287, + "step": 3470 + }, + { + "epoch": 0.76, + "grad_norm": 0.4630715543716684, + "learning_rate": 5.638266196449123e-06, + "loss": 0.0579, + "step": 3471 + }, + { + "epoch": 0.76, + "grad_norm": 0.2009417597347354, + "learning_rate": 5.628367644406039e-06, + "loss": 0.0205, + "step": 3472 + }, + { + "epoch": 0.76, + "grad_norm": 0.3200980375658781, + "learning_rate": 5.618476365934184e-06, + "loss": 0.0349, + "step": 3473 + }, + { + "epoch": 0.76, + "grad_norm": 0.28005963986738336, + "learning_rate": 5.608592366039607e-06, + "loss": 0.0403, + "step": 3474 + }, + { + "epoch": 0.76, + "grad_norm": 0.25529178241170714, + "learning_rate": 5.598715649724647e-06, + "loss": 0.0283, + "step": 3475 + }, + { + "epoch": 0.76, + "grad_norm": 0.2284862303779472, + "learning_rate": 5.588846221987982e-06, + "loss": 0.0334, + "step": 3476 + }, + { + "epoch": 0.76, + "grad_norm": 0.23788867724142732, + "learning_rate": 5.578984087824581e-06, + "loss": 0.0219, + "step": 3477 + }, + { + "epoch": 0.76, + "grad_norm": 0.2614670582312282, + "learning_rate": 5.569129252225745e-06, + "loss": 0.0258, + "step": 3478 + }, + { + "epoch": 0.76, + "grad_norm": 0.24618739662400665, + "learning_rate": 5.559281720179046e-06, + "loss": 0.0221, + "step": 3479 + }, + { + "epoch": 0.76, + "grad_norm": 0.2834080185115215, + "learning_rate": 5.549441496668393e-06, + "loss": 0.0385, + "step": 3480 + }, + { + "epoch": 0.76, + "grad_norm": 0.20664769757287502, + "learning_rate": 5.539608586673988e-06, + "loss": 0.0302, + "step": 3481 + }, + { + "epoch": 0.76, + "grad_norm": 0.25517500283450656, + "learning_rate": 5.529782995172315e-06, + "loss": 0.0257, + "step": 3482 + }, + { + "epoch": 0.76, + "grad_norm": 0.2605643648285357, + "learning_rate": 5.519964727136178e-06, + "loss": 0.0308, + "step": 3483 + }, + { + "epoch": 0.77, + "grad_norm": 0.2539689646809514, + "learning_rate": 5.510153787534651e-06, + "loss": 0.0201, + "step": 3484 + }, + { + "epoch": 0.77, + "grad_norm": 0.2587773590320521, + "learning_rate": 5.500350181333121e-06, + "loss": 0.034, + "step": 3485 + }, + { + "epoch": 0.77, + "grad_norm": 0.24054653541860618, + "learning_rate": 5.490553913493242e-06, + "loss": 0.0216, + "step": 3486 + }, + { + "epoch": 0.77, + "grad_norm": 0.33018462590373493, + "learning_rate": 5.48076498897298e-06, + "loss": 0.0315, + "step": 3487 + }, + { + "epoch": 0.77, + "grad_norm": 0.27005919291089847, + "learning_rate": 5.470983412726547e-06, + "loss": 0.0232, + "step": 3488 + }, + { + "epoch": 0.77, + "grad_norm": 0.20541222015878996, + "learning_rate": 5.461209189704486e-06, + "loss": 0.0172, + "step": 3489 + }, + { + "epoch": 0.77, + "grad_norm": 0.23395304844311773, + "learning_rate": 5.451442324853571e-06, + "loss": 0.0257, + "step": 3490 + }, + { + "epoch": 0.77, + "grad_norm": 0.24403252306408385, + "learning_rate": 5.441682823116887e-06, + "loss": 0.0237, + "step": 3491 + }, + { + "epoch": 0.77, + "grad_norm": 0.2592484625124243, + "learning_rate": 5.431930689433762e-06, + "loss": 0.0246, + "step": 3492 + }, + { + "epoch": 0.77, + "grad_norm": 0.26405085419475327, + "learning_rate": 5.422185928739827e-06, + "loss": 0.0247, + "step": 3493 + }, + { + "epoch": 0.77, + "grad_norm": 0.23156080301978307, + "learning_rate": 5.4124485459669485e-06, + "loss": 0.0264, + "step": 3494 + }, + { + "epoch": 0.77, + "grad_norm": 0.2927319905143484, + "learning_rate": 5.402718546043293e-06, + "loss": 0.0275, + "step": 3495 + }, + { + "epoch": 0.77, + "grad_norm": 0.3207400538826871, + "learning_rate": 5.392995933893255e-06, + "loss": 0.0259, + "step": 3496 + }, + { + "epoch": 0.77, + "grad_norm": 0.28107603308849083, + "learning_rate": 5.383280714437518e-06, + "loss": 0.0252, + "step": 3497 + }, + { + "epoch": 0.77, + "grad_norm": 0.28216600923457863, + "learning_rate": 5.373572892593013e-06, + "loss": 0.0379, + "step": 3498 + }, + { + "epoch": 0.77, + "grad_norm": 0.22892613503596634, + "learning_rate": 5.363872473272935e-06, + "loss": 0.0222, + "step": 3499 + }, + { + "epoch": 0.77, + "grad_norm": 0.24876891199660664, + "learning_rate": 5.354179461386712e-06, + "loss": 0.039, + "step": 3500 + }, + { + "epoch": 0.77, + "grad_norm": 0.38444579333155693, + "learning_rate": 5.3444938618400524e-06, + "loss": 0.0455, + "step": 3501 + }, + { + "epoch": 0.77, + "grad_norm": 0.26324160755669274, + "learning_rate": 5.334815679534882e-06, + "loss": 0.0241, + "step": 3502 + }, + { + "epoch": 0.77, + "grad_norm": 0.24986183550745963, + "learning_rate": 5.325144919369398e-06, + "loss": 0.0287, + "step": 3503 + }, + { + "epoch": 0.77, + "grad_norm": 0.2715735803501177, + "learning_rate": 5.315481586238025e-06, + "loss": 0.0253, + "step": 3504 + }, + { + "epoch": 0.77, + "grad_norm": 0.2895472817426594, + "learning_rate": 5.305825685031445e-06, + "loss": 0.0296, + "step": 3505 + }, + { + "epoch": 0.77, + "grad_norm": 0.25753565576129356, + "learning_rate": 5.296177220636556e-06, + "loss": 0.0263, + "step": 3506 + }, + { + "epoch": 0.77, + "grad_norm": 0.25278871061247116, + "learning_rate": 5.286536197936512e-06, + "loss": 0.0349, + "step": 3507 + }, + { + "epoch": 0.77, + "grad_norm": 0.3275757120245091, + "learning_rate": 5.276902621810691e-06, + "loss": 0.0298, + "step": 3508 + }, + { + "epoch": 0.77, + "grad_norm": 0.2513118834223532, + "learning_rate": 5.267276497134715e-06, + "loss": 0.0292, + "step": 3509 + }, + { + "epoch": 0.77, + "grad_norm": 0.2804987463676603, + "learning_rate": 5.257657828780409e-06, + "loss": 0.0289, + "step": 3510 + }, + { + "epoch": 0.77, + "grad_norm": 0.2999060270032961, + "learning_rate": 5.2480466216158565e-06, + "loss": 0.0376, + "step": 3511 + }, + { + "epoch": 0.77, + "grad_norm": 0.2903810148621985, + "learning_rate": 5.2384428805053325e-06, + "loss": 0.0302, + "step": 3512 + }, + { + "epoch": 0.77, + "grad_norm": 0.3013708108225558, + "learning_rate": 5.228846610309359e-06, + "loss": 0.0365, + "step": 3513 + }, + { + "epoch": 0.77, + "grad_norm": 0.3061993089879895, + "learning_rate": 5.219257815884662e-06, + "loss": 0.0314, + "step": 3514 + }, + { + "epoch": 0.77, + "grad_norm": 0.3321456487709721, + "learning_rate": 5.209676502084191e-06, + "loss": 0.0341, + "step": 3515 + }, + { + "epoch": 0.77, + "grad_norm": 0.2771528150926492, + "learning_rate": 5.200102673757115e-06, + "loss": 0.033, + "step": 3516 + }, + { + "epoch": 0.77, + "grad_norm": 0.2362254842017043, + "learning_rate": 5.190536335748792e-06, + "loss": 0.0235, + "step": 3517 + }, + { + "epoch": 0.77, + "grad_norm": 0.32312161596950045, + "learning_rate": 5.180977492900823e-06, + "loss": 0.0386, + "step": 3518 + }, + { + "epoch": 0.77, + "grad_norm": 0.2725945558677096, + "learning_rate": 5.171426150050977e-06, + "loss": 0.0316, + "step": 3519 + }, + { + "epoch": 0.77, + "grad_norm": 0.35931874954513726, + "learning_rate": 5.161882312033264e-06, + "loss": 0.0579, + "step": 3520 + }, + { + "epoch": 0.77, + "grad_norm": 0.23317933272713304, + "learning_rate": 5.152345983677866e-06, + "loss": 0.0182, + "step": 3521 + }, + { + "epoch": 0.77, + "grad_norm": 0.24431754758865426, + "learning_rate": 5.142817169811189e-06, + "loss": 0.0259, + "step": 3522 + }, + { + "epoch": 0.77, + "grad_norm": 0.23309584279315684, + "learning_rate": 5.133295875255808e-06, + "loss": 0.0228, + "step": 3523 + }, + { + "epoch": 0.77, + "grad_norm": 0.24801257038945074, + "learning_rate": 5.1237821048305305e-06, + "loss": 0.0295, + "step": 3524 + }, + { + "epoch": 0.77, + "grad_norm": 0.22829640298379072, + "learning_rate": 5.114275863350313e-06, + "loss": 0.0222, + "step": 3525 + }, + { + "epoch": 0.77, + "grad_norm": 0.29244651024135737, + "learning_rate": 5.104777155626341e-06, + "loss": 0.0262, + "step": 3526 + }, + { + "epoch": 0.77, + "grad_norm": 0.2948691845239951, + "learning_rate": 5.095285986465952e-06, + "loss": 0.0307, + "step": 3527 + }, + { + "epoch": 0.77, + "grad_norm": 0.25266239963235654, + "learning_rate": 5.085802360672701e-06, + "loss": 0.0258, + "step": 3528 + }, + { + "epoch": 0.78, + "grad_norm": 0.19868677769673954, + "learning_rate": 5.076326283046291e-06, + "loss": 0.02, + "step": 3529 + }, + { + "epoch": 0.78, + "grad_norm": 0.361861026704966, + "learning_rate": 5.066857758382642e-06, + "loss": 0.0369, + "step": 3530 + }, + { + "epoch": 0.78, + "grad_norm": 0.2777206385309938, + "learning_rate": 5.057396791473807e-06, + "loss": 0.027, + "step": 3531 + }, + { + "epoch": 0.78, + "grad_norm": 0.24217204844498547, + "learning_rate": 5.047943387108072e-06, + "loss": 0.0343, + "step": 3532 + }, + { + "epoch": 0.78, + "grad_norm": 0.27128613046774036, + "learning_rate": 5.038497550069836e-06, + "loss": 0.0323, + "step": 3533 + }, + { + "epoch": 0.78, + "grad_norm": 0.25555773418961775, + "learning_rate": 5.029059285139715e-06, + "loss": 0.0259, + "step": 3534 + }, + { + "epoch": 0.78, + "grad_norm": 0.26456868282566975, + "learning_rate": 5.019628597094455e-06, + "loss": 0.0216, + "step": 3535 + }, + { + "epoch": 0.78, + "grad_norm": 0.1992101760964003, + "learning_rate": 5.010205490706998e-06, + "loss": 0.0241, + "step": 3536 + }, + { + "epoch": 0.78, + "grad_norm": 0.2213759420941348, + "learning_rate": 5.000789970746427e-06, + "loss": 0.0179, + "step": 3537 + }, + { + "epoch": 0.78, + "grad_norm": 0.3218004163420789, + "learning_rate": 4.9913820419780035e-06, + "loss": 0.0291, + "step": 3538 + }, + { + "epoch": 0.78, + "grad_norm": 0.24229933094903675, + "learning_rate": 4.981981709163126e-06, + "loss": 0.025, + "step": 3539 + }, + { + "epoch": 0.78, + "grad_norm": 0.26500260138134524, + "learning_rate": 4.972588977059369e-06, + "loss": 0.0312, + "step": 3540 + }, + { + "epoch": 0.78, + "grad_norm": 0.2412525558089232, + "learning_rate": 4.963203850420455e-06, + "loss": 0.0268, + "step": 3541 + }, + { + "epoch": 0.78, + "grad_norm": 0.2521682973733539, + "learning_rate": 4.953826333996243e-06, + "loss": 0.0292, + "step": 3542 + }, + { + "epoch": 0.78, + "grad_norm": 0.2608333813928269, + "learning_rate": 4.944456432532765e-06, + "loss": 0.0319, + "step": 3543 + }, + { + "epoch": 0.78, + "grad_norm": 0.2768863210021235, + "learning_rate": 4.93509415077217e-06, + "loss": 0.0426, + "step": 3544 + }, + { + "epoch": 0.78, + "grad_norm": 0.28003648051964447, + "learning_rate": 4.925739493452783e-06, + "loss": 0.0377, + "step": 3545 + }, + { + "epoch": 0.78, + "grad_norm": 0.2550543284744202, + "learning_rate": 4.916392465309037e-06, + "loss": 0.0293, + "step": 3546 + }, + { + "epoch": 0.78, + "grad_norm": 0.35349700415028396, + "learning_rate": 4.907053071071535e-06, + "loss": 0.0384, + "step": 3547 + }, + { + "epoch": 0.78, + "grad_norm": 0.2566989489840341, + "learning_rate": 4.89772131546699e-06, + "loss": 0.0259, + "step": 3548 + }, + { + "epoch": 0.78, + "grad_norm": 0.21212567254501036, + "learning_rate": 4.888397203218265e-06, + "loss": 0.0248, + "step": 3549 + }, + { + "epoch": 0.78, + "grad_norm": 0.28878337683718536, + "learning_rate": 4.879080739044351e-06, + "loss": 0.0309, + "step": 3550 + }, + { + "epoch": 0.78, + "grad_norm": 0.24755283025469316, + "learning_rate": 4.869771927660371e-06, + "loss": 0.0331, + "step": 3551 + }, + { + "epoch": 0.78, + "grad_norm": 0.21122812630478413, + "learning_rate": 4.860470773777566e-06, + "loss": 0.0178, + "step": 3552 + }, + { + "epoch": 0.78, + "grad_norm": 0.2693034328794336, + "learning_rate": 4.851177282103312e-06, + "loss": 0.0347, + "step": 3553 + }, + { + "epoch": 0.78, + "grad_norm": 0.23671622762815486, + "learning_rate": 4.841891457341095e-06, + "loss": 0.0317, + "step": 3554 + }, + { + "epoch": 0.78, + "grad_norm": 0.26564499835858235, + "learning_rate": 4.832613304190537e-06, + "loss": 0.0306, + "step": 3555 + }, + { + "epoch": 0.78, + "grad_norm": 0.276596284408431, + "learning_rate": 4.823342827347357e-06, + "loss": 0.0291, + "step": 3556 + }, + { + "epoch": 0.78, + "grad_norm": 0.2284591882249527, + "learning_rate": 4.814080031503407e-06, + "loss": 0.0222, + "step": 3557 + }, + { + "epoch": 0.78, + "grad_norm": 0.288734481898206, + "learning_rate": 4.804824921346645e-06, + "loss": 0.0365, + "step": 3558 + }, + { + "epoch": 0.78, + "grad_norm": 0.3201243964751489, + "learning_rate": 4.795577501561144e-06, + "loss": 0.0397, + "step": 3559 + }, + { + "epoch": 0.78, + "grad_norm": 0.24722216014706502, + "learning_rate": 4.786337776827066e-06, + "loss": 0.0272, + "step": 3560 + }, + { + "epoch": 0.78, + "grad_norm": 0.217336635706269, + "learning_rate": 4.777105751820708e-06, + "loss": 0.0205, + "step": 3561 + }, + { + "epoch": 0.78, + "grad_norm": 0.22388667061618933, + "learning_rate": 4.767881431214441e-06, + "loss": 0.0247, + "step": 3562 + }, + { + "epoch": 0.78, + "grad_norm": 0.2911109512169312, + "learning_rate": 4.758664819676759e-06, + "loss": 0.0424, + "step": 3563 + }, + { + "epoch": 0.78, + "grad_norm": 0.2558790738042841, + "learning_rate": 4.7494559218722395e-06, + "loss": 0.0295, + "step": 3564 + }, + { + "epoch": 0.78, + "grad_norm": 0.271038856225442, + "learning_rate": 4.74025474246157e-06, + "loss": 0.0316, + "step": 3565 + }, + { + "epoch": 0.78, + "grad_norm": 0.21785685733395657, + "learning_rate": 4.7310612861015125e-06, + "loss": 0.0276, + "step": 3566 + }, + { + "epoch": 0.78, + "grad_norm": 0.3025124505530959, + "learning_rate": 4.7218755574449394e-06, + "loss": 0.0348, + "step": 3567 + }, + { + "epoch": 0.78, + "grad_norm": 0.2646960535885467, + "learning_rate": 4.712697561140802e-06, + "loss": 0.0415, + "step": 3568 + }, + { + "epoch": 0.78, + "grad_norm": 0.2712372677535224, + "learning_rate": 4.703527301834148e-06, + "loss": 0.0301, + "step": 3569 + }, + { + "epoch": 0.78, + "grad_norm": 0.3025306754676668, + "learning_rate": 4.69436478416609e-06, + "loss": 0.0361, + "step": 3570 + }, + { + "epoch": 0.78, + "grad_norm": 0.2173128089024891, + "learning_rate": 4.685210012773844e-06, + "loss": 0.0255, + "step": 3571 + }, + { + "epoch": 0.78, + "grad_norm": 0.279395026186865, + "learning_rate": 4.676062992290686e-06, + "loss": 0.0243, + "step": 3572 + }, + { + "epoch": 0.78, + "grad_norm": 0.3091795467682432, + "learning_rate": 4.666923727345991e-06, + "loss": 0.0309, + "step": 3573 + }, + { + "epoch": 0.78, + "grad_norm": 0.2500044405065996, + "learning_rate": 4.657792222565185e-06, + "loss": 0.0299, + "step": 3574 + }, + { + "epoch": 0.79, + "grad_norm": 0.27663942010892373, + "learning_rate": 4.6486684825697845e-06, + "loss": 0.0426, + "step": 3575 + }, + { + "epoch": 0.79, + "grad_norm": 0.32554082300480536, + "learning_rate": 4.639552511977374e-06, + "loss": 0.0415, + "step": 3576 + }, + { + "epoch": 0.79, + "grad_norm": 0.23337846135799326, + "learning_rate": 4.630444315401594e-06, + "loss": 0.0297, + "step": 3577 + }, + { + "epoch": 0.79, + "grad_norm": 0.21699514000752693, + "learning_rate": 4.621343897452169e-06, + "loss": 0.0246, + "step": 3578 + }, + { + "epoch": 0.79, + "grad_norm": 0.2296995565273345, + "learning_rate": 4.612251262734864e-06, + "loss": 0.0282, + "step": 3579 + }, + { + "epoch": 0.79, + "grad_norm": 0.2650624798273476, + "learning_rate": 4.603166415851527e-06, + "loss": 0.0381, + "step": 3580 + }, + { + "epoch": 0.79, + "grad_norm": 0.24872530874554657, + "learning_rate": 4.594089361400047e-06, + "loss": 0.0313, + "step": 3581 + }, + { + "epoch": 0.79, + "grad_norm": 0.22568069053377085, + "learning_rate": 4.585020103974387e-06, + "loss": 0.0187, + "step": 3582 + }, + { + "epoch": 0.79, + "grad_norm": 0.2579961469440179, + "learning_rate": 4.575958648164536e-06, + "loss": 0.0306, + "step": 3583 + }, + { + "epoch": 0.79, + "grad_norm": 0.25226387554943125, + "learning_rate": 4.5669049985565735e-06, + "loss": 0.0305, + "step": 3584 + }, + { + "epoch": 0.79, + "grad_norm": 0.2301970755656676, + "learning_rate": 4.5578591597325935e-06, + "loss": 0.0235, + "step": 3585 + }, + { + "epoch": 0.79, + "grad_norm": 0.273026429858453, + "learning_rate": 4.54882113627076e-06, + "loss": 0.0295, + "step": 3586 + }, + { + "epoch": 0.79, + "grad_norm": 0.25671518772600055, + "learning_rate": 4.53979093274526e-06, + "loss": 0.0295, + "step": 3587 + }, + { + "epoch": 0.79, + "grad_norm": 0.24103823359464405, + "learning_rate": 4.530768553726348e-06, + "loss": 0.0281, + "step": 3588 + }, + { + "epoch": 0.79, + "grad_norm": 0.2895857997258674, + "learning_rate": 4.521754003780294e-06, + "loss": 0.0335, + "step": 3589 + }, + { + "epoch": 0.79, + "grad_norm": 0.23744475387192743, + "learning_rate": 4.512747287469426e-06, + "loss": 0.0282, + "step": 3590 + }, + { + "epoch": 0.79, + "grad_norm": 0.22834568551656584, + "learning_rate": 4.503748409352089e-06, + "loss": 0.0291, + "step": 3591 + }, + { + "epoch": 0.79, + "grad_norm": 0.24846882112866903, + "learning_rate": 4.494757373982674e-06, + "loss": 0.0255, + "step": 3592 + }, + { + "epoch": 0.79, + "grad_norm": 0.24942828909284545, + "learning_rate": 4.4857741859116024e-06, + "loss": 0.028, + "step": 3593 + }, + { + "epoch": 0.79, + "grad_norm": 0.2613375150307862, + "learning_rate": 4.476798849685322e-06, + "loss": 0.0219, + "step": 3594 + }, + { + "epoch": 0.79, + "grad_norm": 0.26663221804814036, + "learning_rate": 4.467831369846301e-06, + "loss": 0.0359, + "step": 3595 + }, + { + "epoch": 0.79, + "grad_norm": 0.21777006753956984, + "learning_rate": 4.458871750933038e-06, + "loss": 0.0219, + "step": 3596 + }, + { + "epoch": 0.79, + "grad_norm": 0.25977810134181817, + "learning_rate": 4.449919997480047e-06, + "loss": 0.0283, + "step": 3597 + }, + { + "epoch": 0.79, + "grad_norm": 0.276873794837206, + "learning_rate": 4.4409761140178765e-06, + "loss": 0.026, + "step": 3598 + }, + { + "epoch": 0.79, + "grad_norm": 0.31331411650670776, + "learning_rate": 4.432040105073065e-06, + "loss": 0.0369, + "step": 3599 + }, + { + "epoch": 0.79, + "grad_norm": 0.25877502691246485, + "learning_rate": 4.4231119751681885e-06, + "loss": 0.0414, + "step": 3600 + }, + { + "epoch": 0.79, + "grad_norm": 0.2487674519392905, + "learning_rate": 4.414191728821838e-06, + "loss": 0.0343, + "step": 3601 + }, + { + "epoch": 0.79, + "grad_norm": 0.234888951728711, + "learning_rate": 4.405279370548587e-06, + "loss": 0.0313, + "step": 3602 + }, + { + "epoch": 0.79, + "grad_norm": 0.2263207543502551, + "learning_rate": 4.396374904859051e-06, + "loss": 0.0324, + "step": 3603 + }, + { + "epoch": 0.79, + "grad_norm": 0.34164101452982987, + "learning_rate": 4.387478336259821e-06, + "loss": 0.0471, + "step": 3604 + }, + { + "epoch": 0.79, + "grad_norm": 0.2783808859779696, + "learning_rate": 4.3785896692535165e-06, + "loss": 0.0417, + "step": 3605 + }, + { + "epoch": 0.79, + "grad_norm": 0.3006577671775209, + "learning_rate": 4.369708908338735e-06, + "loss": 0.0285, + "step": 3606 + }, + { + "epoch": 0.79, + "grad_norm": 0.20801565665914815, + "learning_rate": 4.360836058010096e-06, + "loss": 0.0216, + "step": 3607 + }, + { + "epoch": 0.79, + "grad_norm": 0.2478282971656624, + "learning_rate": 4.351971122758194e-06, + "loss": 0.0242, + "step": 3608 + }, + { + "epoch": 0.79, + "grad_norm": 0.30743639038019077, + "learning_rate": 4.343114107069628e-06, + "loss": 0.0414, + "step": 3609 + }, + { + "epoch": 0.79, + "grad_norm": 0.28202020301212055, + "learning_rate": 4.334265015426993e-06, + "loss": 0.0363, + "step": 3610 + }, + { + "epoch": 0.79, + "grad_norm": 0.3470685577777562, + "learning_rate": 4.3254238523088695e-06, + "loss": 0.0313, + "step": 3611 + }, + { + "epoch": 0.79, + "grad_norm": 0.28720296255665057, + "learning_rate": 4.316590622189815e-06, + "loss": 0.0329, + "step": 3612 + }, + { + "epoch": 0.79, + "grad_norm": 0.2573910794628964, + "learning_rate": 4.307765329540394e-06, + "loss": 0.0252, + "step": 3613 + }, + { + "epoch": 0.79, + "grad_norm": 0.23434111032246852, + "learning_rate": 4.298947978827128e-06, + "loss": 0.033, + "step": 3614 + }, + { + "epoch": 0.79, + "grad_norm": 0.28824083722117905, + "learning_rate": 4.290138574512546e-06, + "loss": 0.0279, + "step": 3615 + }, + { + "epoch": 0.79, + "grad_norm": 0.24409707253133334, + "learning_rate": 4.2813371210551294e-06, + "loss": 0.0283, + "step": 3616 + }, + { + "epoch": 0.79, + "grad_norm": 0.21835811457671117, + "learning_rate": 4.272543622909355e-06, + "loss": 0.0252, + "step": 3617 + }, + { + "epoch": 0.79, + "grad_norm": 0.1936975565814078, + "learning_rate": 4.263758084525656e-06, + "loss": 0.0187, + "step": 3618 + }, + { + "epoch": 0.79, + "grad_norm": 0.27033475993678024, + "learning_rate": 4.254980510350464e-06, + "loss": 0.0257, + "step": 3619 + }, + { + "epoch": 0.8, + "grad_norm": 0.2880795106332577, + "learning_rate": 4.246210904826149e-06, + "loss": 0.0263, + "step": 3620 + }, + { + "epoch": 0.8, + "grad_norm": 0.21054254750349924, + "learning_rate": 4.237449272391072e-06, + "loss": 0.0221, + "step": 3621 + }, + { + "epoch": 0.8, + "grad_norm": 0.23125726971927127, + "learning_rate": 4.228695617479541e-06, + "loss": 0.0217, + "step": 3622 + }, + { + "epoch": 0.8, + "grad_norm": 0.22920103881009266, + "learning_rate": 4.219949944521842e-06, + "loss": 0.0282, + "step": 3623 + }, + { + "epoch": 0.8, + "grad_norm": 0.2115455622915119, + "learning_rate": 4.2112122579442015e-06, + "loss": 0.0296, + "step": 3624 + }, + { + "epoch": 0.8, + "grad_norm": 0.2582877143193991, + "learning_rate": 4.202482562168832e-06, + "loss": 0.0284, + "step": 3625 + }, + { + "epoch": 0.8, + "grad_norm": 0.257448546398354, + "learning_rate": 4.193760861613865e-06, + "loss": 0.0287, + "step": 3626 + }, + { + "epoch": 0.8, + "grad_norm": 0.2342688001988582, + "learning_rate": 4.185047160693432e-06, + "loss": 0.0344, + "step": 3627 + }, + { + "epoch": 0.8, + "grad_norm": 0.19579876449193206, + "learning_rate": 4.176341463817573e-06, + "loss": 0.0227, + "step": 3628 + }, + { + "epoch": 0.8, + "grad_norm": 0.20072648826742953, + "learning_rate": 4.167643775392305e-06, + "loss": 0.0227, + "step": 3629 + }, + { + "epoch": 0.8, + "grad_norm": 0.27461042878774905, + "learning_rate": 4.1589540998195695e-06, + "loss": 0.0346, + "step": 3630 + }, + { + "epoch": 0.8, + "grad_norm": 0.23646947085972797, + "learning_rate": 4.150272441497276e-06, + "loss": 0.0307, + "step": 3631 + }, + { + "epoch": 0.8, + "grad_norm": 0.2571087392858843, + "learning_rate": 4.141598804819256e-06, + "loss": 0.0355, + "step": 3632 + }, + { + "epoch": 0.8, + "grad_norm": 0.20020142522644224, + "learning_rate": 4.132933194175299e-06, + "loss": 0.0203, + "step": 3633 + }, + { + "epoch": 0.8, + "grad_norm": 0.40853096605429784, + "learning_rate": 4.124275613951114e-06, + "loss": 0.0491, + "step": 3634 + }, + { + "epoch": 0.8, + "grad_norm": 0.36319887601451634, + "learning_rate": 4.115626068528362e-06, + "loss": 0.0279, + "step": 3635 + }, + { + "epoch": 0.8, + "grad_norm": 0.27277745555992244, + "learning_rate": 4.106984562284633e-06, + "loss": 0.0317, + "step": 3636 + }, + { + "epoch": 0.8, + "grad_norm": 0.21533601026253893, + "learning_rate": 4.0983510995934365e-06, + "loss": 0.0253, + "step": 3637 + }, + { + "epoch": 0.8, + "grad_norm": 0.28834596956729397, + "learning_rate": 4.089725684824235e-06, + "loss": 0.029, + "step": 3638 + }, + { + "epoch": 0.8, + "grad_norm": 0.23269264092864095, + "learning_rate": 4.081108322342389e-06, + "loss": 0.0249, + "step": 3639 + }, + { + "epoch": 0.8, + "grad_norm": 0.2555612909769337, + "learning_rate": 4.07249901650921e-06, + "loss": 0.0271, + "step": 3640 + }, + { + "epoch": 0.8, + "grad_norm": 0.32616875698403835, + "learning_rate": 4.0638977716819105e-06, + "loss": 0.0393, + "step": 3641 + }, + { + "epoch": 0.8, + "grad_norm": 0.22646813236034966, + "learning_rate": 4.055304592213645e-06, + "loss": 0.0183, + "step": 3642 + }, + { + "epoch": 0.8, + "grad_norm": 0.2554617697085855, + "learning_rate": 4.046719482453461e-06, + "loss": 0.0242, + "step": 3643 + }, + { + "epoch": 0.8, + "grad_norm": 0.32966515704916, + "learning_rate": 4.038142446746342e-06, + "loss": 0.0483, + "step": 3644 + }, + { + "epoch": 0.8, + "grad_norm": 0.22952046321610411, + "learning_rate": 4.029573489433179e-06, + "loss": 0.0205, + "step": 3645 + }, + { + "epoch": 0.8, + "grad_norm": 0.17493135242590682, + "learning_rate": 4.021012614850779e-06, + "loss": 0.0172, + "step": 3646 + }, + { + "epoch": 0.8, + "grad_norm": 0.1928107541498316, + "learning_rate": 4.012459827331841e-06, + "loss": 0.0206, + "step": 3647 + }, + { + "epoch": 0.8, + "grad_norm": 0.2356499327867729, + "learning_rate": 4.003915131204996e-06, + "loss": 0.0277, + "step": 3648 + }, + { + "epoch": 0.8, + "grad_norm": 0.39366916856177986, + "learning_rate": 3.995378530794754e-06, + "loss": 0.05, + "step": 3649 + }, + { + "epoch": 0.8, + "grad_norm": 0.23756317691259934, + "learning_rate": 3.986850030421554e-06, + "loss": 0.0241, + "step": 3650 + }, + { + "epoch": 0.8, + "grad_norm": 0.29357554855893153, + "learning_rate": 3.97832963440171e-06, + "loss": 0.0287, + "step": 3651 + }, + { + "epoch": 0.8, + "grad_norm": 0.2522710228865532, + "learning_rate": 3.969817347047451e-06, + "loss": 0.0264, + "step": 3652 + }, + { + "epoch": 0.8, + "grad_norm": 0.2546196525397989, + "learning_rate": 3.961313172666898e-06, + "loss": 0.0284, + "step": 3653 + }, + { + "epoch": 0.8, + "grad_norm": 0.19070338607451412, + "learning_rate": 3.952817115564076e-06, + "loss": 0.0169, + "step": 3654 + }, + { + "epoch": 0.8, + "grad_norm": 0.2269978205123538, + "learning_rate": 3.944329180038875e-06, + "loss": 0.0285, + "step": 3655 + }, + { + "epoch": 0.8, + "grad_norm": 0.2821964657057303, + "learning_rate": 3.935849370387104e-06, + "loss": 0.0272, + "step": 3656 + }, + { + "epoch": 0.8, + "grad_norm": 0.22076824375348655, + "learning_rate": 3.927377690900436e-06, + "loss": 0.0247, + "step": 3657 + }, + { + "epoch": 0.8, + "grad_norm": 0.21780141300077158, + "learning_rate": 3.91891414586645e-06, + "loss": 0.0282, + "step": 3658 + }, + { + "epoch": 0.8, + "grad_norm": 0.3060406446126063, + "learning_rate": 3.91045873956859e-06, + "loss": 0.0352, + "step": 3659 + }, + { + "epoch": 0.8, + "grad_norm": 0.2893880608826457, + "learning_rate": 3.902011476286196e-06, + "loss": 0.0318, + "step": 3660 + }, + { + "epoch": 0.8, + "grad_norm": 0.25120418334859473, + "learning_rate": 3.893572360294471e-06, + "loss": 0.0281, + "step": 3661 + }, + { + "epoch": 0.8, + "grad_norm": 0.27315269351638816, + "learning_rate": 3.885141395864509e-06, + "loss": 0.0232, + "step": 3662 + }, + { + "epoch": 0.8, + "grad_norm": 0.27229528279275966, + "learning_rate": 3.876718587263278e-06, + "loss": 0.0318, + "step": 3663 + }, + { + "epoch": 0.8, + "grad_norm": 0.2782702484612129, + "learning_rate": 3.868303938753599e-06, + "loss": 0.0335, + "step": 3664 + }, + { + "epoch": 0.8, + "grad_norm": 0.18907673797905694, + "learning_rate": 3.859897454594192e-06, + "loss": 0.0235, + "step": 3665 + }, + { + "epoch": 0.81, + "grad_norm": 0.23696425237289911, + "learning_rate": 3.851499139039618e-06, + "loss": 0.0255, + "step": 3666 + }, + { + "epoch": 0.81, + "grad_norm": 0.2530445441962811, + "learning_rate": 3.843108996340323e-06, + "loss": 0.0231, + "step": 3667 + }, + { + "epoch": 0.81, + "grad_norm": 0.1827468819515304, + "learning_rate": 3.834727030742613e-06, + "loss": 0.0173, + "step": 3668 + }, + { + "epoch": 0.81, + "grad_norm": 0.25870133838473247, + "learning_rate": 3.826353246488641e-06, + "loss": 0.0331, + "step": 3669 + }, + { + "epoch": 0.81, + "grad_norm": 0.26622786548554345, + "learning_rate": 3.817987647816437e-06, + "loss": 0.0368, + "step": 3670 + }, + { + "epoch": 0.81, + "grad_norm": 0.32275130924443896, + "learning_rate": 3.809630238959887e-06, + "loss": 0.0367, + "step": 3671 + }, + { + "epoch": 0.81, + "grad_norm": 0.25701007813794574, + "learning_rate": 3.8012810241487175e-06, + "loss": 0.0246, + "step": 3672 + }, + { + "epoch": 0.81, + "grad_norm": 0.22454690479132108, + "learning_rate": 3.7929400076085255e-06, + "loss": 0.0302, + "step": 3673 + }, + { + "epoch": 0.81, + "grad_norm": 0.26587275563033913, + "learning_rate": 3.7846071935607408e-06, + "loss": 0.0326, + "step": 3674 + }, + { + "epoch": 0.81, + "grad_norm": 0.2474751604624967, + "learning_rate": 3.7762825862226637e-06, + "loss": 0.0312, + "step": 3675 + }, + { + "epoch": 0.81, + "grad_norm": 0.30273048644491024, + "learning_rate": 3.767966189807415e-06, + "loss": 0.0369, + "step": 3676 + }, + { + "epoch": 0.81, + "grad_norm": 0.2579598294846661, + "learning_rate": 3.7596580085239897e-06, + "loss": 0.0294, + "step": 3677 + }, + { + "epoch": 0.81, + "grad_norm": 0.2648196493565275, + "learning_rate": 3.7513580465771893e-06, + "loss": 0.0258, + "step": 3678 + }, + { + "epoch": 0.81, + "grad_norm": 0.3436867699123688, + "learning_rate": 3.7430663081676977e-06, + "loss": 0.0559, + "step": 3679 + }, + { + "epoch": 0.81, + "grad_norm": 0.3231971801587294, + "learning_rate": 3.734782797491998e-06, + "loss": 0.0412, + "step": 3680 + }, + { + "epoch": 0.81, + "grad_norm": 0.21947458017702118, + "learning_rate": 3.7265075187424373e-06, + "loss": 0.0209, + "step": 3681 + }, + { + "epoch": 0.81, + "grad_norm": 0.23357392740506586, + "learning_rate": 3.7182404761071735e-06, + "loss": 0.0304, + "step": 3682 + }, + { + "epoch": 0.81, + "grad_norm": 0.23760398389086507, + "learning_rate": 3.7099816737702197e-06, + "loss": 0.0267, + "step": 3683 + }, + { + "epoch": 0.81, + "grad_norm": 0.29003768817820047, + "learning_rate": 3.7017311159113956e-06, + "loss": 0.0271, + "step": 3684 + }, + { + "epoch": 0.81, + "grad_norm": 0.24743537921386893, + "learning_rate": 3.6934888067063667e-06, + "loss": 0.0256, + "step": 3685 + }, + { + "epoch": 0.81, + "grad_norm": 0.2274675625296988, + "learning_rate": 3.68525475032661e-06, + "loss": 0.0284, + "step": 3686 + }, + { + "epoch": 0.81, + "grad_norm": 0.2360116794638995, + "learning_rate": 3.677028950939434e-06, + "loss": 0.023, + "step": 3687 + }, + { + "epoch": 0.81, + "grad_norm": 0.24521057659949916, + "learning_rate": 3.6688114127079665e-06, + "loss": 0.0228, + "step": 3688 + }, + { + "epoch": 0.81, + "grad_norm": 0.2635946796584002, + "learning_rate": 3.6606021397911605e-06, + "loss": 0.0273, + "step": 3689 + }, + { + "epoch": 0.81, + "grad_norm": 0.24556618863648857, + "learning_rate": 3.652401136343768e-06, + "loss": 0.03, + "step": 3690 + }, + { + "epoch": 0.81, + "grad_norm": 0.2609608908122948, + "learning_rate": 3.6442084065163784e-06, + "loss": 0.0344, + "step": 3691 + }, + { + "epoch": 0.81, + "grad_norm": 0.28327570018181836, + "learning_rate": 3.636023954455372e-06, + "loss": 0.0301, + "step": 3692 + }, + { + "epoch": 0.81, + "grad_norm": 0.23041111823593596, + "learning_rate": 3.6278477843029603e-06, + "loss": 0.0287, + "step": 3693 + }, + { + "epoch": 0.81, + "grad_norm": 0.2625289730703892, + "learning_rate": 3.6196799001971416e-06, + "loss": 0.0256, + "step": 3694 + }, + { + "epoch": 0.81, + "grad_norm": 0.1630802210645409, + "learning_rate": 3.6115203062717386e-06, + "loss": 0.0228, + "step": 3695 + }, + { + "epoch": 0.81, + "grad_norm": 0.3190133451470103, + "learning_rate": 3.6033690066563765e-06, + "loss": 0.0414, + "step": 3696 + }, + { + "epoch": 0.81, + "grad_norm": 0.2435520707321006, + "learning_rate": 3.5952260054764663e-06, + "loss": 0.0283, + "step": 3697 + }, + { + "epoch": 0.81, + "grad_norm": 0.22822638897501876, + "learning_rate": 3.5870913068532455e-06, + "loss": 0.0257, + "step": 3698 + }, + { + "epoch": 0.81, + "grad_norm": 0.2763722961816296, + "learning_rate": 3.5789649149037197e-06, + "loss": 0.0305, + "step": 3699 + }, + { + "epoch": 0.81, + "grad_norm": 0.2695684269130349, + "learning_rate": 3.5708468337407177e-06, + "loss": 0.0291, + "step": 3700 + }, + { + "epoch": 0.81, + "grad_norm": 0.20398215857725774, + "learning_rate": 3.562737067472841e-06, + "loss": 0.0195, + "step": 3701 + }, + { + "epoch": 0.81, + "grad_norm": 0.2502541740080814, + "learning_rate": 3.554635620204503e-06, + "loss": 0.0315, + "step": 3702 + }, + { + "epoch": 0.81, + "grad_norm": 0.24771777420698718, + "learning_rate": 3.546542496035883e-06, + "loss": 0.0328, + "step": 3703 + }, + { + "epoch": 0.81, + "grad_norm": 0.25344037083741355, + "learning_rate": 3.5384576990629672e-06, + "loss": 0.0224, + "step": 3704 + }, + { + "epoch": 0.81, + "grad_norm": 0.26966825403295247, + "learning_rate": 3.53038123337752e-06, + "loss": 0.0457, + "step": 3705 + }, + { + "epoch": 0.81, + "grad_norm": 0.4245201824319466, + "learning_rate": 3.5223131030670942e-06, + "loss": 0.0387, + "step": 3706 + }, + { + "epoch": 0.81, + "grad_norm": 0.2605416601937401, + "learning_rate": 3.5142533122150147e-06, + "loss": 0.0341, + "step": 3707 + }, + { + "epoch": 0.81, + "grad_norm": 0.2730210811890883, + "learning_rate": 3.506201864900396e-06, + "loss": 0.0277, + "step": 3708 + }, + { + "epoch": 0.81, + "grad_norm": 0.2157021971740995, + "learning_rate": 3.4981587651981185e-06, + "loss": 0.0288, + "step": 3709 + }, + { + "epoch": 0.81, + "grad_norm": 0.2760890194326243, + "learning_rate": 3.490124017178851e-06, + "loss": 0.0322, + "step": 3710 + }, + { + "epoch": 0.82, + "grad_norm": 0.27453212981064085, + "learning_rate": 3.482097624909022e-06, + "loss": 0.0333, + "step": 3711 + }, + { + "epoch": 0.82, + "grad_norm": 0.253931058992105, + "learning_rate": 3.474079592450845e-06, + "loss": 0.0282, + "step": 3712 + }, + { + "epoch": 0.82, + "grad_norm": 0.24944489661127137, + "learning_rate": 3.466069923862283e-06, + "loss": 0.0248, + "step": 3713 + }, + { + "epoch": 0.82, + "grad_norm": 0.2315743324149911, + "learning_rate": 3.458068623197097e-06, + "loss": 0.0299, + "step": 3714 + }, + { + "epoch": 0.82, + "grad_norm": 0.23188678469876328, + "learning_rate": 3.4500756945047774e-06, + "loss": 0.0175, + "step": 3715 + }, + { + "epoch": 0.82, + "grad_norm": 0.23487875366002967, + "learning_rate": 3.442091141830608e-06, + "loss": 0.026, + "step": 3716 + }, + { + "epoch": 0.82, + "grad_norm": 0.2520699045582025, + "learning_rate": 3.4341149692156074e-06, + "loss": 0.0262, + "step": 3717 + }, + { + "epoch": 0.82, + "grad_norm": 0.22399663690028476, + "learning_rate": 3.426147180696577e-06, + "loss": 0.0334, + "step": 3718 + }, + { + "epoch": 0.82, + "grad_norm": 0.25049011051335446, + "learning_rate": 3.4181877803060528e-06, + "loss": 0.0299, + "step": 3719 + }, + { + "epoch": 0.82, + "grad_norm": 0.2646873698859137, + "learning_rate": 3.4102367720723438e-06, + "loss": 0.0315, + "step": 3720 + }, + { + "epoch": 0.82, + "grad_norm": 0.2442497314795245, + "learning_rate": 3.402294160019499e-06, + "loss": 0.0297, + "step": 3721 + }, + { + "epoch": 0.82, + "grad_norm": 0.3376258407979168, + "learning_rate": 3.394359948167325e-06, + "loss": 0.0407, + "step": 3722 + }, + { + "epoch": 0.82, + "grad_norm": 0.2851709205302547, + "learning_rate": 3.386434140531378e-06, + "loss": 0.0291, + "step": 3723 + }, + { + "epoch": 0.82, + "grad_norm": 0.3110857792115305, + "learning_rate": 3.3785167411229523e-06, + "loss": 0.0428, + "step": 3724 + }, + { + "epoch": 0.82, + "grad_norm": 0.24018609717101114, + "learning_rate": 3.3706077539490933e-06, + "loss": 0.0259, + "step": 3725 + }, + { + "epoch": 0.82, + "grad_norm": 0.24668818539972917, + "learning_rate": 3.362707183012597e-06, + "loss": 0.0232, + "step": 3726 + }, + { + "epoch": 0.82, + "grad_norm": 0.23018877351239234, + "learning_rate": 3.354815032311978e-06, + "loss": 0.0303, + "step": 3727 + }, + { + "epoch": 0.82, + "grad_norm": 0.22833097773982017, + "learning_rate": 3.34693130584151e-06, + "loss": 0.0215, + "step": 3728 + }, + { + "epoch": 0.82, + "grad_norm": 0.24650305059085142, + "learning_rate": 3.3390560075911906e-06, + "loss": 0.0277, + "step": 3729 + }, + { + "epoch": 0.82, + "grad_norm": 0.17666722183901298, + "learning_rate": 3.331189141546758e-06, + "loss": 0.0202, + "step": 3730 + }, + { + "epoch": 0.82, + "grad_norm": 0.26592299631376487, + "learning_rate": 3.3233307116896874e-06, + "loss": 0.0443, + "step": 3731 + }, + { + "epoch": 0.82, + "grad_norm": 0.22041344832436627, + "learning_rate": 3.3154807219971684e-06, + "loss": 0.024, + "step": 3732 + }, + { + "epoch": 0.82, + "grad_norm": 0.34140833978428003, + "learning_rate": 3.307639176442137e-06, + "loss": 0.0325, + "step": 3733 + }, + { + "epoch": 0.82, + "grad_norm": 0.23034179552941636, + "learning_rate": 3.299806078993242e-06, + "loss": 0.0202, + "step": 3734 + }, + { + "epoch": 0.82, + "grad_norm": 0.1835631081191456, + "learning_rate": 3.2919814336148657e-06, + "loss": 0.0205, + "step": 3735 + }, + { + "epoch": 0.82, + "grad_norm": 0.20295680858908874, + "learning_rate": 3.2841652442671033e-06, + "loss": 0.0222, + "step": 3736 + }, + { + "epoch": 0.82, + "grad_norm": 0.2775041089741753, + "learning_rate": 3.276357514905788e-06, + "loss": 0.0322, + "step": 3737 + }, + { + "epoch": 0.82, + "grad_norm": 0.22677679450652197, + "learning_rate": 3.2685582494824386e-06, + "loss": 0.0285, + "step": 3738 + }, + { + "epoch": 0.82, + "grad_norm": 0.28460758151071236, + "learning_rate": 3.260767451944338e-06, + "loss": 0.0383, + "step": 3739 + }, + { + "epoch": 0.82, + "grad_norm": 0.23390609406113763, + "learning_rate": 3.252985126234434e-06, + "loss": 0.0309, + "step": 3740 + }, + { + "epoch": 0.82, + "grad_norm": 0.20212903136937763, + "learning_rate": 3.245211276291427e-06, + "loss": 0.0258, + "step": 3741 + }, + { + "epoch": 0.82, + "grad_norm": 0.26395564880049716, + "learning_rate": 3.237445906049694e-06, + "loss": 0.0457, + "step": 3742 + }, + { + "epoch": 0.82, + "grad_norm": 0.29903038510175467, + "learning_rate": 3.229689019439348e-06, + "loss": 0.0315, + "step": 3743 + }, + { + "epoch": 0.82, + "grad_norm": 0.25464142224590275, + "learning_rate": 3.2219406203861903e-06, + "loss": 0.0321, + "step": 3744 + }, + { + "epoch": 0.82, + "grad_norm": 0.28081431020683456, + "learning_rate": 3.2142007128117393e-06, + "loss": 0.0265, + "step": 3745 + }, + { + "epoch": 0.82, + "grad_norm": 0.281312485217518, + "learning_rate": 3.2064693006332013e-06, + "loss": 0.028, + "step": 3746 + }, + { + "epoch": 0.82, + "grad_norm": 0.22573948873242403, + "learning_rate": 3.1987463877634962e-06, + "loss": 0.0204, + "step": 3747 + }, + { + "epoch": 0.82, + "grad_norm": 0.22625983509268627, + "learning_rate": 3.1910319781112364e-06, + "loss": 0.0235, + "step": 3748 + }, + { + "epoch": 0.82, + "grad_norm": 0.2566258691874267, + "learning_rate": 3.1833260755807392e-06, + "loss": 0.0261, + "step": 3749 + }, + { + "epoch": 0.82, + "grad_norm": 0.216310359009026, + "learning_rate": 3.1756286840719987e-06, + "loss": 0.0293, + "step": 3750 + }, + { + "epoch": 0.82, + "grad_norm": 0.2803729209720778, + "learning_rate": 3.16793980748072e-06, + "loss": 0.0373, + "step": 3751 + }, + { + "epoch": 0.82, + "grad_norm": 0.2628738760749563, + "learning_rate": 3.160259449698282e-06, + "loss": 0.0312, + "step": 3752 + }, + { + "epoch": 0.82, + "grad_norm": 0.23420699774006093, + "learning_rate": 3.1525876146117707e-06, + "loss": 0.0315, + "step": 3753 + }, + { + "epoch": 0.82, + "grad_norm": 0.2373883684596302, + "learning_rate": 3.144924306103938e-06, + "loss": 0.0253, + "step": 3754 + }, + { + "epoch": 0.82, + "grad_norm": 0.21807232689411402, + "learning_rate": 3.1372695280532415e-06, + "loss": 0.0314, + "step": 3755 + }, + { + "epoch": 0.82, + "grad_norm": 0.26653987743804225, + "learning_rate": 3.129623284333805e-06, + "loss": 0.0357, + "step": 3756 + }, + { + "epoch": 0.83, + "grad_norm": 0.25989470465701053, + "learning_rate": 3.1219855788154385e-06, + "loss": 0.0293, + "step": 3757 + }, + { + "epoch": 0.83, + "grad_norm": 0.23634236663278327, + "learning_rate": 3.1143564153636395e-06, + "loss": 0.0179, + "step": 3758 + }, + { + "epoch": 0.83, + "grad_norm": 0.23741298135441422, + "learning_rate": 3.1067357978395663e-06, + "loss": 0.0263, + "step": 3759 + }, + { + "epoch": 0.83, + "grad_norm": 0.20682910731405757, + "learning_rate": 3.0991237301000664e-06, + "loss": 0.0218, + "step": 3760 + }, + { + "epoch": 0.83, + "grad_norm": 0.26802454553833704, + "learning_rate": 3.0915202159976453e-06, + "loss": 0.0309, + "step": 3761 + }, + { + "epoch": 0.83, + "grad_norm": 0.21710761028010478, + "learning_rate": 3.083925259380498e-06, + "loss": 0.0217, + "step": 3762 + }, + { + "epoch": 0.83, + "grad_norm": 0.2641650215307287, + "learning_rate": 3.0763388640924698e-06, + "loss": 0.032, + "step": 3763 + }, + { + "epoch": 0.83, + "grad_norm": 0.24072832019634527, + "learning_rate": 3.068761033973087e-06, + "loss": 0.0305, + "step": 3764 + }, + { + "epoch": 0.83, + "grad_norm": 0.27969197690566716, + "learning_rate": 3.0611917728575347e-06, + "loss": 0.0324, + "step": 3765 + }, + { + "epoch": 0.83, + "grad_norm": 0.20708361449276919, + "learning_rate": 3.053631084576667e-06, + "loss": 0.0265, + "step": 3766 + }, + { + "epoch": 0.83, + "grad_norm": 0.2899234297188242, + "learning_rate": 3.046078972956985e-06, + "loss": 0.0331, + "step": 3767 + }, + { + "epoch": 0.83, + "grad_norm": 0.24798203168945265, + "learning_rate": 3.038535441820669e-06, + "loss": 0.0205, + "step": 3768 + }, + { + "epoch": 0.83, + "grad_norm": 0.20767294205972725, + "learning_rate": 3.0310004949855366e-06, + "loss": 0.025, + "step": 3769 + }, + { + "epoch": 0.83, + "grad_norm": 0.21678776764856741, + "learning_rate": 3.0234741362650787e-06, + "loss": 0.0276, + "step": 3770 + }, + { + "epoch": 0.83, + "grad_norm": 0.3020937016231474, + "learning_rate": 3.0159563694684245e-06, + "loss": 0.0505, + "step": 3771 + }, + { + "epoch": 0.83, + "grad_norm": 0.23336752682863882, + "learning_rate": 3.008447198400368e-06, + "loss": 0.0223, + "step": 3772 + }, + { + "epoch": 0.83, + "grad_norm": 0.21756051435717852, + "learning_rate": 3.0009466268613384e-06, + "loss": 0.027, + "step": 3773 + }, + { + "epoch": 0.83, + "grad_norm": 0.2660732974190698, + "learning_rate": 2.9934546586474346e-06, + "loss": 0.0283, + "step": 3774 + }, + { + "epoch": 0.83, + "grad_norm": 0.18405623723241546, + "learning_rate": 2.985971297550374e-06, + "loss": 0.0187, + "step": 3775 + }, + { + "epoch": 0.83, + "grad_norm": 0.21215769391278613, + "learning_rate": 2.9784965473575434e-06, + "loss": 0.0231, + "step": 3776 + }, + { + "epoch": 0.83, + "grad_norm": 0.19297202644192088, + "learning_rate": 2.9710304118519473e-06, + "loss": 0.0198, + "step": 3777 + }, + { + "epoch": 0.83, + "grad_norm": 0.2579171806283081, + "learning_rate": 2.9635728948122542e-06, + "loss": 0.023, + "step": 3778 + }, + { + "epoch": 0.83, + "grad_norm": 0.28781569138454693, + "learning_rate": 2.95612400001275e-06, + "loss": 0.0362, + "step": 3779 + }, + { + "epoch": 0.83, + "grad_norm": 0.24115771119516768, + "learning_rate": 2.9486837312233742e-06, + "loss": 0.0222, + "step": 3780 + }, + { + "epoch": 0.83, + "grad_norm": 0.26588425886678607, + "learning_rate": 2.9412520922096834e-06, + "loss": 0.0305, + "step": 3781 + }, + { + "epoch": 0.83, + "grad_norm": 0.25585837036401493, + "learning_rate": 2.93382908673288e-06, + "loss": 0.0304, + "step": 3782 + }, + { + "epoch": 0.83, + "grad_norm": 0.3162272853760593, + "learning_rate": 2.926414718549797e-06, + "loss": 0.0363, + "step": 3783 + }, + { + "epoch": 0.83, + "grad_norm": 0.2875859075196345, + "learning_rate": 2.9190089914128837e-06, + "loss": 0.0312, + "step": 3784 + }, + { + "epoch": 0.83, + "grad_norm": 0.2679100483916866, + "learning_rate": 2.911611909070229e-06, + "loss": 0.0248, + "step": 3785 + }, + { + "epoch": 0.83, + "grad_norm": 0.23112737974506606, + "learning_rate": 2.9042234752655417e-06, + "loss": 0.0205, + "step": 3786 + }, + { + "epoch": 0.83, + "grad_norm": 0.20902051345813139, + "learning_rate": 2.8968436937381515e-06, + "loss": 0.024, + "step": 3787 + }, + { + "epoch": 0.83, + "grad_norm": 0.28679984138719145, + "learning_rate": 2.889472568223015e-06, + "loss": 0.0317, + "step": 3788 + }, + { + "epoch": 0.83, + "grad_norm": 0.38947517921805536, + "learning_rate": 2.8821101024506947e-06, + "loss": 0.0397, + "step": 3789 + }, + { + "epoch": 0.83, + "grad_norm": 0.2625566137662427, + "learning_rate": 2.874756300147388e-06, + "loss": 0.0259, + "step": 3790 + }, + { + "epoch": 0.83, + "grad_norm": 0.3466975462705927, + "learning_rate": 2.867411165034901e-06, + "loss": 0.0433, + "step": 3791 + }, + { + "epoch": 0.83, + "grad_norm": 0.30872740446645297, + "learning_rate": 2.8600747008306417e-06, + "loss": 0.0287, + "step": 3792 + }, + { + "epoch": 0.83, + "grad_norm": 0.24879147066498217, + "learning_rate": 2.8527469112476524e-06, + "loss": 0.0271, + "step": 3793 + }, + { + "epoch": 0.83, + "grad_norm": 0.22946090485031406, + "learning_rate": 2.8454277999945603e-06, + "loss": 0.0186, + "step": 3794 + }, + { + "epoch": 0.83, + "grad_norm": 0.3252357251599501, + "learning_rate": 2.8381173707756214e-06, + "loss": 0.0337, + "step": 3795 + }, + { + "epoch": 0.83, + "grad_norm": 0.22712515976584763, + "learning_rate": 2.8308156272906794e-06, + "loss": 0.0229, + "step": 3796 + }, + { + "epoch": 0.83, + "grad_norm": 0.20611607509466556, + "learning_rate": 2.8235225732352043e-06, + "loss": 0.031, + "step": 3797 + }, + { + "epoch": 0.83, + "grad_norm": 0.2323427626660013, + "learning_rate": 2.8162382123002418e-06, + "loss": 0.0179, + "step": 3798 + }, + { + "epoch": 0.83, + "grad_norm": 0.2634233928672666, + "learning_rate": 2.8089625481724604e-06, + "loss": 0.027, + "step": 3799 + }, + { + "epoch": 0.83, + "grad_norm": 0.23105383007828828, + "learning_rate": 2.8016955845341143e-06, + "loss": 0.0235, + "step": 3800 + }, + { + "epoch": 0.83, + "grad_norm": 0.2677735359976356, + "learning_rate": 2.794437325063064e-06, + "loss": 0.0227, + "step": 3801 + }, + { + "epoch": 0.84, + "grad_norm": 0.24656583525090428, + "learning_rate": 2.7871877734327514e-06, + "loss": 0.0297, + "step": 3802 + }, + { + "epoch": 0.84, + "grad_norm": 0.23939083838791006, + "learning_rate": 2.7799469333122275e-06, + "loss": 0.0255, + "step": 3803 + }, + { + "epoch": 0.84, + "grad_norm": 0.24008755733334192, + "learning_rate": 2.772714808366115e-06, + "loss": 0.0316, + "step": 3804 + }, + { + "epoch": 0.84, + "grad_norm": 0.2231639043966675, + "learning_rate": 2.76549140225465e-06, + "loss": 0.0303, + "step": 3805 + }, + { + "epoch": 0.84, + "grad_norm": 0.25595756277460696, + "learning_rate": 2.758276718633628e-06, + "loss": 0.027, + "step": 3806 + }, + { + "epoch": 0.84, + "grad_norm": 0.211849651128429, + "learning_rate": 2.751070761154453e-06, + "loss": 0.0201, + "step": 3807 + }, + { + "epoch": 0.84, + "grad_norm": 0.26129634418740444, + "learning_rate": 2.743873533464105e-06, + "loss": 0.0265, + "step": 3808 + }, + { + "epoch": 0.84, + "grad_norm": 0.24285469137835775, + "learning_rate": 2.7366850392051468e-06, + "loss": 0.0235, + "step": 3809 + }, + { + "epoch": 0.84, + "grad_norm": 0.28917501167290083, + "learning_rate": 2.7295052820157097e-06, + "loss": 0.029, + "step": 3810 + }, + { + "epoch": 0.84, + "grad_norm": 0.277645363473503, + "learning_rate": 2.722334265529527e-06, + "loss": 0.0318, + "step": 3811 + }, + { + "epoch": 0.84, + "grad_norm": 0.2101197108159598, + "learning_rate": 2.715171993375878e-06, + "loss": 0.0219, + "step": 3812 + }, + { + "epoch": 0.84, + "grad_norm": 0.3028963455553397, + "learning_rate": 2.7080184691796474e-06, + "loss": 0.0292, + "step": 3813 + }, + { + "epoch": 0.84, + "grad_norm": 0.22889463668631502, + "learning_rate": 2.7008736965612658e-06, + "loss": 0.018, + "step": 3814 + }, + { + "epoch": 0.84, + "grad_norm": 0.24035723576142085, + "learning_rate": 2.6937376791367566e-06, + "loss": 0.0256, + "step": 3815 + }, + { + "epoch": 0.84, + "grad_norm": 0.2424393006312234, + "learning_rate": 2.6866104205176925e-06, + "loss": 0.0333, + "step": 3816 + }, + { + "epoch": 0.84, + "grad_norm": 0.24892566481864783, + "learning_rate": 2.679491924311226e-06, + "loss": 0.0237, + "step": 3817 + }, + { + "epoch": 0.84, + "grad_norm": 0.2524314302875021, + "learning_rate": 2.67238219412008e-06, + "loss": 0.0269, + "step": 3818 + }, + { + "epoch": 0.84, + "grad_norm": 0.3008899987543034, + "learning_rate": 2.6652812335425184e-06, + "loss": 0.0384, + "step": 3819 + }, + { + "epoch": 0.84, + "grad_norm": 0.2663023530727557, + "learning_rate": 2.6581890461723925e-06, + "loss": 0.0364, + "step": 3820 + }, + { + "epoch": 0.84, + "grad_norm": 0.23916030765402485, + "learning_rate": 2.651105635599094e-06, + "loss": 0.0288, + "step": 3821 + }, + { + "epoch": 0.84, + "grad_norm": 0.314366417534471, + "learning_rate": 2.6440310054075877e-06, + "loss": 0.0396, + "step": 3822 + }, + { + "epoch": 0.84, + "grad_norm": 0.19393635409315052, + "learning_rate": 2.6369651591783774e-06, + "loss": 0.0222, + "step": 3823 + }, + { + "epoch": 0.84, + "grad_norm": 0.2970231715412625, + "learning_rate": 2.629908100487544e-06, + "loss": 0.0367, + "step": 3824 + }, + { + "epoch": 0.84, + "grad_norm": 0.2730624897084373, + "learning_rate": 2.6228598329066902e-06, + "loss": 0.0329, + "step": 3825 + }, + { + "epoch": 0.84, + "grad_norm": 0.2221304474051573, + "learning_rate": 2.6158203600030076e-06, + "loss": 0.0208, + "step": 3826 + }, + { + "epoch": 0.84, + "grad_norm": 0.21295674198523606, + "learning_rate": 2.6087896853392037e-06, + "loss": 0.0213, + "step": 3827 + }, + { + "epoch": 0.84, + "grad_norm": 0.2634736057395541, + "learning_rate": 2.6017678124735545e-06, + "loss": 0.0249, + "step": 3828 + }, + { + "epoch": 0.84, + "grad_norm": 0.2801856325424695, + "learning_rate": 2.594754744959862e-06, + "loss": 0.0331, + "step": 3829 + }, + { + "epoch": 0.84, + "grad_norm": 0.38273736644602885, + "learning_rate": 2.5877504863474933e-06, + "loss": 0.0441, + "step": 3830 + }, + { + "epoch": 0.84, + "grad_norm": 0.23987318038548458, + "learning_rate": 2.58075504018134e-06, + "loss": 0.0247, + "step": 3831 + }, + { + "epoch": 0.84, + "grad_norm": 0.3165671210408318, + "learning_rate": 2.5737684100018446e-06, + "loss": 0.0504, + "step": 3832 + }, + { + "epoch": 0.84, + "grad_norm": 0.26761475194029594, + "learning_rate": 2.566790599344973e-06, + "loss": 0.0264, + "step": 3833 + }, + { + "epoch": 0.84, + "grad_norm": 0.28997572133362387, + "learning_rate": 2.5598216117422547e-06, + "loss": 0.0292, + "step": 3834 + }, + { + "epoch": 0.84, + "grad_norm": 0.23359317715679853, + "learning_rate": 2.552861450720725e-06, + "loss": 0.0239, + "step": 3835 + }, + { + "epoch": 0.84, + "grad_norm": 0.225136801073581, + "learning_rate": 2.5459101198029724e-06, + "loss": 0.0188, + "step": 3836 + }, + { + "epoch": 0.84, + "grad_norm": 0.2199612868449696, + "learning_rate": 2.538967622507098e-06, + "loss": 0.0228, + "step": 3837 + }, + { + "epoch": 0.84, + "grad_norm": 0.263409052655707, + "learning_rate": 2.532033962346754e-06, + "loss": 0.0244, + "step": 3838 + }, + { + "epoch": 0.84, + "grad_norm": 0.23126387425634926, + "learning_rate": 2.525109142831095e-06, + "loss": 0.0264, + "step": 3839 + }, + { + "epoch": 0.84, + "grad_norm": 0.22309392442868756, + "learning_rate": 2.5181931674648265e-06, + "loss": 0.022, + "step": 3840 + }, + { + "epoch": 0.84, + "grad_norm": 0.24698059805352435, + "learning_rate": 2.5112860397481553e-06, + "loss": 0.0276, + "step": 3841 + }, + { + "epoch": 0.84, + "grad_norm": 0.4268208681918181, + "learning_rate": 2.50438776317683e-06, + "loss": 0.0494, + "step": 3842 + }, + { + "epoch": 0.84, + "grad_norm": 0.2269383387911252, + "learning_rate": 2.497498341242104e-06, + "loss": 0.0291, + "step": 3843 + }, + { + "epoch": 0.84, + "grad_norm": 0.25532006069273766, + "learning_rate": 2.490617777430766e-06, + "loss": 0.0289, + "step": 3844 + }, + { + "epoch": 0.84, + "grad_norm": 0.19921207416793138, + "learning_rate": 2.4837460752251e-06, + "loss": 0.0219, + "step": 3845 + }, + { + "epoch": 0.84, + "grad_norm": 0.23851431720373856, + "learning_rate": 2.476883238102925e-06, + "loss": 0.0269, + "step": 3846 + }, + { + "epoch": 0.84, + "grad_norm": 0.28687080540218757, + "learning_rate": 2.4700292695375596e-06, + "loss": 0.0336, + "step": 3847 + }, + { + "epoch": 0.85, + "grad_norm": 0.2877835383842551, + "learning_rate": 2.4631841729978435e-06, + "loss": 0.0386, + "step": 3848 + }, + { + "epoch": 0.85, + "grad_norm": 0.21595209508503585, + "learning_rate": 2.456347951948115e-06, + "loss": 0.027, + "step": 3849 + }, + { + "epoch": 0.85, + "grad_norm": 0.25590705427331534, + "learning_rate": 2.449520609848237e-06, + "loss": 0.0285, + "step": 3850 + }, + { + "epoch": 0.85, + "grad_norm": 0.2914738983095523, + "learning_rate": 2.442702150153562e-06, + "loss": 0.0302, + "step": 3851 + }, + { + "epoch": 0.85, + "grad_norm": 0.22516376244575387, + "learning_rate": 2.4358925763149557e-06, + "loss": 0.0266, + "step": 3852 + }, + { + "epoch": 0.85, + "grad_norm": 0.2483287171253741, + "learning_rate": 2.4290918917787876e-06, + "loss": 0.0265, + "step": 3853 + }, + { + "epoch": 0.85, + "grad_norm": 0.32799978364742516, + "learning_rate": 2.4223000999869227e-06, + "loss": 0.0379, + "step": 3854 + }, + { + "epoch": 0.85, + "grad_norm": 0.21206941491648676, + "learning_rate": 2.4155172043767337e-06, + "loss": 0.0253, + "step": 3855 + }, + { + "epoch": 0.85, + "grad_norm": 0.2549331586084214, + "learning_rate": 2.4087432083810792e-06, + "loss": 0.0241, + "step": 3856 + }, + { + "epoch": 0.85, + "grad_norm": 0.26321166106479543, + "learning_rate": 2.401978115428325e-06, + "loss": 0.0232, + "step": 3857 + }, + { + "epoch": 0.85, + "grad_norm": 0.2478637469868672, + "learning_rate": 2.395221928942322e-06, + "loss": 0.029, + "step": 3858 + }, + { + "epoch": 0.85, + "grad_norm": 0.25795024024458996, + "learning_rate": 2.388474652342416e-06, + "loss": 0.0296, + "step": 3859 + }, + { + "epoch": 0.85, + "grad_norm": 0.2210709536496926, + "learning_rate": 2.3817362890434526e-06, + "loss": 0.0384, + "step": 3860 + }, + { + "epoch": 0.85, + "grad_norm": 0.27839401340087205, + "learning_rate": 2.375006842455756e-06, + "loss": 0.0324, + "step": 3861 + }, + { + "epoch": 0.85, + "grad_norm": 0.29559086088363035, + "learning_rate": 2.3682863159851377e-06, + "loss": 0.0322, + "step": 3862 + }, + { + "epoch": 0.85, + "grad_norm": 0.258958163884098, + "learning_rate": 2.3615747130329013e-06, + "loss": 0.045, + "step": 3863 + }, + { + "epoch": 0.85, + "grad_norm": 0.23485219378085304, + "learning_rate": 2.3548720369958256e-06, + "loss": 0.0288, + "step": 3864 + }, + { + "epoch": 0.85, + "grad_norm": 0.2157679964756096, + "learning_rate": 2.3481782912661788e-06, + "loss": 0.0228, + "step": 3865 + }, + { + "epoch": 0.85, + "grad_norm": 0.3899911506272396, + "learning_rate": 2.3414934792317047e-06, + "loss": 0.0413, + "step": 3866 + }, + { + "epoch": 0.85, + "grad_norm": 0.2272902507691261, + "learning_rate": 2.334817604275632e-06, + "loss": 0.0268, + "step": 3867 + }, + { + "epoch": 0.85, + "grad_norm": 0.22339759350247212, + "learning_rate": 2.3281506697766522e-06, + "loss": 0.0252, + "step": 3868 + }, + { + "epoch": 0.85, + "grad_norm": 0.2745319398362205, + "learning_rate": 2.3214926791089563e-06, + "loss": 0.0263, + "step": 3869 + }, + { + "epoch": 0.85, + "grad_norm": 0.3004511222885687, + "learning_rate": 2.3148436356421813e-06, + "loss": 0.0405, + "step": 3870 + }, + { + "epoch": 0.85, + "grad_norm": 0.18937862543847042, + "learning_rate": 2.3082035427414585e-06, + "loss": 0.0212, + "step": 3871 + }, + { + "epoch": 0.85, + "grad_norm": 0.2781247439008241, + "learning_rate": 2.301572403767369e-06, + "loss": 0.028, + "step": 3872 + }, + { + "epoch": 0.85, + "grad_norm": 0.25762808590872704, + "learning_rate": 2.2949502220759866e-06, + "loss": 0.0247, + "step": 3873 + }, + { + "epoch": 0.85, + "grad_norm": 0.2648198885000602, + "learning_rate": 2.2883370010188232e-06, + "loss": 0.0364, + "step": 3874 + }, + { + "epoch": 0.85, + "grad_norm": 0.22559464012633032, + "learning_rate": 2.2817327439428836e-06, + "loss": 0.0267, + "step": 3875 + }, + { + "epoch": 0.85, + "grad_norm": 0.2344780456991246, + "learning_rate": 2.2751374541906122e-06, + "loss": 0.0327, + "step": 3876 + }, + { + "epoch": 0.85, + "grad_norm": 0.22897060453096788, + "learning_rate": 2.26855113509993e-06, + "loss": 0.0278, + "step": 3877 + }, + { + "epoch": 0.85, + "grad_norm": 0.23947667601923092, + "learning_rate": 2.261973790004217e-06, + "loss": 0.0281, + "step": 3878 + }, + { + "epoch": 0.85, + "grad_norm": 0.2541858893822508, + "learning_rate": 2.2554054222323018e-06, + "loss": 0.0265, + "step": 3879 + }, + { + "epoch": 0.85, + "grad_norm": 0.2625970621268977, + "learning_rate": 2.2488460351084827e-06, + "loss": 0.0302, + "step": 3880 + }, + { + "epoch": 0.85, + "grad_norm": 0.23400592354466196, + "learning_rate": 2.242295631952496e-06, + "loss": 0.0231, + "step": 3881 + }, + { + "epoch": 0.85, + "grad_norm": 0.24455926478727372, + "learning_rate": 2.235754216079551e-06, + "loss": 0.0295, + "step": 3882 + }, + { + "epoch": 0.85, + "grad_norm": 0.29506134373317705, + "learning_rate": 2.229221790800291e-06, + "loss": 0.0319, + "step": 3883 + }, + { + "epoch": 0.85, + "grad_norm": 0.25740314988611324, + "learning_rate": 2.2226983594208187e-06, + "loss": 0.0342, + "step": 3884 + }, + { + "epoch": 0.85, + "grad_norm": 0.32531390161423046, + "learning_rate": 2.216183925242681e-06, + "loss": 0.0268, + "step": 3885 + }, + { + "epoch": 0.85, + "grad_norm": 0.25434132493678374, + "learning_rate": 2.209678491562881e-06, + "loss": 0.0227, + "step": 3886 + }, + { + "epoch": 0.85, + "grad_norm": 0.2397500094154205, + "learning_rate": 2.2031820616738477e-06, + "loss": 0.0202, + "step": 3887 + }, + { + "epoch": 0.85, + "grad_norm": 0.2139124016890787, + "learning_rate": 2.1966946388634746e-06, + "loss": 0.0291, + "step": 3888 + }, + { + "epoch": 0.85, + "grad_norm": 0.2131140088158562, + "learning_rate": 2.190216226415074e-06, + "loss": 0.0207, + "step": 3889 + }, + { + "epoch": 0.85, + "grad_norm": 0.2961527295114186, + "learning_rate": 2.1837468276074227e-06, + "loss": 0.0418, + "step": 3890 + }, + { + "epoch": 0.85, + "grad_norm": 0.23153809405938847, + "learning_rate": 2.1772864457147126e-06, + "loss": 0.0268, + "step": 3891 + }, + { + "epoch": 0.85, + "grad_norm": 0.28574910654176955, + "learning_rate": 2.1708350840065927e-06, + "loss": 0.0344, + "step": 3892 + }, + { + "epoch": 0.86, + "grad_norm": 0.23484012227863682, + "learning_rate": 2.164392745748125e-06, + "loss": 0.0257, + "step": 3893 + }, + { + "epoch": 0.86, + "grad_norm": 0.23499878683943798, + "learning_rate": 2.1579594341998235e-06, + "loss": 0.0337, + "step": 3894 + }, + { + "epoch": 0.86, + "grad_norm": 0.2245127594514266, + "learning_rate": 2.151535152617625e-06, + "loss": 0.0246, + "step": 3895 + }, + { + "epoch": 0.86, + "grad_norm": 0.22720426068665847, + "learning_rate": 2.1451199042529035e-06, + "loss": 0.0211, + "step": 3896 + }, + { + "epoch": 0.86, + "grad_norm": 0.17705131105658264, + "learning_rate": 2.1387136923524475e-06, + "loss": 0.017, + "step": 3897 + }, + { + "epoch": 0.86, + "grad_norm": 0.25607005756259693, + "learning_rate": 2.1323165201584863e-06, + "loss": 0.0269, + "step": 3898 + }, + { + "epoch": 0.86, + "grad_norm": 0.19983322148628807, + "learning_rate": 2.125928390908658e-06, + "loss": 0.023, + "step": 3899 + }, + { + "epoch": 0.86, + "grad_norm": 0.2594664579030888, + "learning_rate": 2.1195493078360486e-06, + "loss": 0.0205, + "step": 3900 + }, + { + "epoch": 0.86, + "grad_norm": 0.2109651459131922, + "learning_rate": 2.113179274169137e-06, + "loss": 0.0227, + "step": 3901 + }, + { + "epoch": 0.86, + "grad_norm": 0.24432628438712112, + "learning_rate": 2.1068182931318424e-06, + "loss": 0.0229, + "step": 3902 + }, + { + "epoch": 0.86, + "grad_norm": 0.19328943586785619, + "learning_rate": 2.1004663679434987e-06, + "loss": 0.0148, + "step": 3903 + }, + { + "epoch": 0.86, + "grad_norm": 0.22900019726057594, + "learning_rate": 2.0941235018188543e-06, + "loss": 0.0318, + "step": 3904 + }, + { + "epoch": 0.86, + "grad_norm": 0.27477682985090945, + "learning_rate": 2.0877896979680654e-06, + "loss": 0.0248, + "step": 3905 + }, + { + "epoch": 0.86, + "grad_norm": 0.26366659088491357, + "learning_rate": 2.0814649595967194e-06, + "loss": 0.0333, + "step": 3906 + }, + { + "epoch": 0.86, + "grad_norm": 0.21312036279908078, + "learning_rate": 2.0751492899057957e-06, + "loss": 0.0204, + "step": 3907 + }, + { + "epoch": 0.86, + "grad_norm": 0.20255649311395135, + "learning_rate": 2.0688426920916992e-06, + "loss": 0.0229, + "step": 3908 + }, + { + "epoch": 0.86, + "grad_norm": 0.29645066302079465, + "learning_rate": 2.062545169346235e-06, + "loss": 0.0228, + "step": 3909 + }, + { + "epoch": 0.86, + "grad_norm": 0.29417591617827665, + "learning_rate": 2.05625672485662e-06, + "loss": 0.032, + "step": 3910 + }, + { + "epoch": 0.86, + "grad_norm": 0.2254233331599424, + "learning_rate": 2.049977361805471e-06, + "loss": 0.0238, + "step": 3911 + }, + { + "epoch": 0.86, + "grad_norm": 0.28845767590186433, + "learning_rate": 2.043707083370814e-06, + "loss": 0.0301, + "step": 3912 + }, + { + "epoch": 0.86, + "grad_norm": 0.21304851766358, + "learning_rate": 2.03744589272608e-06, + "loss": 0.0307, + "step": 3913 + }, + { + "epoch": 0.86, + "grad_norm": 0.3292309877269268, + "learning_rate": 2.031193793040087e-06, + "loss": 0.0344, + "step": 3914 + }, + { + "epoch": 0.86, + "grad_norm": 0.20086938838265608, + "learning_rate": 2.0249507874770714e-06, + "loss": 0.0248, + "step": 3915 + }, + { + "epoch": 0.86, + "grad_norm": 0.2349814278411253, + "learning_rate": 2.018716879196645e-06, + "loss": 0.0286, + "step": 3916 + }, + { + "epoch": 0.86, + "grad_norm": 0.3030606388311023, + "learning_rate": 2.0124920713538378e-06, + "loss": 0.0334, + "step": 3917 + }, + { + "epoch": 0.86, + "grad_norm": 0.2677680128398089, + "learning_rate": 2.006276367099054e-06, + "loss": 0.0301, + "step": 3918 + }, + { + "epoch": 0.86, + "grad_norm": 0.18866716146011084, + "learning_rate": 2.000069769578108e-06, + "loss": 0.02, + "step": 3919 + }, + { + "epoch": 0.86, + "grad_norm": 0.2781445132064626, + "learning_rate": 1.9938722819321854e-06, + "loss": 0.0259, + "step": 3920 + }, + { + "epoch": 0.86, + "grad_norm": 0.21489925534501872, + "learning_rate": 1.987683907297888e-06, + "loss": 0.0226, + "step": 3921 + }, + { + "epoch": 0.86, + "grad_norm": 0.22646346887563482, + "learning_rate": 1.9815046488071774e-06, + "loss": 0.0239, + "step": 3922 + }, + { + "epoch": 0.86, + "grad_norm": 0.2401493666437388, + "learning_rate": 1.9753345095874234e-06, + "loss": 0.0238, + "step": 3923 + }, + { + "epoch": 0.86, + "grad_norm": 0.29931022052689477, + "learning_rate": 1.9691734927613625e-06, + "loss": 0.0297, + "step": 3924 + }, + { + "epoch": 0.86, + "grad_norm": 0.22429214133725534, + "learning_rate": 1.9630216014471326e-06, + "loss": 0.023, + "step": 3925 + }, + { + "epoch": 0.86, + "grad_norm": 0.2424895187698589, + "learning_rate": 1.9568788387582338e-06, + "loss": 0.0251, + "step": 3926 + }, + { + "epoch": 0.86, + "grad_norm": 0.2084886071605321, + "learning_rate": 1.950745207803566e-06, + "loss": 0.0137, + "step": 3927 + }, + { + "epoch": 0.86, + "grad_norm": 0.26072221315148936, + "learning_rate": 1.9446207116873815e-06, + "loss": 0.0333, + "step": 3928 + }, + { + "epoch": 0.86, + "grad_norm": 0.178197163888115, + "learning_rate": 1.9385053535093455e-06, + "loss": 0.0212, + "step": 3929 + }, + { + "epoch": 0.86, + "grad_norm": 0.2847289973644325, + "learning_rate": 1.9323991363644645e-06, + "loss": 0.0288, + "step": 3930 + }, + { + "epoch": 0.86, + "grad_norm": 0.27226567535181007, + "learning_rate": 1.9263020633431416e-06, + "loss": 0.0214, + "step": 3931 + }, + { + "epoch": 0.86, + "grad_norm": 0.31075256061803264, + "learning_rate": 1.9202141375311335e-06, + "loss": 0.0388, + "step": 3932 + }, + { + "epoch": 0.86, + "grad_norm": 0.2823310579403776, + "learning_rate": 1.9141353620095835e-06, + "loss": 0.0372, + "step": 3933 + }, + { + "epoch": 0.86, + "grad_norm": 0.18758217679707598, + "learning_rate": 1.9080657398549916e-06, + "loss": 0.0209, + "step": 3934 + }, + { + "epoch": 0.86, + "grad_norm": 0.19805705065045512, + "learning_rate": 1.902005274139238e-06, + "loss": 0.02, + "step": 3935 + }, + { + "epoch": 0.86, + "grad_norm": 0.23035887345797168, + "learning_rate": 1.8959539679295536e-06, + "loss": 0.0225, + "step": 3936 + }, + { + "epoch": 0.86, + "grad_norm": 0.22014119339755225, + "learning_rate": 1.8899118242885462e-06, + "loss": 0.0318, + "step": 3937 + }, + { + "epoch": 0.86, + "grad_norm": 0.230950296961087, + "learning_rate": 1.8838788462741852e-06, + "loss": 0.024, + "step": 3938 + }, + { + "epoch": 0.87, + "grad_norm": 0.24027613145127033, + "learning_rate": 1.8778550369397886e-06, + "loss": 0.0231, + "step": 3939 + }, + { + "epoch": 0.87, + "grad_norm": 0.27697689347173, + "learning_rate": 1.8718403993340528e-06, + "loss": 0.024, + "step": 3940 + }, + { + "epoch": 0.87, + "grad_norm": 0.2174718847409269, + "learning_rate": 1.865834936501012e-06, + "loss": 0.0245, + "step": 3941 + }, + { + "epoch": 0.87, + "grad_norm": 0.19060331401042857, + "learning_rate": 1.8598386514800793e-06, + "loss": 0.0176, + "step": 3942 + }, + { + "epoch": 0.87, + "grad_norm": 0.3065493380980718, + "learning_rate": 1.8538515473060026e-06, + "loss": 0.0351, + "step": 3943 + }, + { + "epoch": 0.87, + "grad_norm": 0.2501655715787742, + "learning_rate": 1.847873627008896e-06, + "loss": 0.0307, + "step": 3944 + }, + { + "epoch": 0.87, + "grad_norm": 0.22951989513899249, + "learning_rate": 1.8419048936142191e-06, + "loss": 0.0315, + "step": 3945 + }, + { + "epoch": 0.87, + "grad_norm": 0.21929841608063866, + "learning_rate": 1.8359453501427916e-06, + "loss": 0.0228, + "step": 3946 + }, + { + "epoch": 0.87, + "grad_norm": 0.2332128800695291, + "learning_rate": 1.8299949996107646e-06, + "loss": 0.0281, + "step": 3947 + }, + { + "epoch": 0.87, + "grad_norm": 0.19833979529150592, + "learning_rate": 1.8240538450296563e-06, + "loss": 0.026, + "step": 3948 + }, + { + "epoch": 0.87, + "grad_norm": 0.2472978983664071, + "learning_rate": 1.8181218894063146e-06, + "loss": 0.0397, + "step": 3949 + }, + { + "epoch": 0.87, + "grad_norm": 0.2029199757606438, + "learning_rate": 1.8121991357429425e-06, + "loss": 0.0159, + "step": 3950 + }, + { + "epoch": 0.87, + "grad_norm": 0.24460092258866165, + "learning_rate": 1.8062855870370798e-06, + "loss": 0.0306, + "step": 3951 + }, + { + "epoch": 0.87, + "grad_norm": 0.19939599847379869, + "learning_rate": 1.8003812462816127e-06, + "loss": 0.0234, + "step": 3952 + }, + { + "epoch": 0.87, + "grad_norm": 0.27247723493064596, + "learning_rate": 1.7944861164647576e-06, + "loss": 0.0372, + "step": 3953 + }, + { + "epoch": 0.87, + "grad_norm": 0.22841116971338493, + "learning_rate": 1.788600200570083e-06, + "loss": 0.0255, + "step": 3954 + }, + { + "epoch": 0.87, + "grad_norm": 0.2392800145649786, + "learning_rate": 1.782723501576482e-06, + "loss": 0.0317, + "step": 3955 + }, + { + "epoch": 0.87, + "grad_norm": 0.25418792652631794, + "learning_rate": 1.7768560224581955e-06, + "loss": 0.0335, + "step": 3956 + }, + { + "epoch": 0.87, + "grad_norm": 0.21703919623420148, + "learning_rate": 1.770997766184781e-06, + "loss": 0.0245, + "step": 3957 + }, + { + "epoch": 0.87, + "grad_norm": 0.16896704715874653, + "learning_rate": 1.7651487357211472e-06, + "loss": 0.0108, + "step": 3958 + }, + { + "epoch": 0.87, + "grad_norm": 0.2984374241054898, + "learning_rate": 1.7593089340275149e-06, + "loss": 0.0298, + "step": 3959 + }, + { + "epoch": 0.87, + "grad_norm": 0.22443900073523565, + "learning_rate": 1.7534783640594533e-06, + "loss": 0.0202, + "step": 3960 + }, + { + "epoch": 0.87, + "grad_norm": 0.20475501992519377, + "learning_rate": 1.7476570287678396e-06, + "loss": 0.0191, + "step": 3961 + }, + { + "epoch": 0.87, + "grad_norm": 0.20974471025253075, + "learning_rate": 1.741844931098895e-06, + "loss": 0.0205, + "step": 3962 + }, + { + "epoch": 0.87, + "grad_norm": 0.2384955974233491, + "learning_rate": 1.7360420739941486e-06, + "loss": 0.0263, + "step": 3963 + }, + { + "epoch": 0.87, + "grad_norm": 0.22069584994858502, + "learning_rate": 1.7302484603904756e-06, + "loss": 0.0314, + "step": 3964 + }, + { + "epoch": 0.87, + "grad_norm": 0.19494000199605882, + "learning_rate": 1.7244640932200484e-06, + "loss": 0.0234, + "step": 3965 + }, + { + "epoch": 0.87, + "grad_norm": 0.2347603887776966, + "learning_rate": 1.7186889754103763e-06, + "loss": 0.0297, + "step": 3966 + }, + { + "epoch": 0.87, + "grad_norm": 0.23574376172896755, + "learning_rate": 1.7129231098842791e-06, + "loss": 0.0297, + "step": 3967 + }, + { + "epoch": 0.87, + "grad_norm": 0.26678069527800613, + "learning_rate": 1.707166499559898e-06, + "loss": 0.0412, + "step": 3968 + }, + { + "epoch": 0.87, + "grad_norm": 0.25157095142644803, + "learning_rate": 1.701419147350687e-06, + "loss": 0.026, + "step": 3969 + }, + { + "epoch": 0.87, + "grad_norm": 0.24909355821890453, + "learning_rate": 1.6956810561654213e-06, + "loss": 0.0236, + "step": 3970 + }, + { + "epoch": 0.87, + "grad_norm": 0.2158988964917277, + "learning_rate": 1.6899522289081737e-06, + "loss": 0.0178, + "step": 3971 + }, + { + "epoch": 0.87, + "grad_norm": 0.2413588622411202, + "learning_rate": 1.6842326684783473e-06, + "loss": 0.0203, + "step": 3972 + }, + { + "epoch": 0.87, + "grad_norm": 0.2026649179715184, + "learning_rate": 1.6785223777706482e-06, + "loss": 0.0254, + "step": 3973 + }, + { + "epoch": 0.87, + "grad_norm": 0.33542930621663797, + "learning_rate": 1.6728213596750831e-06, + "loss": 0.0343, + "step": 3974 + }, + { + "epoch": 0.87, + "grad_norm": 0.26128103176874784, + "learning_rate": 1.667129617076977e-06, + "loss": 0.0356, + "step": 3975 + }, + { + "epoch": 0.87, + "grad_norm": 0.25916595365103207, + "learning_rate": 1.661447152856952e-06, + "loss": 0.029, + "step": 3976 + }, + { + "epoch": 0.87, + "grad_norm": 0.21586879341047518, + "learning_rate": 1.6557739698909436e-06, + "loss": 0.0251, + "step": 3977 + }, + { + "epoch": 0.87, + "grad_norm": 0.24176543898825847, + "learning_rate": 1.650110071050175e-06, + "loss": 0.025, + "step": 3978 + }, + { + "epoch": 0.87, + "grad_norm": 0.25397719345805286, + "learning_rate": 1.6444554592011909e-06, + "loss": 0.0189, + "step": 3979 + }, + { + "epoch": 0.87, + "grad_norm": 0.19706409849080536, + "learning_rate": 1.638810137205813e-06, + "loss": 0.0212, + "step": 3980 + }, + { + "epoch": 0.87, + "grad_norm": 0.24386950888774606, + "learning_rate": 1.6331741079211872e-06, + "loss": 0.0263, + "step": 3981 + }, + { + "epoch": 0.87, + "grad_norm": 0.22030736666982426, + "learning_rate": 1.627547374199734e-06, + "loss": 0.0244, + "step": 3982 + }, + { + "epoch": 0.87, + "grad_norm": 0.2842806738262328, + "learning_rate": 1.6219299388891797e-06, + "loss": 0.0219, + "step": 3983 + }, + { + "epoch": 0.88, + "grad_norm": 0.20590756121999668, + "learning_rate": 1.6163218048325413e-06, + "loss": 0.0238, + "step": 3984 + }, + { + "epoch": 0.88, + "grad_norm": 0.38306856349578233, + "learning_rate": 1.610722974868133e-06, + "loss": 0.0324, + "step": 3985 + }, + { + "epoch": 0.88, + "grad_norm": 0.26639646808642814, + "learning_rate": 1.6051334518295546e-06, + "loss": 0.0246, + "step": 3986 + }, + { + "epoch": 0.88, + "grad_norm": 0.2887242116187527, + "learning_rate": 1.5995532385456992e-06, + "loss": 0.0276, + "step": 3987 + }, + { + "epoch": 0.88, + "grad_norm": 0.33405953392589066, + "learning_rate": 1.5939823378407426e-06, + "loss": 0.0417, + "step": 3988 + }, + { + "epoch": 0.88, + "grad_norm": 0.21000298367378883, + "learning_rate": 1.5884207525341566e-06, + "loss": 0.0158, + "step": 3989 + }, + { + "epoch": 0.88, + "grad_norm": 0.26179161765461234, + "learning_rate": 1.5828684854406918e-06, + "loss": 0.0276, + "step": 3990 + }, + { + "epoch": 0.88, + "grad_norm": 0.28524516300225977, + "learning_rate": 1.577325539370389e-06, + "loss": 0.0297, + "step": 3991 + }, + { + "epoch": 0.88, + "grad_norm": 0.25805574559513955, + "learning_rate": 1.57179191712856e-06, + "loss": 0.0322, + "step": 3992 + }, + { + "epoch": 0.88, + "grad_norm": 0.2801519256785691, + "learning_rate": 1.5662676215158112e-06, + "loss": 0.034, + "step": 3993 + }, + { + "epoch": 0.88, + "grad_norm": 0.2271380103121437, + "learning_rate": 1.5607526553280172e-06, + "loss": 0.0216, + "step": 3994 + }, + { + "epoch": 0.88, + "grad_norm": 0.23207426122064762, + "learning_rate": 1.5552470213563408e-06, + "loss": 0.0264, + "step": 3995 + }, + { + "epoch": 0.88, + "grad_norm": 0.2378034099816618, + "learning_rate": 1.549750722387211e-06, + "loss": 0.0285, + "step": 3996 + }, + { + "epoch": 0.88, + "grad_norm": 0.35806127587094355, + "learning_rate": 1.5442637612023425e-06, + "loss": 0.0507, + "step": 3997 + }, + { + "epoch": 0.88, + "grad_norm": 0.2183148961496767, + "learning_rate": 1.5387861405787252e-06, + "loss": 0.0228, + "step": 3998 + }, + { + "epoch": 0.88, + "grad_norm": 0.2775771945043418, + "learning_rate": 1.5333178632886058e-06, + "loss": 0.0353, + "step": 3999 + }, + { + "epoch": 0.88, + "grad_norm": 0.1723180176519468, + "learning_rate": 1.5278589320995218e-06, + "loss": 0.0178, + "step": 4000 + }, + { + "epoch": 0.88, + "grad_norm": 0.2291028880465536, + "learning_rate": 1.5224093497742654e-06, + "loss": 0.0254, + "step": 4001 + }, + { + "epoch": 0.88, + "grad_norm": 0.21540815603745844, + "learning_rate": 1.5169691190709057e-06, + "loss": 0.0336, + "step": 4002 + }, + { + "epoch": 0.88, + "grad_norm": 0.2567826857678153, + "learning_rate": 1.5115382427427827e-06, + "loss": 0.0345, + "step": 4003 + }, + { + "epoch": 0.88, + "grad_norm": 0.2195455017079472, + "learning_rate": 1.5061167235384867e-06, + "loss": 0.0308, + "step": 4004 + }, + { + "epoch": 0.88, + "grad_norm": 0.25006646524482823, + "learning_rate": 1.5007045642018868e-06, + "loss": 0.0289, + "step": 4005 + }, + { + "epoch": 0.88, + "grad_norm": 0.21073223652648765, + "learning_rate": 1.4953017674721083e-06, + "loss": 0.0253, + "step": 4006 + }, + { + "epoch": 0.88, + "grad_norm": 0.256005805825172, + "learning_rate": 1.4899083360835408e-06, + "loss": 0.0326, + "step": 4007 + }, + { + "epoch": 0.88, + "grad_norm": 0.27258169581314906, + "learning_rate": 1.484524272765837e-06, + "loss": 0.0268, + "step": 4008 + }, + { + "epoch": 0.88, + "grad_norm": 0.2409342982553185, + "learning_rate": 1.479149580243895e-06, + "loss": 0.0274, + "step": 4009 + }, + { + "epoch": 0.88, + "grad_norm": 0.20566295612613625, + "learning_rate": 1.4737842612378894e-06, + "loss": 0.02, + "step": 4010 + }, + { + "epoch": 0.88, + "grad_norm": 0.18890379873346602, + "learning_rate": 1.468428318463233e-06, + "loss": 0.0205, + "step": 4011 + }, + { + "epoch": 0.88, + "grad_norm": 0.20006802862239356, + "learning_rate": 1.4630817546306087e-06, + "loss": 0.0236, + "step": 4012 + }, + { + "epoch": 0.88, + "grad_norm": 0.23546096032008776, + "learning_rate": 1.4577445724459382e-06, + "loss": 0.0379, + "step": 4013 + }, + { + "epoch": 0.88, + "grad_norm": 0.1940759232465805, + "learning_rate": 1.4524167746104034e-06, + "loss": 0.0301, + "step": 4014 + }, + { + "epoch": 0.88, + "grad_norm": 0.2289597930390186, + "learning_rate": 1.4470983638204384e-06, + "loss": 0.0319, + "step": 4015 + }, + { + "epoch": 0.88, + "grad_norm": 0.22939968273863018, + "learning_rate": 1.4417893427677276e-06, + "loss": 0.0298, + "step": 4016 + }, + { + "epoch": 0.88, + "grad_norm": 0.25927656787820885, + "learning_rate": 1.4364897141391888e-06, + "loss": 0.0262, + "step": 4017 + }, + { + "epoch": 0.88, + "grad_norm": 0.24248916542782697, + "learning_rate": 1.4311994806170048e-06, + "loss": 0.0365, + "step": 4018 + }, + { + "epoch": 0.88, + "grad_norm": 0.24264834765102183, + "learning_rate": 1.42591864487859e-06, + "loss": 0.0251, + "step": 4019 + }, + { + "epoch": 0.88, + "grad_norm": 0.23210383541824384, + "learning_rate": 1.4206472095966107e-06, + "loss": 0.029, + "step": 4020 + }, + { + "epoch": 0.88, + "grad_norm": 0.2245166089428411, + "learning_rate": 1.4153851774389703e-06, + "loss": 0.0233, + "step": 4021 + }, + { + "epoch": 0.88, + "grad_norm": 0.22277273510764165, + "learning_rate": 1.4101325510688192e-06, + "loss": 0.0209, + "step": 4022 + }, + { + "epoch": 0.88, + "grad_norm": 0.24405514555826774, + "learning_rate": 1.4048893331445367e-06, + "loss": 0.0162, + "step": 4023 + }, + { + "epoch": 0.88, + "grad_norm": 0.21945974468325163, + "learning_rate": 1.3996555263197587e-06, + "loss": 0.0211, + "step": 4024 + }, + { + "epoch": 0.88, + "grad_norm": 0.21372582279354385, + "learning_rate": 1.3944311332433368e-06, + "loss": 0.0276, + "step": 4025 + }, + { + "epoch": 0.88, + "grad_norm": 0.32695586956517514, + "learning_rate": 1.3892161565593743e-06, + "loss": 0.0343, + "step": 4026 + }, + { + "epoch": 0.88, + "grad_norm": 0.22012335659477245, + "learning_rate": 1.3840105989071995e-06, + "loss": 0.0294, + "step": 4027 + }, + { + "epoch": 0.88, + "grad_norm": 0.3275218369942323, + "learning_rate": 1.3788144629213785e-06, + "loss": 0.0367, + "step": 4028 + }, + { + "epoch": 0.88, + "grad_norm": 0.29462431745081913, + "learning_rate": 1.3736277512317076e-06, + "loss": 0.0359, + "step": 4029 + }, + { + "epoch": 0.89, + "grad_norm": 0.2192795508608225, + "learning_rate": 1.3684504664632137e-06, + "loss": 0.0198, + "step": 4030 + }, + { + "epoch": 0.89, + "grad_norm": 0.280980113131807, + "learning_rate": 1.3632826112361497e-06, + "loss": 0.0376, + "step": 4031 + }, + { + "epoch": 0.89, + "grad_norm": 0.2036203982346656, + "learning_rate": 1.3581241881660011e-06, + "loss": 0.0202, + "step": 4032 + }, + { + "epoch": 0.89, + "grad_norm": 0.2446505142993538, + "learning_rate": 1.352975199863482e-06, + "loss": 0.0273, + "step": 4033 + }, + { + "epoch": 0.89, + "grad_norm": 0.3287036992419058, + "learning_rate": 1.3478356489345168e-06, + "loss": 0.0291, + "step": 4034 + }, + { + "epoch": 0.89, + "grad_norm": 0.19690709533655248, + "learning_rate": 1.3427055379802733e-06, + "loss": 0.0292, + "step": 4035 + }, + { + "epoch": 0.89, + "grad_norm": 0.20444550805776926, + "learning_rate": 1.3375848695971239e-06, + "loss": 0.0188, + "step": 4036 + }, + { + "epoch": 0.89, + "grad_norm": 0.20293176152451714, + "learning_rate": 1.3324736463766775e-06, + "loss": 0.0229, + "step": 4037 + }, + { + "epoch": 0.89, + "grad_norm": 0.22411925703638305, + "learning_rate": 1.3273718709057493e-06, + "loss": 0.0255, + "step": 4038 + }, + { + "epoch": 0.89, + "grad_norm": 0.20742693462055836, + "learning_rate": 1.322279545766385e-06, + "loss": 0.0184, + "step": 4039 + }, + { + "epoch": 0.89, + "grad_norm": 0.22124226303366012, + "learning_rate": 1.3171966735358343e-06, + "loss": 0.0238, + "step": 4040 + }, + { + "epoch": 0.89, + "grad_norm": 0.21374453332669846, + "learning_rate": 1.3121232567865793e-06, + "loss": 0.0222, + "step": 4041 + }, + { + "epoch": 0.89, + "grad_norm": 0.2995226151281877, + "learning_rate": 1.3070592980862994e-06, + "loss": 0.0282, + "step": 4042 + }, + { + "epoch": 0.89, + "grad_norm": 0.20906475339624503, + "learning_rate": 1.3020047999979002e-06, + "loss": 0.0268, + "step": 4043 + }, + { + "epoch": 0.89, + "grad_norm": 0.22014655769406827, + "learning_rate": 1.2969597650794907e-06, + "loss": 0.0238, + "step": 4044 + }, + { + "epoch": 0.89, + "grad_norm": 0.22428625333450491, + "learning_rate": 1.2919241958843975e-06, + "loss": 0.0227, + "step": 4045 + }, + { + "epoch": 0.89, + "grad_norm": 0.24069317372127222, + "learning_rate": 1.2868980949611486e-06, + "loss": 0.0284, + "step": 4046 + }, + { + "epoch": 0.89, + "grad_norm": 0.2820574772017969, + "learning_rate": 1.2818814648534895e-06, + "loss": 0.0351, + "step": 4047 + }, + { + "epoch": 0.89, + "grad_norm": 0.24369227630638465, + "learning_rate": 1.2768743081003598e-06, + "loss": 0.0232, + "step": 4048 + }, + { + "epoch": 0.89, + "grad_norm": 0.25864173006102703, + "learning_rate": 1.2718766272359195e-06, + "loss": 0.0289, + "step": 4049 + }, + { + "epoch": 0.89, + "grad_norm": 0.2382527631397271, + "learning_rate": 1.266888424789523e-06, + "loss": 0.0228, + "step": 4050 + }, + { + "epoch": 0.89, + "grad_norm": 0.3481749832938333, + "learning_rate": 1.261909703285733e-06, + "loss": 0.0343, + "step": 4051 + }, + { + "epoch": 0.89, + "grad_norm": 0.22609741882848491, + "learning_rate": 1.2569404652443073e-06, + "loss": 0.0244, + "step": 4052 + }, + { + "epoch": 0.89, + "grad_norm": 0.2095581360449896, + "learning_rate": 1.2519807131802097e-06, + "loss": 0.0287, + "step": 4053 + }, + { + "epoch": 0.89, + "grad_norm": 0.2500651244931791, + "learning_rate": 1.2470304496035968e-06, + "loss": 0.0362, + "step": 4054 + }, + { + "epoch": 0.89, + "grad_norm": 0.22651535398660863, + "learning_rate": 1.2420896770198355e-06, + "loss": 0.0238, + "step": 4055 + }, + { + "epoch": 0.89, + "grad_norm": 0.1854365472547319, + "learning_rate": 1.237158397929472e-06, + "loss": 0.014, + "step": 4056 + }, + { + "epoch": 0.89, + "grad_norm": 0.2610574880424356, + "learning_rate": 1.2322366148282638e-06, + "loss": 0.0227, + "step": 4057 + }, + { + "epoch": 0.89, + "grad_norm": 0.19375172698881735, + "learning_rate": 1.2273243302071513e-06, + "loss": 0.0163, + "step": 4058 + }, + { + "epoch": 0.89, + "grad_norm": 0.21801280222549693, + "learning_rate": 1.2224215465522726e-06, + "loss": 0.0292, + "step": 4059 + }, + { + "epoch": 0.89, + "grad_norm": 0.21858894004737908, + "learning_rate": 1.2175282663449584e-06, + "loss": 0.0311, + "step": 4060 + }, + { + "epoch": 0.89, + "grad_norm": 0.23674585923679517, + "learning_rate": 1.2126444920617297e-06, + "loss": 0.0244, + "step": 4061 + }, + { + "epoch": 0.89, + "grad_norm": 0.24901589774890992, + "learning_rate": 1.2077702261742875e-06, + "loss": 0.0328, + "step": 4062 + }, + { + "epoch": 0.89, + "grad_norm": 0.21182642939812502, + "learning_rate": 1.2029054711495358e-06, + "loss": 0.0235, + "step": 4063 + }, + { + "epoch": 0.89, + "grad_norm": 0.24339609680372667, + "learning_rate": 1.19805022944955e-06, + "loss": 0.0215, + "step": 4064 + }, + { + "epoch": 0.89, + "grad_norm": 0.23009335122623728, + "learning_rate": 1.193204503531602e-06, + "loss": 0.0266, + "step": 4065 + }, + { + "epoch": 0.89, + "grad_norm": 0.21858024822689778, + "learning_rate": 1.1883682958481413e-06, + "loss": 0.0203, + "step": 4066 + }, + { + "epoch": 0.89, + "grad_norm": 0.1924488225475941, + "learning_rate": 1.1835416088468033e-06, + "loss": 0.0201, + "step": 4067 + }, + { + "epoch": 0.89, + "grad_norm": 0.2545215926378505, + "learning_rate": 1.178724444970405e-06, + "loss": 0.0242, + "step": 4068 + }, + { + "epoch": 0.89, + "grad_norm": 0.22614522029108927, + "learning_rate": 1.1739168066569406e-06, + "loss": 0.0203, + "step": 4069 + }, + { + "epoch": 0.89, + "grad_norm": 0.23238722759341868, + "learning_rate": 1.1691186963395861e-06, + "loss": 0.0219, + "step": 4070 + }, + { + "epoch": 0.89, + "grad_norm": 0.18852350905397555, + "learning_rate": 1.1643301164466926e-06, + "loss": 0.0153, + "step": 4071 + }, + { + "epoch": 0.89, + "grad_norm": 0.2369111490974642, + "learning_rate": 1.1595510694017943e-06, + "loss": 0.0218, + "step": 4072 + }, + { + "epoch": 0.89, + "grad_norm": 0.21487258370856713, + "learning_rate": 1.154781557623592e-06, + "loss": 0.027, + "step": 4073 + }, + { + "epoch": 0.89, + "grad_norm": 0.17336881153748232, + "learning_rate": 1.1500215835259664e-06, + "loss": 0.0178, + "step": 4074 + }, + { + "epoch": 0.9, + "grad_norm": 0.23694717100406254, + "learning_rate": 1.1452711495179659e-06, + "loss": 0.0344, + "step": 4075 + }, + { + "epoch": 0.9, + "grad_norm": 0.25646757681452675, + "learning_rate": 1.1405302580038224e-06, + "loss": 0.0252, + "step": 4076 + }, + { + "epoch": 0.9, + "grad_norm": 0.22545380458878736, + "learning_rate": 1.1357989113829237e-06, + "loss": 0.0318, + "step": 4077 + }, + { + "epoch": 0.9, + "grad_norm": 0.26999264077912627, + "learning_rate": 1.1310771120498386e-06, + "loss": 0.0314, + "step": 4078 + }, + { + "epoch": 0.9, + "grad_norm": 0.2638899995894788, + "learning_rate": 1.1263648623942912e-06, + "loss": 0.0306, + "step": 4079 + }, + { + "epoch": 0.9, + "grad_norm": 0.23883802215613728, + "learning_rate": 1.1216621648011873e-06, + "loss": 0.0292, + "step": 4080 + }, + { + "epoch": 0.9, + "grad_norm": 0.24956217813602427, + "learning_rate": 1.1169690216505846e-06, + "loss": 0.0202, + "step": 4081 + }, + { + "epoch": 0.9, + "grad_norm": 0.30855110936179464, + "learning_rate": 1.1122854353177171e-06, + "loss": 0.0309, + "step": 4082 + }, + { + "epoch": 0.9, + "grad_norm": 0.25117188211918606, + "learning_rate": 1.1076114081729682e-06, + "loss": 0.0228, + "step": 4083 + }, + { + "epoch": 0.9, + "grad_norm": 0.23851976026841812, + "learning_rate": 1.1029469425819039e-06, + "loss": 0.0268, + "step": 4084 + }, + { + "epoch": 0.9, + "grad_norm": 0.20732132365666606, + "learning_rate": 1.0982920409052312e-06, + "loss": 0.0222, + "step": 4085 + }, + { + "epoch": 0.9, + "grad_norm": 0.1951781001201858, + "learning_rate": 1.0936467054988276e-06, + "loss": 0.0291, + "step": 4086 + }, + { + "epoch": 0.9, + "grad_norm": 0.20133334225995106, + "learning_rate": 1.0890109387137216e-06, + "loss": 0.0231, + "step": 4087 + }, + { + "epoch": 0.9, + "grad_norm": 0.23568867642602953, + "learning_rate": 1.0843847428961074e-06, + "loss": 0.0279, + "step": 4088 + }, + { + "epoch": 0.9, + "grad_norm": 0.22112487061724806, + "learning_rate": 1.0797681203873255e-06, + "loss": 0.0309, + "step": 4089 + }, + { + "epoch": 0.9, + "grad_norm": 0.2557224897586447, + "learning_rate": 1.0751610735238848e-06, + "loss": 0.0389, + "step": 4090 + }, + { + "epoch": 0.9, + "grad_norm": 0.25351980585021766, + "learning_rate": 1.0705636046374334e-06, + "loss": 0.0269, + "step": 4091 + }, + { + "epoch": 0.9, + "grad_norm": 0.2546228420351522, + "learning_rate": 1.0659757160547813e-06, + "loss": 0.0255, + "step": 4092 + }, + { + "epoch": 0.9, + "grad_norm": 0.27955351555875024, + "learning_rate": 1.0613974100978885e-06, + "loss": 0.0308, + "step": 4093 + }, + { + "epoch": 0.9, + "grad_norm": 0.22525865998805114, + "learning_rate": 1.0568286890838575e-06, + "loss": 0.0188, + "step": 4094 + }, + { + "epoch": 0.9, + "grad_norm": 0.2952819419874532, + "learning_rate": 1.0522695553249562e-06, + "loss": 0.0275, + "step": 4095 + }, + { + "epoch": 0.9, + "grad_norm": 0.2355088938746616, + "learning_rate": 1.047720011128579e-06, + "loss": 0.032, + "step": 4096 + }, + { + "epoch": 0.9, + "grad_norm": 0.23432050707250046, + "learning_rate": 1.0431800587972862e-06, + "loss": 0.0189, + "step": 4097 + }, + { + "epoch": 0.9, + "grad_norm": 0.22924152239045292, + "learning_rate": 1.038649700628771e-06, + "loss": 0.0309, + "step": 4098 + }, + { + "epoch": 0.9, + "grad_norm": 0.3333575741576425, + "learning_rate": 1.0341289389158793e-06, + "loss": 0.0321, + "step": 4099 + }, + { + "epoch": 0.9, + "grad_norm": 0.21430262129224475, + "learning_rate": 1.029617775946592e-06, + "loss": 0.0222, + "step": 4100 + }, + { + "epoch": 0.9, + "grad_norm": 0.21626876395465008, + "learning_rate": 1.0251162140040383e-06, + "loss": 0.0233, + "step": 4101 + }, + { + "epoch": 0.9, + "grad_norm": 0.22993769318753188, + "learning_rate": 1.0206242553664868e-06, + "loss": 0.0222, + "step": 4102 + }, + { + "epoch": 0.9, + "grad_norm": 0.2886638598948931, + "learning_rate": 1.016141902307346e-06, + "loss": 0.035, + "step": 4103 + }, + { + "epoch": 0.9, + "grad_norm": 0.23592357847054904, + "learning_rate": 1.011669157095161e-06, + "loss": 0.032, + "step": 4104 + }, + { + "epoch": 0.9, + "grad_norm": 0.2396318431478083, + "learning_rate": 1.0072060219936164e-06, + "loss": 0.0183, + "step": 4105 + }, + { + "epoch": 0.9, + "grad_norm": 0.26774349810437653, + "learning_rate": 1.002752499261528e-06, + "loss": 0.0318, + "step": 4106 + }, + { + "epoch": 0.9, + "grad_norm": 0.23850047079925532, + "learning_rate": 9.98308591152859e-07, + "loss": 0.0339, + "step": 4107 + }, + { + "epoch": 0.9, + "grad_norm": 0.2522215337607944, + "learning_rate": 9.93874299916693e-07, + "loss": 0.0243, + "step": 4108 + }, + { + "epoch": 0.9, + "grad_norm": 0.22719153686620427, + "learning_rate": 9.894496277972498e-07, + "loss": 0.03, + "step": 4109 + }, + { + "epoch": 0.9, + "grad_norm": 0.1965662389439065, + "learning_rate": 9.850345770338875e-07, + "loss": 0.0282, + "step": 4110 + }, + { + "epoch": 0.9, + "grad_norm": 0.24668700750247183, + "learning_rate": 9.80629149861092e-07, + "loss": 0.032, + "step": 4111 + }, + { + "epoch": 0.9, + "grad_norm": 0.23108659783490207, + "learning_rate": 9.76233348508473e-07, + "loss": 0.0265, + "step": 4112 + }, + { + "epoch": 0.9, + "grad_norm": 0.2106170339238177, + "learning_rate": 9.718471752007753e-07, + "loss": 0.0189, + "step": 4113 + }, + { + "epoch": 0.9, + "grad_norm": 0.21015933222442607, + "learning_rate": 9.67470632157863e-07, + "loss": 0.0221, + "step": 4114 + }, + { + "epoch": 0.9, + "grad_norm": 0.20480083611108585, + "learning_rate": 9.63103721594738e-07, + "loss": 0.0241, + "step": 4115 + }, + { + "epoch": 0.9, + "grad_norm": 0.2944418429963534, + "learning_rate": 9.587464457215146e-07, + "loss": 0.0288, + "step": 4116 + }, + { + "epoch": 0.9, + "grad_norm": 0.23625826788366722, + "learning_rate": 9.54398806743444e-07, + "loss": 0.0224, + "step": 4117 + }, + { + "epoch": 0.9, + "grad_norm": 0.263884818661961, + "learning_rate": 9.500608068608841e-07, + "loss": 0.0247, + "step": 4118 + }, + { + "epoch": 0.9, + "grad_norm": 0.23296904069458949, + "learning_rate": 9.457324482693275e-07, + "loss": 0.0228, + "step": 4119 + }, + { + "epoch": 0.9, + "grad_norm": 0.28249035291319474, + "learning_rate": 9.414137331593842e-07, + "loss": 0.0379, + "step": 4120 + }, + { + "epoch": 0.91, + "grad_norm": 0.23230560985053014, + "learning_rate": 9.371046637167835e-07, + "loss": 0.0167, + "step": 4121 + }, + { + "epoch": 0.91, + "grad_norm": 0.25321784965640365, + "learning_rate": 9.328052421223676e-07, + "loss": 0.0247, + "step": 4122 + }, + { + "epoch": 0.91, + "grad_norm": 0.22432202741914464, + "learning_rate": 9.285154705521048e-07, + "loss": 0.029, + "step": 4123 + }, + { + "epoch": 0.91, + "grad_norm": 0.21346787402733183, + "learning_rate": 9.242353511770697e-07, + "loss": 0.0219, + "step": 4124 + }, + { + "epoch": 0.91, + "grad_norm": 0.23649280894740557, + "learning_rate": 9.199648861634625e-07, + "loss": 0.0261, + "step": 4125 + }, + { + "epoch": 0.91, + "grad_norm": 0.1963177180843091, + "learning_rate": 9.157040776725856e-07, + "loss": 0.0204, + "step": 4126 + }, + { + "epoch": 0.91, + "grad_norm": 0.23517643281117567, + "learning_rate": 9.11452927860863e-07, + "loss": 0.0234, + "step": 4127 + }, + { + "epoch": 0.91, + "grad_norm": 0.2606570501311804, + "learning_rate": 9.072114388798314e-07, + "loss": 0.0229, + "step": 4128 + }, + { + "epoch": 0.91, + "grad_norm": 0.2713708297069939, + "learning_rate": 9.029796128761292e-07, + "loss": 0.0274, + "step": 4129 + }, + { + "epoch": 0.91, + "grad_norm": 0.21933885064572553, + "learning_rate": 8.987574519915121e-07, + "loss": 0.0144, + "step": 4130 + }, + { + "epoch": 0.91, + "grad_norm": 0.24057086391334367, + "learning_rate": 8.945449583628396e-07, + "loss": 0.0228, + "step": 4131 + }, + { + "epoch": 0.91, + "grad_norm": 0.20083751483840492, + "learning_rate": 8.903421341220842e-07, + "loss": 0.0253, + "step": 4132 + }, + { + "epoch": 0.91, + "grad_norm": 0.23787963353213165, + "learning_rate": 8.861489813963154e-07, + "loss": 0.0222, + "step": 4133 + }, + { + "epoch": 0.91, + "grad_norm": 0.2820054610447314, + "learning_rate": 8.819655023077201e-07, + "loss": 0.0304, + "step": 4134 + }, + { + "epoch": 0.91, + "grad_norm": 0.2659333346855394, + "learning_rate": 8.777916989735736e-07, + "loss": 0.0293, + "step": 4135 + }, + { + "epoch": 0.91, + "grad_norm": 0.2236776098834561, + "learning_rate": 8.736275735062749e-07, + "loss": 0.0203, + "step": 4136 + }, + { + "epoch": 0.91, + "grad_norm": 0.3545968818018609, + "learning_rate": 8.694731280133051e-07, + "loss": 0.04, + "step": 4137 + }, + { + "epoch": 0.91, + "grad_norm": 0.19651733713495328, + "learning_rate": 8.653283645972598e-07, + "loss": 0.0195, + "step": 4138 + }, + { + "epoch": 0.91, + "grad_norm": 0.24283739537981863, + "learning_rate": 8.611932853558236e-07, + "loss": 0.0356, + "step": 4139 + }, + { + "epoch": 0.91, + "grad_norm": 0.18475203675836635, + "learning_rate": 8.570678923817888e-07, + "loss": 0.0156, + "step": 4140 + }, + { + "epoch": 0.91, + "grad_norm": 0.2951715057089647, + "learning_rate": 8.529521877630409e-07, + "loss": 0.0328, + "step": 4141 + }, + { + "epoch": 0.91, + "grad_norm": 0.21674076419565533, + "learning_rate": 8.48846173582567e-07, + "loss": 0.0212, + "step": 4142 + }, + { + "epoch": 0.91, + "grad_norm": 0.24360912451878924, + "learning_rate": 8.447498519184405e-07, + "loss": 0.0233, + "step": 4143 + }, + { + "epoch": 0.91, + "grad_norm": 0.1933249240586514, + "learning_rate": 8.406632248438362e-07, + "loss": 0.0147, + "step": 4144 + }, + { + "epoch": 0.91, + "grad_norm": 0.2590958414376381, + "learning_rate": 8.365862944270243e-07, + "loss": 0.0361, + "step": 4145 + }, + { + "epoch": 0.91, + "grad_norm": 0.23042954903530702, + "learning_rate": 8.325190627313628e-07, + "loss": 0.0283, + "step": 4146 + }, + { + "epoch": 0.91, + "grad_norm": 0.2168782742937472, + "learning_rate": 8.284615318152988e-07, + "loss": 0.0256, + "step": 4147 + }, + { + "epoch": 0.91, + "grad_norm": 0.23256130203793526, + "learning_rate": 8.244137037323807e-07, + "loss": 0.0243, + "step": 4148 + }, + { + "epoch": 0.91, + "grad_norm": 0.2065068928120067, + "learning_rate": 8.203755805312319e-07, + "loss": 0.027, + "step": 4149 + }, + { + "epoch": 0.91, + "grad_norm": 0.2840058099562093, + "learning_rate": 8.163471642555798e-07, + "loss": 0.0318, + "step": 4150 + }, + { + "epoch": 0.91, + "grad_norm": 0.24845108378959502, + "learning_rate": 8.123284569442203e-07, + "loss": 0.0214, + "step": 4151 + }, + { + "epoch": 0.91, + "grad_norm": 0.26372225244833675, + "learning_rate": 8.083194606310507e-07, + "loss": 0.0255, + "step": 4152 + }, + { + "epoch": 0.91, + "grad_norm": 0.222776832053034, + "learning_rate": 8.043201773450526e-07, + "loss": 0.0328, + "step": 4153 + }, + { + "epoch": 0.91, + "grad_norm": 0.2358572422346872, + "learning_rate": 8.003306091102803e-07, + "loss": 0.0363, + "step": 4154 + }, + { + "epoch": 0.91, + "grad_norm": 0.20142653929065527, + "learning_rate": 7.963507579458851e-07, + "loss": 0.0197, + "step": 4155 + }, + { + "epoch": 0.91, + "grad_norm": 0.23690345186906567, + "learning_rate": 7.923806258660893e-07, + "loss": 0.0185, + "step": 4156 + }, + { + "epoch": 0.91, + "grad_norm": 0.27213312753447716, + "learning_rate": 7.884202148802056e-07, + "loss": 0.0234, + "step": 4157 + }, + { + "epoch": 0.91, + "grad_norm": 0.24306988711717298, + "learning_rate": 7.844695269926194e-07, + "loss": 0.0212, + "step": 4158 + }, + { + "epoch": 0.91, + "grad_norm": 0.2129738146430894, + "learning_rate": 7.805285642027983e-07, + "loss": 0.0246, + "step": 4159 + }, + { + "epoch": 0.91, + "grad_norm": 0.25549705710181264, + "learning_rate": 7.765973285052863e-07, + "loss": 0.0319, + "step": 4160 + }, + { + "epoch": 0.91, + "grad_norm": 0.22458338379785636, + "learning_rate": 7.726758218897079e-07, + "loss": 0.0234, + "step": 4161 + }, + { + "epoch": 0.91, + "grad_norm": 0.23688323660850585, + "learning_rate": 7.687640463407597e-07, + "loss": 0.0226, + "step": 4162 + }, + { + "epoch": 0.91, + "grad_norm": 0.20468908374843428, + "learning_rate": 7.648620038382204e-07, + "loss": 0.0262, + "step": 4163 + }, + { + "epoch": 0.91, + "grad_norm": 0.23873796624918533, + "learning_rate": 7.609696963569325e-07, + "loss": 0.0238, + "step": 4164 + }, + { + "epoch": 0.91, + "grad_norm": 0.25653442844889807, + "learning_rate": 7.5708712586682e-07, + "loss": 0.027, + "step": 4165 + }, + { + "epoch": 0.92, + "grad_norm": 0.22679305677521136, + "learning_rate": 7.532142943328713e-07, + "loss": 0.0226, + "step": 4166 + }, + { + "epoch": 0.92, + "grad_norm": 0.2986472835490037, + "learning_rate": 7.493512037151563e-07, + "loss": 0.0373, + "step": 4167 + }, + { + "epoch": 0.92, + "grad_norm": 0.19956714422279945, + "learning_rate": 7.454978559688019e-07, + "loss": 0.0188, + "step": 4168 + }, + { + "epoch": 0.92, + "grad_norm": 0.2401934798253411, + "learning_rate": 7.416542530440174e-07, + "loss": 0.0289, + "step": 4169 + }, + { + "epoch": 0.92, + "grad_norm": 0.25598255133583186, + "learning_rate": 7.378203968860643e-07, + "loss": 0.031, + "step": 4170 + }, + { + "epoch": 0.92, + "grad_norm": 0.2686609508781585, + "learning_rate": 7.339962894352925e-07, + "loss": 0.0337, + "step": 4171 + }, + { + "epoch": 0.92, + "grad_norm": 0.2355377600378674, + "learning_rate": 7.30181932627101e-07, + "loss": 0.0296, + "step": 4172 + }, + { + "epoch": 0.92, + "grad_norm": 0.23873494683739044, + "learning_rate": 7.263773283919584e-07, + "loss": 0.029, + "step": 4173 + }, + { + "epoch": 0.92, + "grad_norm": 0.2928635599121976, + "learning_rate": 7.225824786553981e-07, + "loss": 0.0269, + "step": 4174 + }, + { + "epoch": 0.92, + "grad_norm": 0.2290730321214082, + "learning_rate": 7.187973853380215e-07, + "loss": 0.021, + "step": 4175 + }, + { + "epoch": 0.92, + "grad_norm": 0.2189374018796047, + "learning_rate": 7.150220503554783e-07, + "loss": 0.0225, + "step": 4176 + }, + { + "epoch": 0.92, + "grad_norm": 0.23741704533576782, + "learning_rate": 7.112564756184981e-07, + "loss": 0.0263, + "step": 4177 + }, + { + "epoch": 0.92, + "grad_norm": 0.24110244905868689, + "learning_rate": 7.075006630328518e-07, + "loss": 0.0175, + "step": 4178 + }, + { + "epoch": 0.92, + "grad_norm": 0.2777066073372374, + "learning_rate": 7.037546144993901e-07, + "loss": 0.0327, + "step": 4179 + }, + { + "epoch": 0.92, + "grad_norm": 0.28038260696353856, + "learning_rate": 7.000183319140053e-07, + "loss": 0.0229, + "step": 4180 + }, + { + "epoch": 0.92, + "grad_norm": 0.28097797855363493, + "learning_rate": 6.962918171676536e-07, + "loss": 0.033, + "step": 4181 + }, + { + "epoch": 0.92, + "grad_norm": 0.2777447286787186, + "learning_rate": 6.925750721463443e-07, + "loss": 0.03, + "step": 4182 + }, + { + "epoch": 0.92, + "grad_norm": 0.3036071747282006, + "learning_rate": 6.88868098731148e-07, + "loss": 0.0404, + "step": 4183 + }, + { + "epoch": 0.92, + "grad_norm": 0.2548057468877089, + "learning_rate": 6.851708987981865e-07, + "loss": 0.0246, + "step": 4184 + }, + { + "epoch": 0.92, + "grad_norm": 0.20866771129708744, + "learning_rate": 6.814834742186361e-07, + "loss": 0.0165, + "step": 4185 + }, + { + "epoch": 0.92, + "grad_norm": 0.24086860735556023, + "learning_rate": 6.778058268587217e-07, + "loss": 0.0374, + "step": 4186 + }, + { + "epoch": 0.92, + "grad_norm": 0.23213997743465795, + "learning_rate": 6.741379585797236e-07, + "loss": 0.0236, + "step": 4187 + }, + { + "epoch": 0.92, + "grad_norm": 0.2619117480694197, + "learning_rate": 6.704798712379768e-07, + "loss": 0.0334, + "step": 4188 + }, + { + "epoch": 0.92, + "grad_norm": 0.2228225789458934, + "learning_rate": 6.66831566684858e-07, + "loss": 0.026, + "step": 4189 + }, + { + "epoch": 0.92, + "grad_norm": 0.23261587877650092, + "learning_rate": 6.631930467667991e-07, + "loss": 0.0254, + "step": 4190 + }, + { + "epoch": 0.92, + "grad_norm": 0.23538675480397928, + "learning_rate": 6.595643133252716e-07, + "loss": 0.0234, + "step": 4191 + }, + { + "epoch": 0.92, + "grad_norm": 0.20758722435386293, + "learning_rate": 6.559453681968064e-07, + "loss": 0.0261, + "step": 4192 + }, + { + "epoch": 0.92, + "grad_norm": 0.23417337009794903, + "learning_rate": 6.523362132129718e-07, + "loss": 0.0214, + "step": 4193 + }, + { + "epoch": 0.92, + "grad_norm": 0.2359065086018697, + "learning_rate": 6.487368502003821e-07, + "loss": 0.0197, + "step": 4194 + }, + { + "epoch": 0.92, + "grad_norm": 0.2548183929766835, + "learning_rate": 6.451472809806958e-07, + "loss": 0.0254, + "step": 4195 + }, + { + "epoch": 0.92, + "grad_norm": 0.24662451047698913, + "learning_rate": 6.415675073706174e-07, + "loss": 0.0226, + "step": 4196 + }, + { + "epoch": 0.92, + "grad_norm": 0.2464448150470119, + "learning_rate": 6.379975311818931e-07, + "loss": 0.0378, + "step": 4197 + }, + { + "epoch": 0.92, + "grad_norm": 0.2398910472719313, + "learning_rate": 6.344373542213112e-07, + "loss": 0.0254, + "step": 4198 + }, + { + "epoch": 0.92, + "grad_norm": 0.195640993439512, + "learning_rate": 6.308869782906946e-07, + "loss": 0.0181, + "step": 4199 + }, + { + "epoch": 0.92, + "grad_norm": 0.23935219596327983, + "learning_rate": 6.27346405186915e-07, + "loss": 0.0238, + "step": 4200 + }, + { + "epoch": 0.92, + "grad_norm": 0.2357560713882237, + "learning_rate": 6.238156367018744e-07, + "loss": 0.0276, + "step": 4201 + }, + { + "epoch": 0.92, + "grad_norm": 0.2454419207707864, + "learning_rate": 6.202946746225191e-07, + "loss": 0.0321, + "step": 4202 + }, + { + "epoch": 0.92, + "grad_norm": 0.2008752487806573, + "learning_rate": 6.16783520730826e-07, + "loss": 0.0179, + "step": 4203 + }, + { + "epoch": 0.92, + "grad_norm": 0.23999128215324347, + "learning_rate": 6.132821768038133e-07, + "loss": 0.0412, + "step": 4204 + }, + { + "epoch": 0.92, + "grad_norm": 0.18943407021881573, + "learning_rate": 6.097906446135349e-07, + "loss": 0.0196, + "step": 4205 + }, + { + "epoch": 0.92, + "grad_norm": 0.24820520949456476, + "learning_rate": 6.063089259270749e-07, + "loss": 0.0241, + "step": 4206 + }, + { + "epoch": 0.92, + "grad_norm": 0.22108302713818226, + "learning_rate": 6.028370225065527e-07, + "loss": 0.0256, + "step": 4207 + }, + { + "epoch": 0.92, + "grad_norm": 0.20942082686440533, + "learning_rate": 5.993749361091206e-07, + "loss": 0.0233, + "step": 4208 + }, + { + "epoch": 0.92, + "grad_norm": 0.21624351021780297, + "learning_rate": 5.95922668486959e-07, + "loss": 0.0221, + "step": 4209 + }, + { + "epoch": 0.92, + "grad_norm": 0.2030324908655009, + "learning_rate": 5.92480221387286e-07, + "loss": 0.0266, + "step": 4210 + }, + { + "epoch": 0.92, + "grad_norm": 0.2234644610137812, + "learning_rate": 5.890475965523412e-07, + "loss": 0.0195, + "step": 4211 + }, + { + "epoch": 0.93, + "grad_norm": 0.20601259497684363, + "learning_rate": 5.856247957193995e-07, + "loss": 0.021, + "step": 4212 + }, + { + "epoch": 0.93, + "grad_norm": 0.2268001855233441, + "learning_rate": 5.822118206207594e-07, + "loss": 0.0288, + "step": 4213 + }, + { + "epoch": 0.93, + "grad_norm": 0.231620561267959, + "learning_rate": 5.788086729837505e-07, + "loss": 0.014, + "step": 4214 + }, + { + "epoch": 0.93, + "grad_norm": 0.27735890091896426, + "learning_rate": 5.754153545307262e-07, + "loss": 0.033, + "step": 4215 + }, + { + "epoch": 0.93, + "grad_norm": 0.22598401570866886, + "learning_rate": 5.720318669790636e-07, + "loss": 0.0209, + "step": 4216 + }, + { + "epoch": 0.93, + "grad_norm": 0.2529445187259624, + "learning_rate": 5.68658212041171e-07, + "loss": 0.0325, + "step": 4217 + }, + { + "epoch": 0.93, + "grad_norm": 0.28292747763192433, + "learning_rate": 5.652943914244713e-07, + "loss": 0.0184, + "step": 4218 + }, + { + "epoch": 0.93, + "grad_norm": 0.23992488277864113, + "learning_rate": 5.61940406831416e-07, + "loss": 0.0251, + "step": 4219 + }, + { + "epoch": 0.93, + "grad_norm": 0.2524621095413231, + "learning_rate": 5.585962599594807e-07, + "loss": 0.0261, + "step": 4220 + }, + { + "epoch": 0.93, + "grad_norm": 0.2279851821378975, + "learning_rate": 5.552619525011538e-07, + "loss": 0.0177, + "step": 4221 + }, + { + "epoch": 0.93, + "grad_norm": 0.23148640939756462, + "learning_rate": 5.519374861439497e-07, + "loss": 0.0286, + "step": 4222 + }, + { + "epoch": 0.93, + "grad_norm": 0.26922703589088404, + "learning_rate": 5.486228625704049e-07, + "loss": 0.0315, + "step": 4223 + }, + { + "epoch": 0.93, + "grad_norm": 0.25237760346266275, + "learning_rate": 5.453180834580663e-07, + "loss": 0.0243, + "step": 4224 + }, + { + "epoch": 0.93, + "grad_norm": 0.28562854167820906, + "learning_rate": 5.42023150479507e-07, + "loss": 0.0342, + "step": 4225 + }, + { + "epoch": 0.93, + "grad_norm": 0.2511582366420523, + "learning_rate": 5.387380653023066e-07, + "loss": 0.029, + "step": 4226 + }, + { + "epoch": 0.93, + "grad_norm": 0.1821205709968757, + "learning_rate": 5.354628295890729e-07, + "loss": 0.0203, + "step": 4227 + }, + { + "epoch": 0.93, + "grad_norm": 0.210666880264267, + "learning_rate": 5.321974449974198e-07, + "loss": 0.0224, + "step": 4228 + }, + { + "epoch": 0.93, + "grad_norm": 0.19076632553010706, + "learning_rate": 5.289419131799811e-07, + "loss": 0.0215, + "step": 4229 + }, + { + "epoch": 0.93, + "grad_norm": 0.21570515041374458, + "learning_rate": 5.256962357843942e-07, + "loss": 0.0194, + "step": 4230 + }, + { + "epoch": 0.93, + "grad_norm": 0.23654925822055214, + "learning_rate": 5.224604144533274e-07, + "loss": 0.0227, + "step": 4231 + }, + { + "epoch": 0.93, + "grad_norm": 0.2083108333517541, + "learning_rate": 5.192344508244418e-07, + "loss": 0.0241, + "step": 4232 + }, + { + "epoch": 0.93, + "grad_norm": 0.2713138676090215, + "learning_rate": 5.160183465304203e-07, + "loss": 0.0346, + "step": 4233 + }, + { + "epoch": 0.93, + "grad_norm": 0.20912935451025808, + "learning_rate": 5.128121031989497e-07, + "loss": 0.0157, + "step": 4234 + }, + { + "epoch": 0.93, + "grad_norm": 0.19525838102139567, + "learning_rate": 5.096157224527343e-07, + "loss": 0.02, + "step": 4235 + }, + { + "epoch": 0.93, + "grad_norm": 0.3220334690689413, + "learning_rate": 5.0642920590948e-07, + "loss": 0.0321, + "step": 4236 + }, + { + "epoch": 0.93, + "grad_norm": 0.2030117379374764, + "learning_rate": 5.032525551819012e-07, + "loss": 0.0231, + "step": 4237 + }, + { + "epoch": 0.93, + "grad_norm": 0.26590538653566287, + "learning_rate": 5.000857718777186e-07, + "loss": 0.0315, + "step": 4238 + }, + { + "epoch": 0.93, + "grad_norm": 0.3333967292621206, + "learning_rate": 4.969288575996656e-07, + "loss": 0.0372, + "step": 4239 + }, + { + "epoch": 0.93, + "grad_norm": 0.282552722045364, + "learning_rate": 4.937818139454709e-07, + "loss": 0.0245, + "step": 4240 + }, + { + "epoch": 0.93, + "grad_norm": 0.28079548508313534, + "learning_rate": 4.906446425078782e-07, + "loss": 0.0313, + "step": 4241 + }, + { + "epoch": 0.93, + "grad_norm": 0.23541741617190348, + "learning_rate": 4.87517344874624e-07, + "loss": 0.0271, + "step": 4242 + }, + { + "epoch": 0.93, + "grad_norm": 0.20766293227191449, + "learning_rate": 4.843999226284579e-07, + "loss": 0.0203, + "step": 4243 + }, + { + "epoch": 0.93, + "grad_norm": 0.24228220287830854, + "learning_rate": 4.812923773471201e-07, + "loss": 0.0349, + "step": 4244 + }, + { + "epoch": 0.93, + "grad_norm": 0.20977995413316466, + "learning_rate": 4.781947106033635e-07, + "loss": 0.0138, + "step": 4245 + }, + { + "epoch": 0.93, + "grad_norm": 0.22777955603260774, + "learning_rate": 4.7510692396493197e-07, + "loss": 0.0256, + "step": 4246 + }, + { + "epoch": 0.93, + "grad_norm": 0.2576554426894255, + "learning_rate": 4.720290189945775e-07, + "loss": 0.0282, + "step": 4247 + }, + { + "epoch": 0.93, + "grad_norm": 0.2607966734003366, + "learning_rate": 4.689609972500453e-07, + "loss": 0.0224, + "step": 4248 + }, + { + "epoch": 0.93, + "grad_norm": 0.26018610602543085, + "learning_rate": 4.659028602840776e-07, + "loss": 0.0223, + "step": 4249 + }, + { + "epoch": 0.93, + "grad_norm": 0.24925940310591385, + "learning_rate": 4.628546096444186e-07, + "loss": 0.0354, + "step": 4250 + }, + { + "epoch": 0.93, + "grad_norm": 0.23503375459028097, + "learning_rate": 4.5981624687380764e-07, + "loss": 0.0189, + "step": 4251 + }, + { + "epoch": 0.93, + "grad_norm": 0.2559029927821459, + "learning_rate": 4.567877735099768e-07, + "loss": 0.0327, + "step": 4252 + }, + { + "epoch": 0.93, + "grad_norm": 0.22518056306733947, + "learning_rate": 4.5376919108565345e-07, + "loss": 0.0331, + "step": 4253 + }, + { + "epoch": 0.93, + "grad_norm": 0.2204743355679495, + "learning_rate": 4.507605011285643e-07, + "loss": 0.0225, + "step": 4254 + }, + { + "epoch": 0.93, + "grad_norm": 0.24288990851315317, + "learning_rate": 4.477617051614225e-07, + "loss": 0.0244, + "step": 4255 + }, + { + "epoch": 0.93, + "grad_norm": 0.24696868193397514, + "learning_rate": 4.4477280470194064e-07, + "loss": 0.0357, + "step": 4256 + }, + { + "epoch": 0.93, + "grad_norm": 0.19652242690213034, + "learning_rate": 4.4179380126281533e-07, + "loss": 0.0196, + "step": 4257 + }, + { + "epoch": 0.94, + "grad_norm": 0.26521811456515404, + "learning_rate": 4.3882469635174287e-07, + "loss": 0.036, + "step": 4258 + }, + { + "epoch": 0.94, + "grad_norm": 0.22014588081843334, + "learning_rate": 4.358654914714033e-07, + "loss": 0.0313, + "step": 4259 + }, + { + "epoch": 0.94, + "grad_norm": 0.2394356203137378, + "learning_rate": 4.329161881194677e-07, + "loss": 0.0297, + "step": 4260 + }, + { + "epoch": 0.94, + "grad_norm": 0.23013257708396465, + "learning_rate": 4.299767877885974e-07, + "loss": 0.0272, + "step": 4261 + }, + { + "epoch": 0.94, + "grad_norm": 0.19900803548497645, + "learning_rate": 4.270472919664426e-07, + "loss": 0.0279, + "step": 4262 + }, + { + "epoch": 0.94, + "grad_norm": 0.24045887203637645, + "learning_rate": 4.241277021356327e-07, + "loss": 0.025, + "step": 4263 + }, + { + "epoch": 0.94, + "grad_norm": 0.2620758637838076, + "learning_rate": 4.212180197737992e-07, + "loss": 0.0243, + "step": 4264 + }, + { + "epoch": 0.94, + "grad_norm": 0.23535719742103473, + "learning_rate": 4.183182463535418e-07, + "loss": 0.0227, + "step": 4265 + }, + { + "epoch": 0.94, + "grad_norm": 0.20651391007562137, + "learning_rate": 4.1542838334245994e-07, + "loss": 0.0206, + "step": 4266 + }, + { + "epoch": 0.94, + "grad_norm": 0.28741586635869343, + "learning_rate": 4.1254843220312814e-07, + "loss": 0.0243, + "step": 4267 + }, + { + "epoch": 0.94, + "grad_norm": 0.22432157903277913, + "learning_rate": 4.096783943931093e-07, + "loss": 0.0302, + "step": 4268 + }, + { + "epoch": 0.94, + "grad_norm": 0.2236200437273739, + "learning_rate": 4.0681827136494157e-07, + "loss": 0.0234, + "step": 4269 + }, + { + "epoch": 0.94, + "grad_norm": 0.22921955065597152, + "learning_rate": 4.039680645661581e-07, + "loss": 0.022, + "step": 4270 + }, + { + "epoch": 0.94, + "grad_norm": 0.2059903699825295, + "learning_rate": 4.011277754392606e-07, + "loss": 0.0307, + "step": 4271 + }, + { + "epoch": 0.94, + "grad_norm": 0.24625171405418378, + "learning_rate": 3.9829740542174143e-07, + "loss": 0.0331, + "step": 4272 + }, + { + "epoch": 0.94, + "grad_norm": 0.280307993602784, + "learning_rate": 3.954769559460614e-07, + "loss": 0.0223, + "step": 4273 + }, + { + "epoch": 0.94, + "grad_norm": 0.2639314292337824, + "learning_rate": 3.9266642843967415e-07, + "loss": 0.0255, + "step": 4274 + }, + { + "epoch": 0.94, + "grad_norm": 0.15444482880435126, + "learning_rate": 3.8986582432500196e-07, + "loss": 0.0118, + "step": 4275 + }, + { + "epoch": 0.94, + "grad_norm": 0.2156307803636951, + "learning_rate": 3.8707514501944657e-07, + "loss": 0.0222, + "step": 4276 + }, + { + "epoch": 0.94, + "grad_norm": 0.24637660921685095, + "learning_rate": 3.842943919353914e-07, + "loss": 0.0297, + "step": 4277 + }, + { + "epoch": 0.94, + "grad_norm": 0.19506373775960947, + "learning_rate": 3.815235664801908e-07, + "loss": 0.0225, + "step": 4278 + }, + { + "epoch": 0.94, + "grad_norm": 0.21452376204645704, + "learning_rate": 3.787626700561742e-07, + "loss": 0.0253, + "step": 4279 + }, + { + "epoch": 0.94, + "grad_norm": 0.2523996551046947, + "learning_rate": 3.7601170406065034e-07, + "loss": 0.0214, + "step": 4280 + }, + { + "epoch": 0.94, + "grad_norm": 0.20145856927965056, + "learning_rate": 3.732706698859012e-07, + "loss": 0.0212, + "step": 4281 + }, + { + "epoch": 0.94, + "grad_norm": 0.2139929832573777, + "learning_rate": 3.705395689191771e-07, + "loss": 0.0232, + "step": 4282 + }, + { + "epoch": 0.94, + "grad_norm": 0.23201054149913145, + "learning_rate": 3.6781840254271227e-07, + "loss": 0.0255, + "step": 4283 + }, + { + "epoch": 0.94, + "grad_norm": 0.2101806378306389, + "learning_rate": 3.651071721336963e-07, + "loss": 0.0256, + "step": 4284 + }, + { + "epoch": 0.94, + "grad_norm": 0.258124278577854, + "learning_rate": 3.62405879064307e-07, + "loss": 0.0235, + "step": 4285 + }, + { + "epoch": 0.94, + "grad_norm": 0.20574487654342996, + "learning_rate": 3.59714524701682e-07, + "loss": 0.0196, + "step": 4286 + }, + { + "epoch": 0.94, + "grad_norm": 0.2911754015930831, + "learning_rate": 3.5703311040793167e-07, + "loss": 0.0259, + "step": 4287 + }, + { + "epoch": 0.94, + "grad_norm": 0.24479935089522756, + "learning_rate": 3.543616375401393e-07, + "loss": 0.0251, + "step": 4288 + }, + { + "epoch": 0.94, + "grad_norm": 0.22234509292754578, + "learning_rate": 3.517001074503501e-07, + "loss": 0.0296, + "step": 4289 + }, + { + "epoch": 0.94, + "grad_norm": 0.2600904312728723, + "learning_rate": 3.490485214855799e-07, + "loss": 0.032, + "step": 4290 + }, + { + "epoch": 0.94, + "grad_norm": 0.22179503947671178, + "learning_rate": 3.464068809878196e-07, + "loss": 0.023, + "step": 4291 + }, + { + "epoch": 0.94, + "grad_norm": 0.3118749447098188, + "learning_rate": 3.4377518729401317e-07, + "loss": 0.0336, + "step": 4292 + }, + { + "epoch": 0.94, + "grad_norm": 0.20872304745239587, + "learning_rate": 3.4115344173607957e-07, + "loss": 0.0196, + "step": 4293 + }, + { + "epoch": 0.94, + "grad_norm": 0.23229821706240353, + "learning_rate": 3.3854164564089964e-07, + "loss": 0.028, + "step": 4294 + }, + { + "epoch": 0.94, + "grad_norm": 0.2519289049439303, + "learning_rate": 3.359398003303183e-07, + "loss": 0.0213, + "step": 4295 + }, + { + "epoch": 0.94, + "grad_norm": 0.25250354053828866, + "learning_rate": 3.333479071211465e-07, + "loss": 0.0296, + "step": 4296 + }, + { + "epoch": 0.94, + "grad_norm": 0.3185874262580183, + "learning_rate": 3.307659673251595e-07, + "loss": 0.036, + "step": 4297 + }, + { + "epoch": 0.94, + "grad_norm": 0.267967650630909, + "learning_rate": 3.281939822490876e-07, + "loss": 0.0359, + "step": 4298 + }, + { + "epoch": 0.94, + "grad_norm": 0.2060807469237559, + "learning_rate": 3.256319531946317e-07, + "loss": 0.0157, + "step": 4299 + }, + { + "epoch": 0.94, + "grad_norm": 0.21211101678876754, + "learning_rate": 3.230798814584502e-07, + "loss": 0.0258, + "step": 4300 + }, + { + "epoch": 0.94, + "grad_norm": 0.23040547552978455, + "learning_rate": 3.2053776833216533e-07, + "loss": 0.0255, + "step": 4301 + }, + { + "epoch": 0.94, + "grad_norm": 0.25107194149449047, + "learning_rate": 3.1800561510234805e-07, + "loss": 0.0193, + "step": 4302 + }, + { + "epoch": 0.95, + "grad_norm": 0.2353953641280843, + "learning_rate": 3.1548342305054435e-07, + "loss": 0.0261, + "step": 4303 + }, + { + "epoch": 0.95, + "grad_norm": 0.21677579863228158, + "learning_rate": 3.1297119345324645e-07, + "loss": 0.0252, + "step": 4304 + }, + { + "epoch": 0.95, + "grad_norm": 0.2627323790992519, + "learning_rate": 3.104689275819128e-07, + "loss": 0.0226, + "step": 4305 + }, + { + "epoch": 0.95, + "grad_norm": 0.18438741916920828, + "learning_rate": 3.079766267029527e-07, + "loss": 0.0179, + "step": 4306 + }, + { + "epoch": 0.95, + "grad_norm": 0.18393177001251199, + "learning_rate": 3.0549429207773483e-07, + "loss": 0.0199, + "step": 4307 + }, + { + "epoch": 0.95, + "grad_norm": 0.19469604482620562, + "learning_rate": 3.030219249625854e-07, + "loss": 0.0169, + "step": 4308 + }, + { + "epoch": 0.95, + "grad_norm": 0.24246372344830378, + "learning_rate": 3.005595266087835e-07, + "loss": 0.0278, + "step": 4309 + }, + { + "epoch": 0.95, + "grad_norm": 0.18704342884511982, + "learning_rate": 2.9810709826256557e-07, + "loss": 0.0179, + "step": 4310 + }, + { + "epoch": 0.95, + "grad_norm": 0.27612142790059196, + "learning_rate": 2.956646411651165e-07, + "loss": 0.0339, + "step": 4311 + }, + { + "epoch": 0.95, + "grad_norm": 0.252322613566112, + "learning_rate": 2.932321565525853e-07, + "loss": 0.028, + "step": 4312 + }, + { + "epoch": 0.95, + "grad_norm": 0.23652565007706597, + "learning_rate": 2.9080964565606273e-07, + "loss": 0.0327, + "step": 4313 + }, + { + "epoch": 0.95, + "grad_norm": 0.22163762100975448, + "learning_rate": 2.883971097015992e-07, + "loss": 0.0256, + "step": 4314 + }, + { + "epoch": 0.95, + "grad_norm": 0.23073831246607718, + "learning_rate": 2.859945499101913e-07, + "loss": 0.0242, + "step": 4315 + }, + { + "epoch": 0.95, + "grad_norm": 0.23499828748890664, + "learning_rate": 2.8360196749778857e-07, + "loss": 0.0294, + "step": 4316 + }, + { + "epoch": 0.95, + "grad_norm": 0.2615324752419736, + "learning_rate": 2.8121936367529357e-07, + "loss": 0.0289, + "step": 4317 + }, + { + "epoch": 0.95, + "grad_norm": 0.21855633541573388, + "learning_rate": 2.788467396485595e-07, + "loss": 0.0237, + "step": 4318 + }, + { + "epoch": 0.95, + "grad_norm": 0.2838499246799449, + "learning_rate": 2.7648409661837903e-07, + "loss": 0.0325, + "step": 4319 + }, + { + "epoch": 0.95, + "grad_norm": 0.2482569248197878, + "learning_rate": 2.7413143578050915e-07, + "loss": 0.0243, + "step": 4320 + }, + { + "epoch": 0.95, + "grad_norm": 0.24498061064465365, + "learning_rate": 2.7178875832563734e-07, + "loss": 0.0209, + "step": 4321 + }, + { + "epoch": 0.95, + "grad_norm": 0.2504161559849217, + "learning_rate": 2.6945606543941073e-07, + "loss": 0.0322, + "step": 4322 + }, + { + "epoch": 0.95, + "grad_norm": 0.1695524452953065, + "learning_rate": 2.671333583024205e-07, + "loss": 0.019, + "step": 4323 + }, + { + "epoch": 0.95, + "grad_norm": 0.24340795824962608, + "learning_rate": 2.6482063809020186e-07, + "loss": 0.0296, + "step": 4324 + }, + { + "epoch": 0.95, + "grad_norm": 0.19713535350830463, + "learning_rate": 2.625179059732341e-07, + "loss": 0.019, + "step": 4325 + }, + { + "epoch": 0.95, + "grad_norm": 0.24370940463376908, + "learning_rate": 2.6022516311695166e-07, + "loss": 0.0241, + "step": 4326 + }, + { + "epoch": 0.95, + "grad_norm": 0.3164895148273076, + "learning_rate": 2.579424106817174e-07, + "loss": 0.0373, + "step": 4327 + }, + { + "epoch": 0.95, + "grad_norm": 0.23927869121772133, + "learning_rate": 2.556696498228495e-07, + "loss": 0.0268, + "step": 4328 + }, + { + "epoch": 0.95, + "grad_norm": 0.23079495055507038, + "learning_rate": 2.5340688169060767e-07, + "loss": 0.0241, + "step": 4329 + }, + { + "epoch": 0.95, + "grad_norm": 0.2125157196177094, + "learning_rate": 2.511541074301915e-07, + "loss": 0.0244, + "step": 4330 + }, + { + "epoch": 0.95, + "grad_norm": 0.4308091868465902, + "learning_rate": 2.489113281817424e-07, + "loss": 0.0395, + "step": 4331 + }, + { + "epoch": 0.95, + "grad_norm": 0.21628613495322382, + "learning_rate": 2.4667854508034774e-07, + "loss": 0.0283, + "step": 4332 + }, + { + "epoch": 0.95, + "grad_norm": 0.2564300695277641, + "learning_rate": 2.444557592560304e-07, + "loss": 0.0321, + "step": 4333 + }, + { + "epoch": 0.95, + "grad_norm": 0.22927813997690163, + "learning_rate": 2.4224297183375487e-07, + "loss": 0.0219, + "step": 4334 + }, + { + "epoch": 0.95, + "grad_norm": 0.2885607540390936, + "learning_rate": 2.400401839334299e-07, + "loss": 0.0306, + "step": 4335 + }, + { + "epoch": 0.95, + "grad_norm": 0.18273473210359006, + "learning_rate": 2.378473966698991e-07, + "loss": 0.0251, + "step": 4336 + }, + { + "epoch": 0.95, + "grad_norm": 0.2038541873515383, + "learning_rate": 2.356646111529415e-07, + "loss": 0.0163, + "step": 4337 + }, + { + "epoch": 0.95, + "grad_norm": 0.22756448693732054, + "learning_rate": 2.3349182848728447e-07, + "loss": 0.0209, + "step": 4338 + }, + { + "epoch": 0.95, + "grad_norm": 0.25275938272936144, + "learning_rate": 2.3132904977258175e-07, + "loss": 0.0275, + "step": 4339 + }, + { + "epoch": 0.95, + "grad_norm": 0.25280394797837497, + "learning_rate": 2.291762761034333e-07, + "loss": 0.034, + "step": 4340 + }, + { + "epoch": 0.95, + "grad_norm": 0.2656973011126717, + "learning_rate": 2.2703350856936534e-07, + "loss": 0.0339, + "step": 4341 + }, + { + "epoch": 0.95, + "grad_norm": 0.2092671130621699, + "learning_rate": 2.249007482548482e-07, + "loss": 0.0234, + "step": 4342 + }, + { + "epoch": 0.95, + "grad_norm": 0.23992350864382142, + "learning_rate": 2.2277799623928953e-07, + "loss": 0.0296, + "step": 4343 + }, + { + "epoch": 0.95, + "grad_norm": 0.2881623764785107, + "learning_rate": 2.2066525359701885e-07, + "loss": 0.04, + "step": 4344 + }, + { + "epoch": 0.95, + "grad_norm": 0.20662916472588463, + "learning_rate": 2.1856252139731637e-07, + "loss": 0.0307, + "step": 4345 + }, + { + "epoch": 0.95, + "grad_norm": 0.2559865096959685, + "learning_rate": 2.1646980070437973e-07, + "loss": 0.0398, + "step": 4346 + }, + { + "epoch": 0.95, + "grad_norm": 0.2727011438698973, + "learning_rate": 2.14387092577355e-07, + "loss": 0.0281, + "step": 4347 + }, + { + "epoch": 0.95, + "grad_norm": 0.2275752092064004, + "learning_rate": 2.1231439807031019e-07, + "loss": 0.0227, + "step": 4348 + }, + { + "epoch": 0.96, + "grad_norm": 0.25661856732154437, + "learning_rate": 2.102517182322483e-07, + "loss": 0.0212, + "step": 4349 + }, + { + "epoch": 0.96, + "grad_norm": 0.2125043426799214, + "learning_rate": 2.0819905410710327e-07, + "loss": 0.0185, + "step": 4350 + }, + { + "epoch": 0.96, + "grad_norm": 0.1752174395363728, + "learning_rate": 2.0615640673374181e-07, + "loss": 0.0182, + "step": 4351 + }, + { + "epoch": 0.96, + "grad_norm": 0.17804550354493404, + "learning_rate": 2.0412377714596365e-07, + "loss": 0.0159, + "step": 4352 + }, + { + "epoch": 0.96, + "grad_norm": 0.24171920077251674, + "learning_rate": 2.0210116637249032e-07, + "loss": 0.0254, + "step": 4353 + }, + { + "epoch": 0.96, + "grad_norm": 0.23536818987728242, + "learning_rate": 2.0008857543698078e-07, + "loss": 0.0203, + "step": 4354 + }, + { + "epoch": 0.96, + "grad_norm": 0.19003395630424846, + "learning_rate": 1.9808600535802024e-07, + "loss": 0.0193, + "step": 4355 + }, + { + "epoch": 0.96, + "grad_norm": 0.21540936662109153, + "learning_rate": 1.9609345714911575e-07, + "loss": 0.0286, + "step": 4356 + }, + { + "epoch": 0.96, + "grad_norm": 0.25794112724783735, + "learning_rate": 1.941109318187162e-07, + "loss": 0.0263, + "step": 4357 + }, + { + "epoch": 0.96, + "grad_norm": 0.20516243064617495, + "learning_rate": 1.9213843037018344e-07, + "loss": 0.0215, + "step": 4358 + }, + { + "epoch": 0.96, + "grad_norm": 0.2646495776838115, + "learning_rate": 1.9017595380181442e-07, + "loss": 0.0318, + "step": 4359 + }, + { + "epoch": 0.96, + "grad_norm": 0.26166491168237466, + "learning_rate": 1.8822350310683246e-07, + "loss": 0.0251, + "step": 4360 + }, + { + "epoch": 0.96, + "grad_norm": 0.2060624071455964, + "learning_rate": 1.862810792733849e-07, + "loss": 0.0249, + "step": 4361 + }, + { + "epoch": 0.96, + "grad_norm": 0.2474567798364588, + "learning_rate": 1.843486832845409e-07, + "loss": 0.0254, + "step": 4362 + }, + { + "epoch": 0.96, + "grad_norm": 0.2384592704533461, + "learning_rate": 1.8242631611830263e-07, + "loss": 0.0279, + "step": 4363 + }, + { + "epoch": 0.96, + "grad_norm": 0.21230885870940985, + "learning_rate": 1.8051397874758736e-07, + "loss": 0.0316, + "step": 4364 + }, + { + "epoch": 0.96, + "grad_norm": 0.26554681524874807, + "learning_rate": 1.786116721402431e-07, + "loss": 0.0214, + "step": 4365 + }, + { + "epoch": 0.96, + "grad_norm": 0.20386617493254652, + "learning_rate": 1.7671939725903752e-07, + "loss": 0.0249, + "step": 4366 + }, + { + "epoch": 0.96, + "grad_norm": 0.23737255742813607, + "learning_rate": 1.7483715506166455e-07, + "loss": 0.0291, + "step": 4367 + }, + { + "epoch": 0.96, + "grad_norm": 0.19966449611016848, + "learning_rate": 1.729649465007377e-07, + "loss": 0.0269, + "step": 4368 + }, + { + "epoch": 0.96, + "grad_norm": 0.24499828752824235, + "learning_rate": 1.7110277252379238e-07, + "loss": 0.0312, + "step": 4369 + }, + { + "epoch": 0.96, + "grad_norm": 0.23564291367368878, + "learning_rate": 1.692506340732858e-07, + "loss": 0.0234, + "step": 4370 + }, + { + "epoch": 0.96, + "grad_norm": 0.2292955570735456, + "learning_rate": 1.6740853208659923e-07, + "loss": 0.0176, + "step": 4371 + }, + { + "epoch": 0.96, + "grad_norm": 0.3094994858065757, + "learning_rate": 1.655764674960292e-07, + "loss": 0.0357, + "step": 4372 + }, + { + "epoch": 0.96, + "grad_norm": 0.22058824064100482, + "learning_rate": 1.6375444122879613e-07, + "loss": 0.0266, + "step": 4373 + }, + { + "epoch": 0.96, + "grad_norm": 0.24158084937870627, + "learning_rate": 1.6194245420704025e-07, + "loss": 0.0243, + "step": 4374 + }, + { + "epoch": 0.96, + "grad_norm": 0.2617999847509731, + "learning_rate": 1.6014050734781461e-07, + "loss": 0.0193, + "step": 4375 + }, + { + "epoch": 0.96, + "grad_norm": 0.21384703310406136, + "learning_rate": 1.583486015630986e-07, + "loss": 0.0278, + "step": 4376 + }, + { + "epoch": 0.96, + "grad_norm": 0.2059478121015002, + "learning_rate": 1.565667377597868e-07, + "loss": 0.0222, + "step": 4377 + }, + { + "epoch": 0.96, + "grad_norm": 0.25593105797118465, + "learning_rate": 1.5479491683969117e-07, + "loss": 0.0291, + "step": 4378 + }, + { + "epoch": 0.96, + "grad_norm": 0.2607773603511524, + "learning_rate": 1.5303313969954103e-07, + "loss": 0.0239, + "step": 4379 + }, + { + "epoch": 0.96, + "grad_norm": 0.18820977971594646, + "learning_rate": 1.5128140723098317e-07, + "loss": 0.0193, + "step": 4380 + }, + { + "epoch": 0.96, + "grad_norm": 0.22737894441256667, + "learning_rate": 1.4953972032057952e-07, + "loss": 0.02, + "step": 4381 + }, + { + "epoch": 0.96, + "grad_norm": 0.22069038084251644, + "learning_rate": 1.4780807984980716e-07, + "loss": 0.0256, + "step": 4382 + }, + { + "epoch": 0.96, + "grad_norm": 0.2673301940370567, + "learning_rate": 1.4608648669506287e-07, + "loss": 0.0311, + "step": 4383 + }, + { + "epoch": 0.96, + "grad_norm": 0.21062860123586485, + "learning_rate": 1.443749417276541e-07, + "loss": 0.0309, + "step": 4384 + }, + { + "epoch": 0.96, + "grad_norm": 0.25730030158581574, + "learning_rate": 1.4267344581380127e-07, + "loss": 0.03, + "step": 4385 + }, + { + "epoch": 0.96, + "grad_norm": 0.23933088782605375, + "learning_rate": 1.4098199981464887e-07, + "loss": 0.0316, + "step": 4386 + }, + { + "epoch": 0.96, + "grad_norm": 0.27638496638288323, + "learning_rate": 1.3930060458624106e-07, + "loss": 0.0289, + "step": 4387 + }, + { + "epoch": 0.96, + "grad_norm": 0.231611256723867, + "learning_rate": 1.37629260979546e-07, + "loss": 0.0222, + "step": 4388 + }, + { + "epoch": 0.96, + "grad_norm": 0.21467912609032905, + "learning_rate": 1.3596796984044037e-07, + "loss": 0.0209, + "step": 4389 + }, + { + "epoch": 0.96, + "grad_norm": 0.21185405602366125, + "learning_rate": 1.3431673200971386e-07, + "loss": 0.0229, + "step": 4390 + }, + { + "epoch": 0.96, + "grad_norm": 0.31429962297338676, + "learning_rate": 1.3267554832306463e-07, + "loss": 0.0379, + "step": 4391 + }, + { + "epoch": 0.96, + "grad_norm": 0.25666941476925725, + "learning_rate": 1.310444196111127e-07, + "loss": 0.0329, + "step": 4392 + }, + { + "epoch": 0.96, + "grad_norm": 0.24319215788408693, + "learning_rate": 1.2942334669937773e-07, + "loss": 0.0285, + "step": 4393 + }, + { + "epoch": 0.97, + "grad_norm": 0.22563324469404789, + "learning_rate": 1.2781233040829234e-07, + "loss": 0.0186, + "step": 4394 + }, + { + "epoch": 0.97, + "grad_norm": 0.24097933441907754, + "learning_rate": 1.2621137155320872e-07, + "loss": 0.0315, + "step": 4395 + }, + { + "epoch": 0.97, + "grad_norm": 0.22624021660480337, + "learning_rate": 1.2462047094437657e-07, + "loss": 0.0185, + "step": 4396 + }, + { + "epoch": 0.97, + "grad_norm": 0.2228838276219386, + "learning_rate": 1.2303962938696068e-07, + "loss": 0.0244, + "step": 4397 + }, + { + "epoch": 0.97, + "grad_norm": 0.24666175686507735, + "learning_rate": 1.2146884768103883e-07, + "loss": 0.0245, + "step": 4398 + }, + { + "epoch": 0.97, + "grad_norm": 0.23184753427529117, + "learning_rate": 1.1990812662158846e-07, + "loss": 0.0288, + "step": 4399 + }, + { + "epoch": 0.97, + "grad_norm": 0.31526595551894643, + "learning_rate": 1.1835746699850215e-07, + "loss": 0.0262, + "step": 4400 + }, + { + "epoch": 0.97, + "grad_norm": 0.2519075219335554, + "learning_rate": 1.1681686959657879e-07, + "loss": 0.0254, + "step": 4401 + }, + { + "epoch": 0.97, + "grad_norm": 0.2899146535678704, + "learning_rate": 1.1528633519552357e-07, + "loss": 0.0271, + "step": 4402 + }, + { + "epoch": 0.97, + "grad_norm": 0.1967784891024214, + "learning_rate": 1.1376586456994798e-07, + "loss": 0.0216, + "step": 4403 + }, + { + "epoch": 0.97, + "grad_norm": 0.20469461639787942, + "learning_rate": 1.1225545848937203e-07, + "loss": 0.0271, + "step": 4404 + }, + { + "epoch": 0.97, + "grad_norm": 0.2547110347756392, + "learning_rate": 1.1075511771822423e-07, + "loss": 0.0289, + "step": 4405 + }, + { + "epoch": 0.97, + "grad_norm": 0.24842515021044465, + "learning_rate": 1.0926484301583273e-07, + "loss": 0.0342, + "step": 4406 + }, + { + "epoch": 0.97, + "grad_norm": 0.27344049724571035, + "learning_rate": 1.0778463513643645e-07, + "loss": 0.041, + "step": 4407 + }, + { + "epoch": 0.97, + "grad_norm": 0.2866099288819449, + "learning_rate": 1.0631449482917833e-07, + "loss": 0.0293, + "step": 4408 + }, + { + "epoch": 0.97, + "grad_norm": 0.2351114860967714, + "learning_rate": 1.0485442283810321e-07, + "loss": 0.0276, + "step": 4409 + }, + { + "epoch": 0.97, + "grad_norm": 0.21697494136147136, + "learning_rate": 1.0340441990216443e-07, + "loss": 0.022, + "step": 4410 + }, + { + "epoch": 0.97, + "grad_norm": 0.21252921688956675, + "learning_rate": 1.019644867552172e-07, + "loss": 0.024, + "step": 4411 + }, + { + "epoch": 0.97, + "grad_norm": 0.23640651821365824, + "learning_rate": 1.0053462412601855e-07, + "loss": 0.0249, + "step": 4412 + }, + { + "epoch": 0.97, + "grad_norm": 0.20857305245457847, + "learning_rate": 9.911483273823408e-08, + "loss": 0.021, + "step": 4413 + }, + { + "epoch": 0.97, + "grad_norm": 0.23242865480423908, + "learning_rate": 9.770511331042454e-08, + "loss": 0.0347, + "step": 4414 + }, + { + "epoch": 0.97, + "grad_norm": 0.2642728949258274, + "learning_rate": 9.630546655606365e-08, + "loss": 0.0216, + "step": 4415 + }, + { + "epoch": 0.97, + "grad_norm": 0.28584540713271855, + "learning_rate": 9.491589318351368e-08, + "loss": 0.0381, + "step": 4416 + }, + { + "epoch": 0.97, + "grad_norm": 0.19898143358756218, + "learning_rate": 9.353639389605207e-08, + "loss": 0.0191, + "step": 4417 + }, + { + "epoch": 0.97, + "grad_norm": 0.23981266690114933, + "learning_rate": 9.216696939184922e-08, + "loss": 0.0248, + "step": 4418 + }, + { + "epoch": 0.97, + "grad_norm": 0.2955660826219277, + "learning_rate": 9.080762036398184e-08, + "loss": 0.0298, + "step": 4419 + }, + { + "epoch": 0.97, + "grad_norm": 0.27698898459903903, + "learning_rate": 8.94583475004196e-08, + "loss": 0.0337, + "step": 4420 + }, + { + "epoch": 0.97, + "grad_norm": 0.2443484680475507, + "learning_rate": 8.811915148404294e-08, + "loss": 0.03, + "step": 4421 + }, + { + "epoch": 0.97, + "grad_norm": 0.20734399888944313, + "learning_rate": 8.679003299262523e-08, + "loss": 0.0265, + "step": 4422 + }, + { + "epoch": 0.97, + "grad_norm": 0.2711587228885377, + "learning_rate": 8.547099269884396e-08, + "loss": 0.029, + "step": 4423 + }, + { + "epoch": 0.97, + "grad_norm": 0.22715330372892692, + "learning_rate": 8.416203127026734e-08, + "loss": 0.0272, + "step": 4424 + }, + { + "epoch": 0.97, + "grad_norm": 0.18137627510868737, + "learning_rate": 8.286314936937434e-08, + "loss": 0.0207, + "step": 4425 + }, + { + "epoch": 0.97, + "grad_norm": 0.25726488504265727, + "learning_rate": 8.157434765353466e-08, + "loss": 0.0222, + "step": 4426 + }, + { + "epoch": 0.97, + "grad_norm": 0.22919501685576507, + "learning_rate": 8.029562677502212e-08, + "loss": 0.0216, + "step": 4427 + }, + { + "epoch": 0.97, + "grad_norm": 0.22722395988706445, + "learning_rate": 7.902698738099901e-08, + "loss": 0.0251, + "step": 4428 + }, + { + "epoch": 0.97, + "grad_norm": 0.25666707801063404, + "learning_rate": 7.776843011353619e-08, + "loss": 0.0306, + "step": 4429 + }, + { + "epoch": 0.97, + "grad_norm": 0.26100535452765306, + "learning_rate": 7.651995560959525e-08, + "loss": 0.0265, + "step": 4430 + }, + { + "epoch": 0.97, + "grad_norm": 0.21609393419766798, + "learning_rate": 7.528156450103963e-08, + "loss": 0.0225, + "step": 4431 + }, + { + "epoch": 0.97, + "grad_norm": 0.21134781666393448, + "learning_rate": 7.405325741462354e-08, + "loss": 0.0201, + "step": 4432 + }, + { + "epoch": 0.97, + "grad_norm": 0.17877245193474503, + "learning_rate": 7.283503497200083e-08, + "loss": 0.0137, + "step": 4433 + }, + { + "epoch": 0.97, + "grad_norm": 0.22853090773120874, + "learning_rate": 7.162689778972276e-08, + "loss": 0.0273, + "step": 4434 + }, + { + "epoch": 0.97, + "grad_norm": 0.2219781169228068, + "learning_rate": 7.042884647923353e-08, + "loss": 0.0245, + "step": 4435 + }, + { + "epoch": 0.97, + "grad_norm": 0.21269905150098456, + "learning_rate": 6.924088164687703e-08, + "loss": 0.0239, + "step": 4436 + }, + { + "epoch": 0.97, + "grad_norm": 0.2551449125986645, + "learning_rate": 6.806300389388565e-08, + "loss": 0.0367, + "step": 4437 + }, + { + "epoch": 0.97, + "grad_norm": 0.24152648610438776, + "learning_rate": 6.689521381639363e-08, + "loss": 0.0222, + "step": 4438 + }, + { + "epoch": 0.97, + "grad_norm": 0.2695999196245218, + "learning_rate": 6.573751200542599e-08, + "loss": 0.0194, + "step": 4439 + }, + { + "epoch": 0.98, + "grad_norm": 0.18773310793839185, + "learning_rate": 6.458989904690072e-08, + "loss": 0.0177, + "step": 4440 + }, + { + "epoch": 0.98, + "grad_norm": 0.3021901684419358, + "learning_rate": 6.345237552163541e-08, + "loss": 0.026, + "step": 4441 + }, + { + "epoch": 0.98, + "grad_norm": 0.16928288465784194, + "learning_rate": 6.232494200533623e-08, + "loss": 0.0136, + "step": 4442 + }, + { + "epoch": 0.98, + "grad_norm": 0.3627623677343884, + "learning_rate": 6.120759906860008e-08, + "loss": 0.0468, + "step": 4443 + }, + { + "epoch": 0.98, + "grad_norm": 0.2693787295266967, + "learning_rate": 6.010034727692792e-08, + "loss": 0.0301, + "step": 4444 + }, + { + "epoch": 0.98, + "grad_norm": 0.2592147780396339, + "learning_rate": 5.900318719070264e-08, + "loss": 0.0259, + "step": 4445 + }, + { + "epoch": 0.98, + "grad_norm": 0.3031344323804535, + "learning_rate": 5.791611936520447e-08, + "loss": 0.0305, + "step": 4446 + }, + { + "epoch": 0.98, + "grad_norm": 0.227992224592611, + "learning_rate": 5.683914435060445e-08, + "loss": 0.0258, + "step": 4447 + }, + { + "epoch": 0.98, + "grad_norm": 0.19873245575682397, + "learning_rate": 5.577226269196656e-08, + "loss": 0.0151, + "step": 4448 + }, + { + "epoch": 0.98, + "grad_norm": 0.2479018741222908, + "learning_rate": 5.471547492924778e-08, + "loss": 0.0242, + "step": 4449 + }, + { + "epoch": 0.98, + "grad_norm": 0.2322767429661271, + "learning_rate": 5.3668781597291386e-08, + "loss": 0.0269, + "step": 4450 + }, + { + "epoch": 0.98, + "grad_norm": 0.21112779581354238, + "learning_rate": 5.263218322584029e-08, + "loss": 0.0245, + "step": 4451 + }, + { + "epoch": 0.98, + "grad_norm": 0.23287870886567938, + "learning_rate": 5.160568033951929e-08, + "loss": 0.0309, + "step": 4452 + }, + { + "epoch": 0.98, + "grad_norm": 0.2579778516701443, + "learning_rate": 5.058927345784836e-08, + "loss": 0.0317, + "step": 4453 + }, + { + "epoch": 0.98, + "grad_norm": 0.2629862737455073, + "learning_rate": 4.9582963095238247e-08, + "loss": 0.0267, + "step": 4454 + }, + { + "epoch": 0.98, + "grad_norm": 0.2353377120505107, + "learning_rate": 4.8586749760985987e-08, + "loss": 0.0295, + "step": 4455 + }, + { + "epoch": 0.98, + "grad_norm": 0.26850899047468857, + "learning_rate": 4.7600633959286044e-08, + "loss": 0.0267, + "step": 4456 + }, + { + "epoch": 0.98, + "grad_norm": 0.25478691547878657, + "learning_rate": 4.6624616189214765e-08, + "loss": 0.025, + "step": 4457 + }, + { + "epoch": 0.98, + "grad_norm": 0.25562566901681366, + "learning_rate": 4.565869694474367e-08, + "loss": 0.0298, + "step": 4458 + }, + { + "epoch": 0.98, + "grad_norm": 0.2124237838973614, + "learning_rate": 4.470287671472395e-08, + "loss": 0.0176, + "step": 4459 + }, + { + "epoch": 0.98, + "grad_norm": 0.24257266273430095, + "learning_rate": 4.375715598290864e-08, + "loss": 0.0304, + "step": 4460 + }, + { + "epoch": 0.98, + "grad_norm": 0.2193001317922933, + "learning_rate": 4.2821535227930424e-08, + "loss": 0.0226, + "step": 4461 + }, + { + "epoch": 0.98, + "grad_norm": 0.23471578618040925, + "learning_rate": 4.1896014923310525e-08, + "loss": 0.0235, + "step": 4462 + }, + { + "epoch": 0.98, + "grad_norm": 0.17518535913199723, + "learning_rate": 4.098059553746536e-08, + "loss": 0.0186, + "step": 4463 + }, + { + "epoch": 0.98, + "grad_norm": 0.22191256420057429, + "learning_rate": 4.0075277533688784e-08, + "loss": 0.0261, + "step": 4464 + }, + { + "epoch": 0.98, + "grad_norm": 0.26132643888696494, + "learning_rate": 3.918006137017205e-08, + "loss": 0.0243, + "step": 4465 + }, + { + "epoch": 0.98, + "grad_norm": 0.18770673401504426, + "learning_rate": 3.829494749998608e-08, + "loss": 0.0165, + "step": 4466 + }, + { + "epoch": 0.98, + "grad_norm": 0.20540545543875058, + "learning_rate": 3.7419936371094756e-08, + "loss": 0.0281, + "step": 4467 + }, + { + "epoch": 0.98, + "grad_norm": 0.22627322869373032, + "learning_rate": 3.655502842634606e-08, + "loss": 0.039, + "step": 4468 + }, + { + "epoch": 0.98, + "grad_norm": 0.23842989084493904, + "learning_rate": 3.570022410347651e-08, + "loss": 0.0272, + "step": 4469 + }, + { + "epoch": 0.98, + "grad_norm": 0.2099454474834303, + "learning_rate": 3.485552383510671e-08, + "loss": 0.0195, + "step": 4470 + }, + { + "epoch": 0.98, + "grad_norm": 0.21160685337741267, + "learning_rate": 3.402092804874357e-08, + "loss": 0.0223, + "step": 4471 + }, + { + "epoch": 0.98, + "grad_norm": 0.22340505571162192, + "learning_rate": 3.3196437166780336e-08, + "loss": 0.0287, + "step": 4472 + }, + { + "epoch": 0.98, + "grad_norm": 0.19631909382666662, + "learning_rate": 3.2382051606500986e-08, + "loss": 0.0196, + "step": 4473 + }, + { + "epoch": 0.98, + "grad_norm": 0.2376322898599393, + "learning_rate": 3.1577771780066936e-08, + "loss": 0.0277, + "step": 4474 + }, + { + "epoch": 0.98, + "grad_norm": 0.2635491827444607, + "learning_rate": 3.078359809453257e-08, + "loss": 0.0348, + "step": 4475 + }, + { + "epoch": 0.98, + "grad_norm": 0.18508521419488638, + "learning_rate": 2.999953095182972e-08, + "loss": 0.0123, + "step": 4476 + }, + { + "epoch": 0.98, + "grad_norm": 0.2867455805144428, + "learning_rate": 2.9225570748785402e-08, + "loss": 0.0334, + "step": 4477 + }, + { + "epoch": 0.98, + "grad_norm": 0.21363303896832175, + "learning_rate": 2.8461717877099615e-08, + "loss": 0.0207, + "step": 4478 + }, + { + "epoch": 0.98, + "grad_norm": 0.21129233926357469, + "learning_rate": 2.770797272336756e-08, + "loss": 0.0248, + "step": 4479 + }, + { + "epoch": 0.98, + "grad_norm": 0.2801433088352883, + "learning_rate": 2.696433566905965e-08, + "loss": 0.0351, + "step": 4480 + }, + { + "epoch": 0.98, + "grad_norm": 0.2532927363554509, + "learning_rate": 2.623080709054149e-08, + "loss": 0.0282, + "step": 4481 + }, + { + "epoch": 0.98, + "grad_norm": 0.2864289817800324, + "learning_rate": 2.550738735905167e-08, + "loss": 0.0324, + "step": 4482 + }, + { + "epoch": 0.98, + "grad_norm": 0.2373376988185147, + "learning_rate": 2.479407684071733e-08, + "loss": 0.0258, + "step": 4483 + }, + { + "epoch": 0.98, + "grad_norm": 0.2127057068581972, + "learning_rate": 2.4090875896551903e-08, + "loss": 0.0177, + "step": 4484 + }, + { + "epoch": 0.99, + "grad_norm": 0.23692122823597622, + "learning_rate": 2.3397784882448483e-08, + "loss": 0.0299, + "step": 4485 + }, + { + "epoch": 0.99, + "grad_norm": 0.20988079581968236, + "learning_rate": 2.2714804149184256e-08, + "loss": 0.0262, + "step": 4486 + }, + { + "epoch": 0.99, + "grad_norm": 0.23558350368400716, + "learning_rate": 2.2041934042420497e-08, + "loss": 0.0209, + "step": 4487 + }, + { + "epoch": 0.99, + "grad_norm": 0.2822744071559274, + "learning_rate": 2.137917490269814e-08, + "loss": 0.031, + "step": 4488 + }, + { + "epoch": 0.99, + "grad_norm": 0.32218783046509364, + "learning_rate": 2.0726527065448865e-08, + "loss": 0.0352, + "step": 4489 + }, + { + "epoch": 0.99, + "grad_norm": 0.2496145301608871, + "learning_rate": 2.0083990860977343e-08, + "loss": 0.033, + "step": 4490 + }, + { + "epoch": 0.99, + "grad_norm": 0.2534664306167664, + "learning_rate": 1.9451566614479e-08, + "loss": 0.0314, + "step": 4491 + }, + { + "epoch": 0.99, + "grad_norm": 0.21548093461183634, + "learning_rate": 1.8829254646022256e-08, + "loss": 0.0247, + "step": 4492 + }, + { + "epoch": 0.99, + "grad_norm": 0.2361792103104299, + "learning_rate": 1.8217055270568497e-08, + "loss": 0.0211, + "step": 4493 + }, + { + "epoch": 0.99, + "grad_norm": 0.2320042231582333, + "learning_rate": 1.7614968797952102e-08, + "loss": 0.0239, + "step": 4494 + }, + { + "epoch": 0.99, + "grad_norm": 0.2202331890576311, + "learning_rate": 1.702299553289377e-08, + "loss": 0.0336, + "step": 4495 + }, + { + "epoch": 0.99, + "grad_norm": 0.23760916560119846, + "learning_rate": 1.6441135774996066e-08, + "loss": 0.0317, + "step": 4496 + }, + { + "epoch": 0.99, + "grad_norm": 0.29437743706311953, + "learning_rate": 1.586938981873898e-08, + "loss": 0.0276, + "step": 4497 + }, + { + "epoch": 0.99, + "grad_norm": 0.18441166189966846, + "learning_rate": 1.530775795348882e-08, + "loss": 0.0143, + "step": 4498 + }, + { + "epoch": 0.99, + "grad_norm": 0.23832741274571254, + "learning_rate": 1.4756240463491555e-08, + "loss": 0.0232, + "step": 4499 + }, + { + "epoch": 0.99, + "grad_norm": 0.2247094470461646, + "learning_rate": 1.421483762787057e-08, + "loss": 0.0274, + "step": 4500 + }, + { + "epoch": 0.99, + "grad_norm": 0.3082288685382838, + "learning_rate": 1.368354972063557e-08, + "loss": 0.0349, + "step": 4501 + }, + { + "epoch": 0.99, + "grad_norm": 0.19854662215208568, + "learning_rate": 1.3162377010673689e-08, + "loss": 0.027, + "step": 4502 + }, + { + "epoch": 0.99, + "grad_norm": 0.31196752721804216, + "learning_rate": 1.2651319761753933e-08, + "loss": 0.0464, + "step": 4503 + }, + { + "epoch": 0.99, + "grad_norm": 0.30917682369252936, + "learning_rate": 1.2150378232527183e-08, + "loss": 0.0323, + "step": 4504 + }, + { + "epoch": 0.99, + "grad_norm": 0.22311947137196755, + "learning_rate": 1.1659552676519525e-08, + "loss": 0.0258, + "step": 4505 + }, + { + "epoch": 0.99, + "grad_norm": 0.2824374931944922, + "learning_rate": 1.1178843342143363e-08, + "loss": 0.03, + "step": 4506 + }, + { + "epoch": 0.99, + "grad_norm": 0.2217890767832452, + "learning_rate": 1.070825047268631e-08, + "loss": 0.0223, + "step": 4507 + }, + { + "epoch": 0.99, + "grad_norm": 0.2394927603447689, + "learning_rate": 1.024777430632229e-08, + "loss": 0.0262, + "step": 4508 + }, + { + "epoch": 0.99, + "grad_norm": 0.30117619581870964, + "learning_rate": 9.797415076095996e-09, + "loss": 0.0328, + "step": 4509 + }, + { + "epoch": 0.99, + "grad_norm": 0.28627625082660674, + "learning_rate": 9.357173009942878e-09, + "loss": 0.0275, + "step": 4510 + }, + { + "epoch": 0.99, + "grad_norm": 0.20651706686452403, + "learning_rate": 8.927048330666932e-09, + "loss": 0.0214, + "step": 4511 + }, + { + "epoch": 0.99, + "grad_norm": 0.2013347207359353, + "learning_rate": 8.50704125595847e-09, + "loss": 0.0292, + "step": 4512 + }, + { + "epoch": 0.99, + "grad_norm": 0.24912173257265088, + "learning_rate": 8.097151998387453e-09, + "loss": 0.0236, + "step": 4513 + }, + { + "epoch": 0.99, + "grad_norm": 0.2045073513613524, + "learning_rate": 7.697380765399053e-09, + "loss": 0.0181, + "step": 4514 + }, + { + "epoch": 0.99, + "grad_norm": 0.2715851713271294, + "learning_rate": 7.3077277593203155e-09, + "loss": 0.0358, + "step": 4515 + }, + { + "epoch": 0.99, + "grad_norm": 0.24233401389028633, + "learning_rate": 6.928193177360154e-09, + "loss": 0.0288, + "step": 4516 + }, + { + "epoch": 0.99, + "grad_norm": 0.2407937244795581, + "learning_rate": 6.558777211598255e-09, + "loss": 0.0264, + "step": 4517 + }, + { + "epoch": 0.99, + "grad_norm": 0.27028214168412523, + "learning_rate": 6.199480049000617e-09, + "loss": 0.0313, + "step": 4518 + }, + { + "epoch": 0.99, + "grad_norm": 0.22730838559569844, + "learning_rate": 5.850301871410668e-09, + "loss": 0.0315, + "step": 4519 + }, + { + "epoch": 0.99, + "grad_norm": 0.22956373032952396, + "learning_rate": 5.51124285554927e-09, + "loss": 0.0258, + "step": 4520 + }, + { + "epoch": 0.99, + "grad_norm": 0.22859658331428653, + "learning_rate": 5.182303173016934e-09, + "loss": 0.0302, + "step": 4521 + }, + { + "epoch": 0.99, + "grad_norm": 0.20019196298169717, + "learning_rate": 4.8634829902893806e-09, + "loss": 0.02, + "step": 4522 + }, + { + "epoch": 0.99, + "grad_norm": 0.256797976281126, + "learning_rate": 4.554782468726426e-09, + "loss": 0.0336, + "step": 4523 + }, + { + "epoch": 0.99, + "grad_norm": 0.24382401345259824, + "learning_rate": 4.256201764560874e-09, + "loss": 0.0276, + "step": 4524 + }, + { + "epoch": 0.99, + "grad_norm": 0.26355353631491457, + "learning_rate": 3.967741028907401e-09, + "loss": 0.0321, + "step": 4525 + }, + { + "epoch": 0.99, + "grad_norm": 0.236170992355893, + "learning_rate": 3.6894004077558942e-09, + "loss": 0.0261, + "step": 4526 + }, + { + "epoch": 0.99, + "grad_norm": 0.21118369570046402, + "learning_rate": 3.421180041980332e-09, + "loss": 0.026, + "step": 4527 + }, + { + "epoch": 0.99, + "grad_norm": 0.215920213733391, + "learning_rate": 3.1630800673254636e-09, + "loss": 0.0233, + "step": 4528 + }, + { + "epoch": 0.99, + "grad_norm": 0.341207246372847, + "learning_rate": 2.9151006144201298e-09, + "loss": 0.0292, + "step": 4529 + }, + { + "epoch": 0.99, + "grad_norm": 0.27762589408390465, + "learning_rate": 2.6772418087639417e-09, + "loss": 0.0251, + "step": 4530 + }, + { + "epoch": 1.0, + "grad_norm": 0.19320572422128685, + "learning_rate": 2.4495037707428226e-09, + "loss": 0.0163, + "step": 4531 + }, + { + "epoch": 1.0, + "grad_norm": 0.26428603242243764, + "learning_rate": 2.2318866156134654e-09, + "loss": 0.0248, + "step": 4532 + }, + { + "epoch": 1.0, + "grad_norm": 0.33253742231504263, + "learning_rate": 2.0243904535144353e-09, + "loss": 0.0453, + "step": 4533 + }, + { + "epoch": 1.0, + "grad_norm": 0.22122698196730783, + "learning_rate": 1.8270153894617282e-09, + "loss": 0.0256, + "step": 4534 + }, + { + "epoch": 1.0, + "grad_norm": 0.23089916641515254, + "learning_rate": 1.6397615233465503e-09, + "loss": 0.0264, + "step": 4535 + }, + { + "epoch": 1.0, + "grad_norm": 0.22087355590784324, + "learning_rate": 1.4626289499397596e-09, + "loss": 0.0225, + "step": 4536 + }, + { + "epoch": 1.0, + "grad_norm": 0.24501147023180128, + "learning_rate": 1.2956177588896445e-09, + "loss": 0.0271, + "step": 4537 + }, + { + "epoch": 1.0, + "grad_norm": 0.23313726392912676, + "learning_rate": 1.138728034719705e-09, + "loss": 0.0281, + "step": 4538 + }, + { + "epoch": 1.0, + "grad_norm": 0.24477027569640544, + "learning_rate": 9.919598568353118e-10, + "loss": 0.0282, + "step": 4539 + }, + { + "epoch": 1.0, + "grad_norm": 0.29386165607098685, + "learning_rate": 8.553132995170466e-10, + "loss": 0.0264, + "step": 4540 + }, + { + "epoch": 1.0, + "grad_norm": 0.36618149702624436, + "learning_rate": 7.287884319184813e-10, + "loss": 0.0326, + "step": 4541 + }, + { + "epoch": 1.0, + "grad_norm": 0.28540606856102807, + "learning_rate": 6.123853180795003e-10, + "loss": 0.0362, + "step": 4542 + }, + { + "epoch": 1.0, + "grad_norm": 0.20529983887905334, + "learning_rate": 5.061040169107578e-10, + "loss": 0.0167, + "step": 4543 + }, + { + "epoch": 1.0, + "grad_norm": 0.2795280753393063, + "learning_rate": 4.0994458220033896e-10, + "loss": 0.0294, + "step": 4544 + }, + { + "epoch": 1.0, + "grad_norm": 0.2661258659536841, + "learning_rate": 3.2390706261598015e-10, + "loss": 0.0309, + "step": 4545 + }, + { + "epoch": 1.0, + "grad_norm": 0.2145268096519866, + "learning_rate": 2.479915017028489e-10, + "loss": 0.0283, + "step": 4546 + }, + { + "epoch": 1.0, + "grad_norm": 0.18357432039405988, + "learning_rate": 1.8219793788132322e-10, + "loss": 0.0201, + "step": 4547 + }, + { + "epoch": 1.0, + "grad_norm": 0.2166040353315691, + "learning_rate": 1.265264044514325e-10, + "loss": 0.0244, + "step": 4548 + }, + { + "epoch": 1.0, + "grad_norm": 0.23382594647015617, + "learning_rate": 8.097692958619619e-11, + "loss": 0.0268, + "step": 4549 + }, + { + "epoch": 1.0, + "grad_norm": 0.24069606796661666, + "learning_rate": 4.5549536340505627e-11, + "loss": 0.0201, + "step": 4550 + }, + { + "epoch": 1.0, + "grad_norm": 0.22354764024397278, + "learning_rate": 2.024424264224223e-11, + "loss": 0.0247, + "step": 4551 + }, + { + "epoch": 1.0, + "grad_norm": 0.22004441054170246, + "learning_rate": 5.061061301159242e-12, + "loss": 0.0244, + "step": 4552 + }, + { + "epoch": 1.0, + "grad_norm": 0.27832210607118085, + "learning_rate": 0.0, + "loss": 0.0444, + "step": 4553 + }, + { + "epoch": 1.0, + "step": 4553, + "total_flos": 1.7101442093126403e+22, + "train_loss": 0.1402213812993695, + "train_runtime": 25492.8985, + "train_samples_per_second": 11.43, + "train_steps_per_second": 0.179 + } + ], + "logging_steps": 1.0, + "max_steps": 4553, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "total_flos": 1.7101442093126403e+22, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}