diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,88396 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999603881956823, + "eval_steps": 500, + "global_step": 12622, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 7.922360863537334e-05, + "grad_norm": 31.868705434213155, + "learning_rate": 5.277044854881267e-08, + "loss": 2.4055, + "step": 1 + }, + { + "epoch": 0.00015844721727074668, + "grad_norm": 33.35421203082402, + "learning_rate": 1.0554089709762534e-07, + "loss": 2.229, + "step": 2 + }, + { + "epoch": 0.00023767082590612002, + "grad_norm": 30.905772260462495, + "learning_rate": 1.5831134564643802e-07, + "loss": 2.5515, + "step": 3 + }, + { + "epoch": 0.00031689443454149336, + "grad_norm": 31.82938654107664, + "learning_rate": 2.1108179419525068e-07, + "loss": 2.4607, + "step": 4 + }, + { + "epoch": 0.0003961180431768667, + "grad_norm": 33.89602990583608, + "learning_rate": 2.6385224274406334e-07, + "loss": 2.7529, + "step": 5 + }, + { + "epoch": 0.00047534165181224003, + "grad_norm": 30.444966472962584, + "learning_rate": 3.1662269129287605e-07, + "loss": 2.5701, + "step": 6 + }, + { + "epoch": 0.0005545652604476134, + "grad_norm": 28.601879017937765, + "learning_rate": 3.693931398416887e-07, + "loss": 2.3982, + "step": 7 + }, + { + "epoch": 0.0006337888690829867, + "grad_norm": 28.96128266138133, + "learning_rate": 4.2216358839050136e-07, + "loss": 2.4097, + "step": 8 + }, + { + "epoch": 0.00071301247771836, + "grad_norm": 34.74863678489119, + "learning_rate": 4.7493403693931397e-07, + "loss": 2.4995, + "step": 9 + }, + { + "epoch": 0.0007922360863537334, + "grad_norm": 27.248425019992244, + "learning_rate": 5.277044854881267e-07, + "loss": 2.4025, + "step": 10 + }, + { + "epoch": 0.0008714596949891067, + "grad_norm": 26.19048251774951, + "learning_rate": 5.804749340369393e-07, + "loss": 2.4338, + "step": 11 + }, + { + "epoch": 0.0009506833036244801, + "grad_norm": 29.591611096105435, + "learning_rate": 6.332453825857521e-07, + "loss": 2.4309, + "step": 12 + }, + { + "epoch": 0.0010299069122598535, + "grad_norm": 28.882358214389345, + "learning_rate": 6.860158311345646e-07, + "loss": 2.2793, + "step": 13 + }, + { + "epoch": 0.0011091305208952267, + "grad_norm": 28.514388351231172, + "learning_rate": 7.387862796833774e-07, + "loss": 2.2069, + "step": 14 + }, + { + "epoch": 0.0011883541295306002, + "grad_norm": 29.394819635818916, + "learning_rate": 7.915567282321901e-07, + "loss": 2.2789, + "step": 15 + }, + { + "epoch": 0.0012675777381659734, + "grad_norm": 28.82636668537433, + "learning_rate": 8.443271767810027e-07, + "loss": 2.1414, + "step": 16 + }, + { + "epoch": 0.0013468013468013469, + "grad_norm": 27.172986033880207, + "learning_rate": 8.970976253298154e-07, + "loss": 2.0737, + "step": 17 + }, + { + "epoch": 0.00142602495543672, + "grad_norm": 28.9492321515514, + "learning_rate": 9.498680738786279e-07, + "loss": 2.0167, + "step": 18 + }, + { + "epoch": 0.0015052485640720936, + "grad_norm": 17.255700208930268, + "learning_rate": 1.0026385224274407e-06, + "loss": 1.6937, + "step": 19 + }, + { + "epoch": 0.0015844721727074668, + "grad_norm": 13.912069084573742, + "learning_rate": 1.0554089709762534e-06, + "loss": 1.7679, + "step": 20 + }, + { + "epoch": 0.0016636957813428402, + "grad_norm": 14.66426105865668, + "learning_rate": 1.108179419525066e-06, + "loss": 1.7548, + "step": 21 + }, + { + "epoch": 0.0017429193899782135, + "grad_norm": 14.210612712641286, + "learning_rate": 1.1609498680738787e-06, + "loss": 1.7754, + "step": 22 + }, + { + "epoch": 0.001822142998613587, + "grad_norm": 18.649014993687306, + "learning_rate": 1.2137203166226915e-06, + "loss": 1.8135, + "step": 23 + }, + { + "epoch": 0.0019013666072489601, + "grad_norm": 16.151271281666702, + "learning_rate": 1.2664907651715042e-06, + "loss": 1.6643, + "step": 24 + }, + { + "epoch": 0.0019805902158843334, + "grad_norm": 21.06738298338953, + "learning_rate": 1.3192612137203166e-06, + "loss": 1.8694, + "step": 25 + }, + { + "epoch": 0.002059813824519707, + "grad_norm": 15.419672041356916, + "learning_rate": 1.3720316622691293e-06, + "loss": 1.5757, + "step": 26 + }, + { + "epoch": 0.0021390374331550803, + "grad_norm": 15.600724016035702, + "learning_rate": 1.4248021108179422e-06, + "loss": 1.553, + "step": 27 + }, + { + "epoch": 0.0022182610417904535, + "grad_norm": 12.229878834112258, + "learning_rate": 1.4775725593667548e-06, + "loss": 1.4652, + "step": 28 + }, + { + "epoch": 0.0022974846504258267, + "grad_norm": 10.225961851553592, + "learning_rate": 1.5303430079155673e-06, + "loss": 1.4839, + "step": 29 + }, + { + "epoch": 0.0023767082590612004, + "grad_norm": 12.164324439005954, + "learning_rate": 1.5831134564643801e-06, + "loss": 1.3534, + "step": 30 + }, + { + "epoch": 0.0024559318676965736, + "grad_norm": 11.063575560268422, + "learning_rate": 1.6358839050131928e-06, + "loss": 1.4274, + "step": 31 + }, + { + "epoch": 0.002535155476331947, + "grad_norm": 9.73448058354835, + "learning_rate": 1.6886543535620054e-06, + "loss": 1.4187, + "step": 32 + }, + { + "epoch": 0.00261437908496732, + "grad_norm": 8.730204246763352, + "learning_rate": 1.7414248021108183e-06, + "loss": 1.1519, + "step": 33 + }, + { + "epoch": 0.0026936026936026937, + "grad_norm": 6.924881468158276, + "learning_rate": 1.7941952506596308e-06, + "loss": 1.2454, + "step": 34 + }, + { + "epoch": 0.002772826302238067, + "grad_norm": 6.629579400630702, + "learning_rate": 1.8469656992084434e-06, + "loss": 1.2072, + "step": 35 + }, + { + "epoch": 0.00285204991087344, + "grad_norm": 6.657573499106914, + "learning_rate": 1.8997361477572559e-06, + "loss": 1.046, + "step": 36 + }, + { + "epoch": 0.0029312735195088134, + "grad_norm": 8.025208743375664, + "learning_rate": 1.9525065963060687e-06, + "loss": 1.2457, + "step": 37 + }, + { + "epoch": 0.003010497128144187, + "grad_norm": 6.679507832590588, + "learning_rate": 2.0052770448548814e-06, + "loss": 1.2994, + "step": 38 + }, + { + "epoch": 0.0030897207367795603, + "grad_norm": 6.448298994490477, + "learning_rate": 2.058047493403694e-06, + "loss": 1.2889, + "step": 39 + }, + { + "epoch": 0.0031689443454149336, + "grad_norm": 6.8153830133577005, + "learning_rate": 2.1108179419525067e-06, + "loss": 1.142, + "step": 40 + }, + { + "epoch": 0.003248167954050307, + "grad_norm": 5.655776676536306, + "learning_rate": 2.1635883905013194e-06, + "loss": 1.1317, + "step": 41 + }, + { + "epoch": 0.0033273915626856805, + "grad_norm": 7.634046937784093, + "learning_rate": 2.216358839050132e-06, + "loss": 0.9966, + "step": 42 + }, + { + "epoch": 0.0034066151713210537, + "grad_norm": 6.154130288982867, + "learning_rate": 2.2691292875989447e-06, + "loss": 1.0714, + "step": 43 + }, + { + "epoch": 0.003485838779956427, + "grad_norm": 6.599696603402416, + "learning_rate": 2.3218997361477573e-06, + "loss": 1.1753, + "step": 44 + }, + { + "epoch": 0.0035650623885918, + "grad_norm": 6.1045473837052135, + "learning_rate": 2.37467018469657e-06, + "loss": 1.0945, + "step": 45 + }, + { + "epoch": 0.003644285997227174, + "grad_norm": 6.639820010209079, + "learning_rate": 2.427440633245383e-06, + "loss": 1.2281, + "step": 46 + }, + { + "epoch": 0.003723509605862547, + "grad_norm": 6.158858746048843, + "learning_rate": 2.4802110817941953e-06, + "loss": 1.1257, + "step": 47 + }, + { + "epoch": 0.0038027332144979203, + "grad_norm": 6.448066336502261, + "learning_rate": 2.5329815303430084e-06, + "loss": 1.1324, + "step": 48 + }, + { + "epoch": 0.0038819568231332935, + "grad_norm": 6.262066262387423, + "learning_rate": 2.5857519788918206e-06, + "loss": 1.1343, + "step": 49 + }, + { + "epoch": 0.003961180431768667, + "grad_norm": 5.243397804131298, + "learning_rate": 2.6385224274406333e-06, + "loss": 0.9142, + "step": 50 + }, + { + "epoch": 0.00404040404040404, + "grad_norm": 5.967104448204892, + "learning_rate": 2.6912928759894464e-06, + "loss": 1.0485, + "step": 51 + }, + { + "epoch": 0.004119627649039414, + "grad_norm": 5.66949631539659, + "learning_rate": 2.7440633245382586e-06, + "loss": 1.1379, + "step": 52 + }, + { + "epoch": 0.004198851257674787, + "grad_norm": 5.474294930789325, + "learning_rate": 2.7968337730870717e-06, + "loss": 1.0624, + "step": 53 + }, + { + "epoch": 0.0042780748663101605, + "grad_norm": 4.8417338949953255, + "learning_rate": 2.8496042216358843e-06, + "loss": 1.1212, + "step": 54 + }, + { + "epoch": 0.004357298474945534, + "grad_norm": 5.087923167229228, + "learning_rate": 2.9023746701846966e-06, + "loss": 0.9097, + "step": 55 + }, + { + "epoch": 0.004436522083580907, + "grad_norm": 5.316728911094931, + "learning_rate": 2.9551451187335096e-06, + "loss": 0.9949, + "step": 56 + }, + { + "epoch": 0.004515745692216281, + "grad_norm": 5.218219318853345, + "learning_rate": 3.0079155672823223e-06, + "loss": 0.9964, + "step": 57 + }, + { + "epoch": 0.0045949693008516534, + "grad_norm": 5.0674909088177635, + "learning_rate": 3.0606860158311345e-06, + "loss": 0.9277, + "step": 58 + }, + { + "epoch": 0.004674192909487027, + "grad_norm": 5.4550208422255535, + "learning_rate": 3.1134564643799476e-06, + "loss": 0.9852, + "step": 59 + }, + { + "epoch": 0.004753416518122401, + "grad_norm": 5.001551321481967, + "learning_rate": 3.1662269129287603e-06, + "loss": 0.9319, + "step": 60 + }, + { + "epoch": 0.004832640126757774, + "grad_norm": 5.447524685496177, + "learning_rate": 3.2189973614775725e-06, + "loss": 1.0637, + "step": 61 + }, + { + "epoch": 0.004911863735393147, + "grad_norm": 5.218168830884699, + "learning_rate": 3.2717678100263856e-06, + "loss": 0.9396, + "step": 62 + }, + { + "epoch": 0.004991087344028521, + "grad_norm": 4.880088882954173, + "learning_rate": 3.3245382585751982e-06, + "loss": 1.0132, + "step": 63 + }, + { + "epoch": 0.005070310952663894, + "grad_norm": 5.0825664489953315, + "learning_rate": 3.377308707124011e-06, + "loss": 1.0607, + "step": 64 + }, + { + "epoch": 0.005149534561299267, + "grad_norm": 4.47382446788315, + "learning_rate": 3.4300791556728235e-06, + "loss": 1.0139, + "step": 65 + }, + { + "epoch": 0.00522875816993464, + "grad_norm": 5.145244155377481, + "learning_rate": 3.4828496042216366e-06, + "loss": 0.8655, + "step": 66 + }, + { + "epoch": 0.005307981778570014, + "grad_norm": 5.083565443816077, + "learning_rate": 3.535620052770449e-06, + "loss": 1.017, + "step": 67 + }, + { + "epoch": 0.0053872053872053875, + "grad_norm": 4.802988575118353, + "learning_rate": 3.5883905013192615e-06, + "loss": 0.9873, + "step": 68 + }, + { + "epoch": 0.00546642899584076, + "grad_norm": 4.240379484307994, + "learning_rate": 3.6411609498680746e-06, + "loss": 0.9657, + "step": 69 + }, + { + "epoch": 0.005545652604476134, + "grad_norm": 4.5666670004609236, + "learning_rate": 3.693931398416887e-06, + "loss": 0.9206, + "step": 70 + }, + { + "epoch": 0.005624876213111508, + "grad_norm": 4.195885760168908, + "learning_rate": 3.7467018469656995e-06, + "loss": 1.0085, + "step": 71 + }, + { + "epoch": 0.00570409982174688, + "grad_norm": 5.520513458463618, + "learning_rate": 3.7994722955145117e-06, + "loss": 1.1191, + "step": 72 + }, + { + "epoch": 0.005783323430382254, + "grad_norm": 4.699359191613322, + "learning_rate": 3.852242744063324e-06, + "loss": 0.9894, + "step": 73 + }, + { + "epoch": 0.005862547039017627, + "grad_norm": 5.172166281666409, + "learning_rate": 3.9050131926121375e-06, + "loss": 0.9746, + "step": 74 + }, + { + "epoch": 0.0059417706476530005, + "grad_norm": 4.69763964850509, + "learning_rate": 3.95778364116095e-06, + "loss": 0.8907, + "step": 75 + }, + { + "epoch": 0.006020994256288374, + "grad_norm": 4.9958513834839415, + "learning_rate": 4.010554089709763e-06, + "loss": 1.0053, + "step": 76 + }, + { + "epoch": 0.006100217864923747, + "grad_norm": 4.774640599323078, + "learning_rate": 4.063324538258576e-06, + "loss": 0.9138, + "step": 77 + }, + { + "epoch": 0.006179441473559121, + "grad_norm": 4.358778391915449, + "learning_rate": 4.116094986807388e-06, + "loss": 0.9861, + "step": 78 + }, + { + "epoch": 0.006258665082194494, + "grad_norm": 4.605810160381926, + "learning_rate": 4.168865435356201e-06, + "loss": 0.9896, + "step": 79 + }, + { + "epoch": 0.006337888690829867, + "grad_norm": 4.809837933134756, + "learning_rate": 4.221635883905013e-06, + "loss": 0.967, + "step": 80 + }, + { + "epoch": 0.006417112299465241, + "grad_norm": 4.80021330417163, + "learning_rate": 4.274406332453826e-06, + "loss": 0.8566, + "step": 81 + }, + { + "epoch": 0.006496335908100614, + "grad_norm": 5.462415067634996, + "learning_rate": 4.327176781002639e-06, + "loss": 0.8521, + "step": 82 + }, + { + "epoch": 0.006575559516735987, + "grad_norm": 5.719895957749736, + "learning_rate": 4.379947229551452e-06, + "loss": 0.9843, + "step": 83 + }, + { + "epoch": 0.006654783125371361, + "grad_norm": 4.544961628485232, + "learning_rate": 4.432717678100264e-06, + "loss": 0.9193, + "step": 84 + }, + { + "epoch": 0.006734006734006734, + "grad_norm": 4.467983498834651, + "learning_rate": 4.485488126649077e-06, + "loss": 0.9251, + "step": 85 + }, + { + "epoch": 0.006813230342642107, + "grad_norm": 4.102025313306762, + "learning_rate": 4.538258575197889e-06, + "loss": 0.8745, + "step": 86 + }, + { + "epoch": 0.006892453951277481, + "grad_norm": 4.371674476843521, + "learning_rate": 4.5910290237467024e-06, + "loss": 0.9766, + "step": 87 + }, + { + "epoch": 0.006971677559912854, + "grad_norm": 4.725585073534907, + "learning_rate": 4.643799472295515e-06, + "loss": 0.9342, + "step": 88 + }, + { + "epoch": 0.0070509011685482275, + "grad_norm": 5.077637280352382, + "learning_rate": 4.696569920844328e-06, + "loss": 0.9327, + "step": 89 + }, + { + "epoch": 0.0071301247771836, + "grad_norm": 4.659863741989866, + "learning_rate": 4.74934036939314e-06, + "loss": 0.8369, + "step": 90 + }, + { + "epoch": 0.007209348385818974, + "grad_norm": 4.533011763036514, + "learning_rate": 4.802110817941953e-06, + "loss": 0.8696, + "step": 91 + }, + { + "epoch": 0.007288571994454348, + "grad_norm": 4.65339049188555, + "learning_rate": 4.854881266490766e-06, + "loss": 1.0413, + "step": 92 + }, + { + "epoch": 0.00736779560308972, + "grad_norm": 4.649252972991806, + "learning_rate": 4.907651715039578e-06, + "loss": 0.8295, + "step": 93 + }, + { + "epoch": 0.007447019211725094, + "grad_norm": 5.084528879374284, + "learning_rate": 4.960422163588391e-06, + "loss": 0.947, + "step": 94 + }, + { + "epoch": 0.007526242820360468, + "grad_norm": 5.365711702599791, + "learning_rate": 5.013192612137203e-06, + "loss": 0.9293, + "step": 95 + }, + { + "epoch": 0.0076054664289958405, + "grad_norm": 5.046350203333127, + "learning_rate": 5.065963060686017e-06, + "loss": 0.79, + "step": 96 + }, + { + "epoch": 0.007684690037631214, + "grad_norm": 5.3954948399293094, + "learning_rate": 5.118733509234829e-06, + "loss": 0.9778, + "step": 97 + }, + { + "epoch": 0.007763913646266587, + "grad_norm": 4.688927569396289, + "learning_rate": 5.171503957783641e-06, + "loss": 0.8867, + "step": 98 + }, + { + "epoch": 0.00784313725490196, + "grad_norm": 4.388795302213495, + "learning_rate": 5.224274406332454e-06, + "loss": 0.8771, + "step": 99 + }, + { + "epoch": 0.007922360863537333, + "grad_norm": 3.8591469175181463, + "learning_rate": 5.2770448548812665e-06, + "loss": 0.8644, + "step": 100 + }, + { + "epoch": 0.008001584472172708, + "grad_norm": 4.641112549258453, + "learning_rate": 5.32981530343008e-06, + "loss": 0.8712, + "step": 101 + }, + { + "epoch": 0.00808080808080808, + "grad_norm": 4.555516172471135, + "learning_rate": 5.382585751978893e-06, + "loss": 0.9133, + "step": 102 + }, + { + "epoch": 0.008160031689443454, + "grad_norm": 4.443942171385705, + "learning_rate": 5.435356200527705e-06, + "loss": 0.9634, + "step": 103 + }, + { + "epoch": 0.008239255298078828, + "grad_norm": 4.106083206477799, + "learning_rate": 5.488126649076517e-06, + "loss": 0.9857, + "step": 104 + }, + { + "epoch": 0.008318478906714201, + "grad_norm": 4.373637779113837, + "learning_rate": 5.540897097625331e-06, + "loss": 0.934, + "step": 105 + }, + { + "epoch": 0.008397702515349574, + "grad_norm": 4.025247144093253, + "learning_rate": 5.593667546174143e-06, + "loss": 0.8653, + "step": 106 + }, + { + "epoch": 0.008476926123984948, + "grad_norm": 4.11317321322005, + "learning_rate": 5.6464379947229556e-06, + "loss": 0.7644, + "step": 107 + }, + { + "epoch": 0.008556149732620321, + "grad_norm": 5.3484256844048375, + "learning_rate": 5.699208443271769e-06, + "loss": 0.9789, + "step": 108 + }, + { + "epoch": 0.008635373341255694, + "grad_norm": 4.572732992399881, + "learning_rate": 5.751978891820581e-06, + "loss": 0.8207, + "step": 109 + }, + { + "epoch": 0.008714596949891068, + "grad_norm": 5.321804779554728, + "learning_rate": 5.804749340369393e-06, + "loss": 0.8386, + "step": 110 + }, + { + "epoch": 0.008793820558526441, + "grad_norm": 4.244600615493288, + "learning_rate": 5.857519788918207e-06, + "loss": 0.7825, + "step": 111 + }, + { + "epoch": 0.008873044167161814, + "grad_norm": 4.020909635380337, + "learning_rate": 5.910290237467019e-06, + "loss": 0.8229, + "step": 112 + }, + { + "epoch": 0.008952267775797187, + "grad_norm": 5.803912944648255, + "learning_rate": 5.9630606860158315e-06, + "loss": 1.015, + "step": 113 + }, + { + "epoch": 0.009031491384432561, + "grad_norm": 4.261138899479962, + "learning_rate": 6.015831134564645e-06, + "loss": 0.7781, + "step": 114 + }, + { + "epoch": 0.009110714993067934, + "grad_norm": 4.61631624738624, + "learning_rate": 6.068601583113457e-06, + "loss": 0.8741, + "step": 115 + }, + { + "epoch": 0.009189938601703307, + "grad_norm": 4.8776197021487775, + "learning_rate": 6.121372031662269e-06, + "loss": 0.8966, + "step": 116 + }, + { + "epoch": 0.009269162210338681, + "grad_norm": 4.086195762521685, + "learning_rate": 6.174142480211083e-06, + "loss": 1.0242, + "step": 117 + }, + { + "epoch": 0.009348385818974054, + "grad_norm": 5.630709412798026, + "learning_rate": 6.226912928759895e-06, + "loss": 0.9456, + "step": 118 + }, + { + "epoch": 0.009427609427609427, + "grad_norm": 4.588901704970874, + "learning_rate": 6.2796833773087074e-06, + "loss": 0.8328, + "step": 119 + }, + { + "epoch": 0.009506833036244802, + "grad_norm": 5.393860711539961, + "learning_rate": 6.3324538258575205e-06, + "loss": 0.9444, + "step": 120 + }, + { + "epoch": 0.009586056644880174, + "grad_norm": 4.441173614780535, + "learning_rate": 6.385224274406333e-06, + "loss": 0.8644, + "step": 121 + }, + { + "epoch": 0.009665280253515547, + "grad_norm": 4.494882352693371, + "learning_rate": 6.437994722955145e-06, + "loss": 0.8996, + "step": 122 + }, + { + "epoch": 0.009744503862150922, + "grad_norm": 4.905180828585797, + "learning_rate": 6.490765171503959e-06, + "loss": 0.8668, + "step": 123 + }, + { + "epoch": 0.009823727470786294, + "grad_norm": 5.089005617514571, + "learning_rate": 6.543535620052771e-06, + "loss": 0.9542, + "step": 124 + }, + { + "epoch": 0.009902951079421667, + "grad_norm": 6.375768513761875, + "learning_rate": 6.596306068601583e-06, + "loss": 0.9587, + "step": 125 + }, + { + "epoch": 0.009982174688057042, + "grad_norm": 5.0498298372319335, + "learning_rate": 6.6490765171503965e-06, + "loss": 0.9445, + "step": 126 + }, + { + "epoch": 0.010061398296692415, + "grad_norm": 4.355769440314316, + "learning_rate": 6.701846965699209e-06, + "loss": 0.7922, + "step": 127 + }, + { + "epoch": 0.010140621905327787, + "grad_norm": 4.156816938903829, + "learning_rate": 6.754617414248022e-06, + "loss": 0.8037, + "step": 128 + }, + { + "epoch": 0.01021984551396316, + "grad_norm": 4.121763608581514, + "learning_rate": 6.807387862796835e-06, + "loss": 0.8335, + "step": 129 + }, + { + "epoch": 0.010299069122598535, + "grad_norm": 4.04306955201829, + "learning_rate": 6.860158311345647e-06, + "loss": 0.8892, + "step": 130 + }, + { + "epoch": 0.010378292731233908, + "grad_norm": 4.2830441388875045, + "learning_rate": 6.912928759894459e-06, + "loss": 0.8526, + "step": 131 + }, + { + "epoch": 0.01045751633986928, + "grad_norm": 4.322006369790227, + "learning_rate": 6.965699208443273e-06, + "loss": 0.9522, + "step": 132 + }, + { + "epoch": 0.010536739948504655, + "grad_norm": 4.014897802602603, + "learning_rate": 7.0184696569920855e-06, + "loss": 0.9428, + "step": 133 + }, + { + "epoch": 0.010615963557140028, + "grad_norm": 4.168727587985997, + "learning_rate": 7.071240105540898e-06, + "loss": 0.9373, + "step": 134 + }, + { + "epoch": 0.0106951871657754, + "grad_norm": 4.904887657583289, + "learning_rate": 7.124010554089711e-06, + "loss": 0.9598, + "step": 135 + }, + { + "epoch": 0.010774410774410775, + "grad_norm": 5.235074439299815, + "learning_rate": 7.176781002638523e-06, + "loss": 0.8605, + "step": 136 + }, + { + "epoch": 0.010853634383046148, + "grad_norm": 3.918901785131192, + "learning_rate": 7.229551451187335e-06, + "loss": 0.9594, + "step": 137 + }, + { + "epoch": 0.01093285799168152, + "grad_norm": 5.361671346898265, + "learning_rate": 7.282321899736149e-06, + "loss": 0.9146, + "step": 138 + }, + { + "epoch": 0.011012081600316895, + "grad_norm": 4.72013151628534, + "learning_rate": 7.3350923482849614e-06, + "loss": 0.888, + "step": 139 + }, + { + "epoch": 0.011091305208952268, + "grad_norm": 4.310702462486159, + "learning_rate": 7.387862796833774e-06, + "loss": 0.8192, + "step": 140 + }, + { + "epoch": 0.01117052881758764, + "grad_norm": 4.154478434577734, + "learning_rate": 7.440633245382587e-06, + "loss": 0.7708, + "step": 141 + }, + { + "epoch": 0.011249752426223015, + "grad_norm": 3.8639612804964796, + "learning_rate": 7.493403693931399e-06, + "loss": 0.7781, + "step": 142 + }, + { + "epoch": 0.011328976034858388, + "grad_norm": 4.124867136969742, + "learning_rate": 7.546174142480211e-06, + "loss": 0.785, + "step": 143 + }, + { + "epoch": 0.01140819964349376, + "grad_norm": 4.248137504752284, + "learning_rate": 7.5989445910290234e-06, + "loss": 0.9768, + "step": 144 + }, + { + "epoch": 0.011487423252129134, + "grad_norm": 4.226461559466392, + "learning_rate": 7.651715039577837e-06, + "loss": 0.8604, + "step": 145 + }, + { + "epoch": 0.011566646860764508, + "grad_norm": 4.4624465699300115, + "learning_rate": 7.704485488126649e-06, + "loss": 0.8059, + "step": 146 + }, + { + "epoch": 0.011645870469399881, + "grad_norm": 3.6667829398885305, + "learning_rate": 7.757255936675462e-06, + "loss": 0.8426, + "step": 147 + }, + { + "epoch": 0.011725094078035254, + "grad_norm": 5.001089157790554, + "learning_rate": 7.810026385224275e-06, + "loss": 0.8442, + "step": 148 + }, + { + "epoch": 0.011804317686670628, + "grad_norm": 4.330793339427337, + "learning_rate": 7.862796833773088e-06, + "loss": 0.9529, + "step": 149 + }, + { + "epoch": 0.011883541295306001, + "grad_norm": 3.67472160005844, + "learning_rate": 7.9155672823219e-06, + "loss": 0.8692, + "step": 150 + }, + { + "epoch": 0.011962764903941374, + "grad_norm": 3.845005744976269, + "learning_rate": 7.968337730870712e-06, + "loss": 0.7479, + "step": 151 + }, + { + "epoch": 0.012041988512576748, + "grad_norm": 4.3384613812152155, + "learning_rate": 8.021108179419526e-06, + "loss": 0.7601, + "step": 152 + }, + { + "epoch": 0.012121212121212121, + "grad_norm": 4.1343625213704405, + "learning_rate": 8.073878627968339e-06, + "loss": 0.8503, + "step": 153 + }, + { + "epoch": 0.012200435729847494, + "grad_norm": 4.2433093058821045, + "learning_rate": 8.126649076517152e-06, + "loss": 0.7726, + "step": 154 + }, + { + "epoch": 0.012279659338482869, + "grad_norm": 3.45255082771628, + "learning_rate": 8.179419525065963e-06, + "loss": 0.747, + "step": 155 + }, + { + "epoch": 0.012358882947118241, + "grad_norm": 4.341564180857874, + "learning_rate": 8.232189973614776e-06, + "loss": 0.869, + "step": 156 + }, + { + "epoch": 0.012438106555753614, + "grad_norm": 3.7577045631718127, + "learning_rate": 8.28496042216359e-06, + "loss": 0.6547, + "step": 157 + }, + { + "epoch": 0.012517330164388989, + "grad_norm": 5.513269080787856, + "learning_rate": 8.337730870712402e-06, + "loss": 0.8728, + "step": 158 + }, + { + "epoch": 0.012596553773024361, + "grad_norm": 4.37882606551242, + "learning_rate": 8.390501319261214e-06, + "loss": 0.8575, + "step": 159 + }, + { + "epoch": 0.012675777381659734, + "grad_norm": 4.388520745167087, + "learning_rate": 8.443271767810027e-06, + "loss": 1.066, + "step": 160 + }, + { + "epoch": 0.012755000990295109, + "grad_norm": 4.250390412888382, + "learning_rate": 8.49604221635884e-06, + "loss": 0.8017, + "step": 161 + }, + { + "epoch": 0.012834224598930482, + "grad_norm": 3.9454617049590857, + "learning_rate": 8.548812664907651e-06, + "loss": 0.8749, + "step": 162 + }, + { + "epoch": 0.012913448207565854, + "grad_norm": 4.430707449945308, + "learning_rate": 8.601583113456466e-06, + "loss": 0.6564, + "step": 163 + }, + { + "epoch": 0.012992671816201227, + "grad_norm": 3.9202256632045724, + "learning_rate": 8.654353562005277e-06, + "loss": 0.8393, + "step": 164 + }, + { + "epoch": 0.013071895424836602, + "grad_norm": 4.268136791602566, + "learning_rate": 8.70712401055409e-06, + "loss": 0.9371, + "step": 165 + }, + { + "epoch": 0.013151119033471975, + "grad_norm": 3.9162695096517184, + "learning_rate": 8.759894459102904e-06, + "loss": 0.7281, + "step": 166 + }, + { + "epoch": 0.013230342642107347, + "grad_norm": 4.553388186923074, + "learning_rate": 8.812664907651715e-06, + "loss": 0.8769, + "step": 167 + }, + { + "epoch": 0.013309566250742722, + "grad_norm": 3.839789189950159, + "learning_rate": 8.865435356200528e-06, + "loss": 0.7697, + "step": 168 + }, + { + "epoch": 0.013388789859378095, + "grad_norm": 3.8145935609705197, + "learning_rate": 8.918205804749341e-06, + "loss": 0.7364, + "step": 169 + }, + { + "epoch": 0.013468013468013467, + "grad_norm": 4.268096339566533, + "learning_rate": 8.970976253298154e-06, + "loss": 0.9221, + "step": 170 + }, + { + "epoch": 0.013547237076648842, + "grad_norm": 4.550504129757893, + "learning_rate": 9.023746701846966e-06, + "loss": 0.8489, + "step": 171 + }, + { + "epoch": 0.013626460685284215, + "grad_norm": 4.283525417991337, + "learning_rate": 9.076517150395779e-06, + "loss": 0.9542, + "step": 172 + }, + { + "epoch": 0.013705684293919588, + "grad_norm": 4.3678315522937385, + "learning_rate": 9.129287598944592e-06, + "loss": 0.8868, + "step": 173 + }, + { + "epoch": 0.013784907902554962, + "grad_norm": 3.9556925817923583, + "learning_rate": 9.182058047493405e-06, + "loss": 0.81, + "step": 174 + }, + { + "epoch": 0.013864131511190335, + "grad_norm": 3.704795715263707, + "learning_rate": 9.234828496042218e-06, + "loss": 0.8582, + "step": 175 + }, + { + "epoch": 0.013943355119825708, + "grad_norm": 4.170391413752929, + "learning_rate": 9.28759894459103e-06, + "loss": 0.8575, + "step": 176 + }, + { + "epoch": 0.014022578728461082, + "grad_norm": 3.7131669252291637, + "learning_rate": 9.340369393139842e-06, + "loss": 0.8142, + "step": 177 + }, + { + "epoch": 0.014101802337096455, + "grad_norm": 3.4994250961024402, + "learning_rate": 9.393139841688655e-06, + "loss": 0.8888, + "step": 178 + }, + { + "epoch": 0.014181025945731828, + "grad_norm": 4.2069622125317325, + "learning_rate": 9.445910290237469e-06, + "loss": 0.8666, + "step": 179 + }, + { + "epoch": 0.0142602495543672, + "grad_norm": 3.9202895838726217, + "learning_rate": 9.49868073878628e-06, + "loss": 0.6881, + "step": 180 + }, + { + "epoch": 0.014339473163002575, + "grad_norm": 3.795816921822793, + "learning_rate": 9.551451187335093e-06, + "loss": 0.9302, + "step": 181 + }, + { + "epoch": 0.014418696771637948, + "grad_norm": 3.857082471296832, + "learning_rate": 9.604221635883906e-06, + "loss": 0.7497, + "step": 182 + }, + { + "epoch": 0.01449792038027332, + "grad_norm": 3.425914907611555, + "learning_rate": 9.656992084432717e-06, + "loss": 0.74, + "step": 183 + }, + { + "epoch": 0.014577143988908695, + "grad_norm": 4.827477679594857, + "learning_rate": 9.709762532981532e-06, + "loss": 0.9002, + "step": 184 + }, + { + "epoch": 0.014656367597544068, + "grad_norm": 4.403657129664381, + "learning_rate": 9.762532981530344e-06, + "loss": 0.8277, + "step": 185 + }, + { + "epoch": 0.01473559120617944, + "grad_norm": 3.837542836735528, + "learning_rate": 9.815303430079157e-06, + "loss": 0.7833, + "step": 186 + }, + { + "epoch": 0.014814814814814815, + "grad_norm": 4.244820929131236, + "learning_rate": 9.86807387862797e-06, + "loss": 0.9134, + "step": 187 + }, + { + "epoch": 0.014894038423450188, + "grad_norm": 3.492855668523716, + "learning_rate": 9.920844327176781e-06, + "loss": 0.8036, + "step": 188 + }, + { + "epoch": 0.014973262032085561, + "grad_norm": 3.611719510210302, + "learning_rate": 9.973614775725594e-06, + "loss": 0.7717, + "step": 189 + }, + { + "epoch": 0.015052485640720936, + "grad_norm": 3.741597374680162, + "learning_rate": 1.0026385224274406e-05, + "loss": 0.9001, + "step": 190 + }, + { + "epoch": 0.015131709249356308, + "grad_norm": 4.527848826983816, + "learning_rate": 1.007915567282322e-05, + "loss": 0.827, + "step": 191 + }, + { + "epoch": 0.015210932857991681, + "grad_norm": 3.354860688994162, + "learning_rate": 1.0131926121372034e-05, + "loss": 0.6931, + "step": 192 + }, + { + "epoch": 0.015290156466627056, + "grad_norm": 4.219651178061965, + "learning_rate": 1.0184696569920845e-05, + "loss": 0.8936, + "step": 193 + }, + { + "epoch": 0.015369380075262428, + "grad_norm": 3.6376167818913494, + "learning_rate": 1.0237467018469658e-05, + "loss": 0.7386, + "step": 194 + }, + { + "epoch": 0.015448603683897801, + "grad_norm": 3.5033686642392685, + "learning_rate": 1.0290237467018471e-05, + "loss": 0.7582, + "step": 195 + }, + { + "epoch": 0.015527827292533174, + "grad_norm": 4.679814278475639, + "learning_rate": 1.0343007915567282e-05, + "loss": 0.884, + "step": 196 + }, + { + "epoch": 0.015607050901168549, + "grad_norm": 3.8639364280687487, + "learning_rate": 1.0395778364116096e-05, + "loss": 0.8326, + "step": 197 + }, + { + "epoch": 0.01568627450980392, + "grad_norm": 3.634592082543115, + "learning_rate": 1.0448548812664909e-05, + "loss": 0.7581, + "step": 198 + }, + { + "epoch": 0.015765498118439296, + "grad_norm": 4.118737206191648, + "learning_rate": 1.050131926121372e-05, + "loss": 0.7847, + "step": 199 + }, + { + "epoch": 0.015844721727074667, + "grad_norm": 3.584822138410341, + "learning_rate": 1.0554089709762533e-05, + "loss": 0.6461, + "step": 200 + }, + { + "epoch": 0.01592394533571004, + "grad_norm": 3.703795809615698, + "learning_rate": 1.0606860158311348e-05, + "loss": 0.7239, + "step": 201 + }, + { + "epoch": 0.016003168944345416, + "grad_norm": 3.5727922420656153, + "learning_rate": 1.065963060686016e-05, + "loss": 0.8866, + "step": 202 + }, + { + "epoch": 0.016082392552980787, + "grad_norm": 3.723572236030753, + "learning_rate": 1.0712401055408972e-05, + "loss": 0.895, + "step": 203 + }, + { + "epoch": 0.01616161616161616, + "grad_norm": 4.3724041554593445, + "learning_rate": 1.0765171503957785e-05, + "loss": 0.7314, + "step": 204 + }, + { + "epoch": 0.016240839770251536, + "grad_norm": 3.5208260272751803, + "learning_rate": 1.0817941952506597e-05, + "loss": 0.8545, + "step": 205 + }, + { + "epoch": 0.016320063378886907, + "grad_norm": 3.6578900613380094, + "learning_rate": 1.087071240105541e-05, + "loss": 0.6679, + "step": 206 + }, + { + "epoch": 0.01639928698752228, + "grad_norm": 4.5009170735929755, + "learning_rate": 1.0923482849604223e-05, + "loss": 0.7741, + "step": 207 + }, + { + "epoch": 0.016478510596157656, + "grad_norm": 3.4412113654885594, + "learning_rate": 1.0976253298153034e-05, + "loss": 0.7697, + "step": 208 + }, + { + "epoch": 0.016557734204793027, + "grad_norm": 4.514535209752686, + "learning_rate": 1.1029023746701847e-05, + "loss": 0.9767, + "step": 209 + }, + { + "epoch": 0.016636957813428402, + "grad_norm": 4.138695914329832, + "learning_rate": 1.1081794195250662e-05, + "loss": 0.819, + "step": 210 + }, + { + "epoch": 0.016716181422063776, + "grad_norm": 3.5464001325184724, + "learning_rate": 1.1134564643799472e-05, + "loss": 0.7432, + "step": 211 + }, + { + "epoch": 0.016795405030699147, + "grad_norm": 3.268689749462536, + "learning_rate": 1.1187335092348287e-05, + "loss": 0.7539, + "step": 212 + }, + { + "epoch": 0.016874628639334522, + "grad_norm": 3.6449475341487245, + "learning_rate": 1.12401055408971e-05, + "loss": 0.7753, + "step": 213 + }, + { + "epoch": 0.016953852247969897, + "grad_norm": 4.216205799069353, + "learning_rate": 1.1292875989445911e-05, + "loss": 0.8787, + "step": 214 + }, + { + "epoch": 0.017033075856605268, + "grad_norm": 4.299724070076182, + "learning_rate": 1.1345646437994724e-05, + "loss": 0.8365, + "step": 215 + }, + { + "epoch": 0.017112299465240642, + "grad_norm": 3.74584938994572, + "learning_rate": 1.1398416886543537e-05, + "loss": 0.8036, + "step": 216 + }, + { + "epoch": 0.017191523073876017, + "grad_norm": 4.077208167309988, + "learning_rate": 1.1451187335092349e-05, + "loss": 0.7797, + "step": 217 + }, + { + "epoch": 0.017270746682511388, + "grad_norm": 3.565018608216613, + "learning_rate": 1.1503957783641162e-05, + "loss": 0.671, + "step": 218 + }, + { + "epoch": 0.017349970291146762, + "grad_norm": 3.8567431211721, + "learning_rate": 1.1556728232189975e-05, + "loss": 0.7822, + "step": 219 + }, + { + "epoch": 0.017429193899782137, + "grad_norm": 3.745620195928399, + "learning_rate": 1.1609498680738786e-05, + "loss": 0.8644, + "step": 220 + }, + { + "epoch": 0.017508417508417508, + "grad_norm": 3.8005079906268007, + "learning_rate": 1.16622691292876e-05, + "loss": 0.8291, + "step": 221 + }, + { + "epoch": 0.017587641117052882, + "grad_norm": 3.7551160204839547, + "learning_rate": 1.1715039577836414e-05, + "loss": 0.8835, + "step": 222 + }, + { + "epoch": 0.017666864725688253, + "grad_norm": 3.7167090708993857, + "learning_rate": 1.1767810026385225e-05, + "loss": 0.8747, + "step": 223 + }, + { + "epoch": 0.017746088334323628, + "grad_norm": 3.2585918653928063, + "learning_rate": 1.1820580474934039e-05, + "loss": 0.8071, + "step": 224 + }, + { + "epoch": 0.017825311942959002, + "grad_norm": 3.7416082846867003, + "learning_rate": 1.1873350923482852e-05, + "loss": 0.6964, + "step": 225 + }, + { + "epoch": 0.017904535551594374, + "grad_norm": 3.7342682163534016, + "learning_rate": 1.1926121372031663e-05, + "loss": 0.723, + "step": 226 + }, + { + "epoch": 0.017983759160229748, + "grad_norm": 3.506114113377626, + "learning_rate": 1.1978891820580476e-05, + "loss": 0.7236, + "step": 227 + }, + { + "epoch": 0.018062982768865123, + "grad_norm": 4.026296434389851, + "learning_rate": 1.203166226912929e-05, + "loss": 0.7678, + "step": 228 + }, + { + "epoch": 0.018142206377500494, + "grad_norm": 4.059294464998192, + "learning_rate": 1.20844327176781e-05, + "loss": 0.9026, + "step": 229 + }, + { + "epoch": 0.018221429986135868, + "grad_norm": 3.8225269226973237, + "learning_rate": 1.2137203166226914e-05, + "loss": 0.8269, + "step": 230 + }, + { + "epoch": 0.018300653594771243, + "grad_norm": 4.168487101336105, + "learning_rate": 1.2189973614775727e-05, + "loss": 0.7832, + "step": 231 + }, + { + "epoch": 0.018379877203406614, + "grad_norm": 4.366470054616233, + "learning_rate": 1.2242744063324538e-05, + "loss": 0.7617, + "step": 232 + }, + { + "epoch": 0.01845910081204199, + "grad_norm": 4.157728193933643, + "learning_rate": 1.2295514511873353e-05, + "loss": 0.8242, + "step": 233 + }, + { + "epoch": 0.018538324420677363, + "grad_norm": 3.5409903145818107, + "learning_rate": 1.2348284960422166e-05, + "loss": 0.7008, + "step": 234 + }, + { + "epoch": 0.018617548029312734, + "grad_norm": 3.9399017253064925, + "learning_rate": 1.2401055408970977e-05, + "loss": 0.7594, + "step": 235 + }, + { + "epoch": 0.01869677163794811, + "grad_norm": 4.072821094514031, + "learning_rate": 1.245382585751979e-05, + "loss": 0.794, + "step": 236 + }, + { + "epoch": 0.018775995246583483, + "grad_norm": 3.587038963862682, + "learning_rate": 1.2506596306068604e-05, + "loss": 0.7905, + "step": 237 + }, + { + "epoch": 0.018855218855218854, + "grad_norm": 4.222390342873257, + "learning_rate": 1.2559366754617415e-05, + "loss": 0.7026, + "step": 238 + }, + { + "epoch": 0.01893444246385423, + "grad_norm": 3.6227628867044306, + "learning_rate": 1.2612137203166228e-05, + "loss": 0.7938, + "step": 239 + }, + { + "epoch": 0.019013666072489603, + "grad_norm": 3.9591522550380924, + "learning_rate": 1.2664907651715041e-05, + "loss": 0.7552, + "step": 240 + }, + { + "epoch": 0.019092889681124974, + "grad_norm": 3.465839747417068, + "learning_rate": 1.2717678100263852e-05, + "loss": 0.7554, + "step": 241 + }, + { + "epoch": 0.01917211328976035, + "grad_norm": 5.25631605275512, + "learning_rate": 1.2770448548812666e-05, + "loss": 0.8187, + "step": 242 + }, + { + "epoch": 0.019251336898395723, + "grad_norm": 4.231239499070505, + "learning_rate": 1.282321899736148e-05, + "loss": 0.6708, + "step": 243 + }, + { + "epoch": 0.019330560507031094, + "grad_norm": 4.322833233877656, + "learning_rate": 1.287598944591029e-05, + "loss": 0.691, + "step": 244 + }, + { + "epoch": 0.01940978411566647, + "grad_norm": 4.853147718334036, + "learning_rate": 1.2928759894459105e-05, + "loss": 0.78, + "step": 245 + }, + { + "epoch": 0.019489007724301843, + "grad_norm": 3.7149811759579805, + "learning_rate": 1.2981530343007918e-05, + "loss": 0.8873, + "step": 246 + }, + { + "epoch": 0.019568231332937214, + "grad_norm": 4.039438627767664, + "learning_rate": 1.303430079155673e-05, + "loss": 0.9019, + "step": 247 + }, + { + "epoch": 0.01964745494157259, + "grad_norm": 4.334229332140161, + "learning_rate": 1.3087071240105542e-05, + "loss": 0.8289, + "step": 248 + }, + { + "epoch": 0.019726678550207963, + "grad_norm": 3.8079823489206848, + "learning_rate": 1.3139841688654355e-05, + "loss": 0.7061, + "step": 249 + }, + { + "epoch": 0.019805902158843335, + "grad_norm": 3.1954119319695016, + "learning_rate": 1.3192612137203167e-05, + "loss": 0.8081, + "step": 250 + }, + { + "epoch": 0.01988512576747871, + "grad_norm": 3.563204029462328, + "learning_rate": 1.324538258575198e-05, + "loss": 0.7448, + "step": 251 + }, + { + "epoch": 0.019964349376114084, + "grad_norm": 4.341566034690226, + "learning_rate": 1.3298153034300793e-05, + "loss": 0.8055, + "step": 252 + }, + { + "epoch": 0.020043572984749455, + "grad_norm": 3.3198199379309274, + "learning_rate": 1.3350923482849604e-05, + "loss": 0.8121, + "step": 253 + }, + { + "epoch": 0.02012279659338483, + "grad_norm": 4.06557739337397, + "learning_rate": 1.3403693931398417e-05, + "loss": 0.8344, + "step": 254 + }, + { + "epoch": 0.020202020202020204, + "grad_norm": 3.499526476418784, + "learning_rate": 1.3456464379947232e-05, + "loss": 0.7689, + "step": 255 + }, + { + "epoch": 0.020281243810655575, + "grad_norm": 3.926451846400831, + "learning_rate": 1.3509234828496044e-05, + "loss": 0.8759, + "step": 256 + }, + { + "epoch": 0.02036046741929095, + "grad_norm": 3.8956301180414226, + "learning_rate": 1.3562005277044857e-05, + "loss": 0.7401, + "step": 257 + }, + { + "epoch": 0.02043969102792632, + "grad_norm": 3.9302225168681155, + "learning_rate": 1.361477572559367e-05, + "loss": 0.8247, + "step": 258 + }, + { + "epoch": 0.020518914636561695, + "grad_norm": 3.5472244564328763, + "learning_rate": 1.3667546174142481e-05, + "loss": 0.7644, + "step": 259 + }, + { + "epoch": 0.02059813824519707, + "grad_norm": 3.56902753063322, + "learning_rate": 1.3720316622691294e-05, + "loss": 0.7661, + "step": 260 + }, + { + "epoch": 0.02067736185383244, + "grad_norm": 4.206985293609494, + "learning_rate": 1.3773087071240107e-05, + "loss": 0.858, + "step": 261 + }, + { + "epoch": 0.020756585462467815, + "grad_norm": 3.789742057549914, + "learning_rate": 1.3825857519788919e-05, + "loss": 0.7101, + "step": 262 + }, + { + "epoch": 0.02083580907110319, + "grad_norm": 3.5289805722076224, + "learning_rate": 1.3878627968337732e-05, + "loss": 0.7384, + "step": 263 + }, + { + "epoch": 0.02091503267973856, + "grad_norm": 3.4809773057379734, + "learning_rate": 1.3931398416886547e-05, + "loss": 0.9131, + "step": 264 + }, + { + "epoch": 0.020994256288373935, + "grad_norm": 4.028840068111051, + "learning_rate": 1.3984168865435356e-05, + "loss": 0.8361, + "step": 265 + }, + { + "epoch": 0.02107347989700931, + "grad_norm": 3.4749301836903337, + "learning_rate": 1.4036939313984171e-05, + "loss": 0.7317, + "step": 266 + }, + { + "epoch": 0.02115270350564468, + "grad_norm": 3.467759847181791, + "learning_rate": 1.4089709762532984e-05, + "loss": 0.844, + "step": 267 + }, + { + "epoch": 0.021231927114280055, + "grad_norm": 3.8743740199567394, + "learning_rate": 1.4142480211081795e-05, + "loss": 0.8776, + "step": 268 + }, + { + "epoch": 0.02131115072291543, + "grad_norm": 4.110449384942725, + "learning_rate": 1.4195250659630609e-05, + "loss": 0.8366, + "step": 269 + }, + { + "epoch": 0.0213903743315508, + "grad_norm": 3.8833441075642963, + "learning_rate": 1.4248021108179422e-05, + "loss": 0.7772, + "step": 270 + }, + { + "epoch": 0.021469597940186175, + "grad_norm": 3.289699051116365, + "learning_rate": 1.4300791556728233e-05, + "loss": 0.7317, + "step": 271 + }, + { + "epoch": 0.02154882154882155, + "grad_norm": 3.34801754211088, + "learning_rate": 1.4353562005277046e-05, + "loss": 0.7615, + "step": 272 + }, + { + "epoch": 0.02162804515745692, + "grad_norm": 3.0799025441784265, + "learning_rate": 1.440633245382586e-05, + "loss": 0.8028, + "step": 273 + }, + { + "epoch": 0.021707268766092296, + "grad_norm": 3.58422767361047, + "learning_rate": 1.445910290237467e-05, + "loss": 0.8324, + "step": 274 + }, + { + "epoch": 0.02178649237472767, + "grad_norm": 4.023408860426394, + "learning_rate": 1.4511873350923484e-05, + "loss": 0.861, + "step": 275 + }, + { + "epoch": 0.02186571598336304, + "grad_norm": 4.097140721055274, + "learning_rate": 1.4564643799472298e-05, + "loss": 0.9003, + "step": 276 + }, + { + "epoch": 0.021944939591998416, + "grad_norm": 3.707139064155454, + "learning_rate": 1.461741424802111e-05, + "loss": 0.7211, + "step": 277 + }, + { + "epoch": 0.02202416320063379, + "grad_norm": 3.418846745319288, + "learning_rate": 1.4670184696569923e-05, + "loss": 0.7495, + "step": 278 + }, + { + "epoch": 0.02210338680926916, + "grad_norm": 4.817702318931237, + "learning_rate": 1.4722955145118736e-05, + "loss": 0.6855, + "step": 279 + }, + { + "epoch": 0.022182610417904536, + "grad_norm": 5.710433622705206, + "learning_rate": 1.4775725593667547e-05, + "loss": 0.8139, + "step": 280 + }, + { + "epoch": 0.02226183402653991, + "grad_norm": 3.664054971437985, + "learning_rate": 1.482849604221636e-05, + "loss": 0.8447, + "step": 281 + }, + { + "epoch": 0.02234105763517528, + "grad_norm": 3.4240551483562376, + "learning_rate": 1.4881266490765173e-05, + "loss": 0.7682, + "step": 282 + }, + { + "epoch": 0.022420281243810656, + "grad_norm": 3.9249114246971284, + "learning_rate": 1.4934036939313985e-05, + "loss": 0.7726, + "step": 283 + }, + { + "epoch": 0.02249950485244603, + "grad_norm": 3.1316834100159534, + "learning_rate": 1.4986807387862798e-05, + "loss": 0.7919, + "step": 284 + }, + { + "epoch": 0.0225787284610814, + "grad_norm": 3.7673726856479672, + "learning_rate": 1.503957783641161e-05, + "loss": 0.6971, + "step": 285 + }, + { + "epoch": 0.022657952069716776, + "grad_norm": 3.502806068320617, + "learning_rate": 1.5092348284960422e-05, + "loss": 0.8588, + "step": 286 + }, + { + "epoch": 0.02273717567835215, + "grad_norm": 3.674320971057659, + "learning_rate": 1.5145118733509237e-05, + "loss": 0.8545, + "step": 287 + }, + { + "epoch": 0.02281639928698752, + "grad_norm": 3.65343275322434, + "learning_rate": 1.5197889182058047e-05, + "loss": 0.822, + "step": 288 + }, + { + "epoch": 0.022895622895622896, + "grad_norm": 3.9145186417350097, + "learning_rate": 1.5250659630606862e-05, + "loss": 0.8197, + "step": 289 + }, + { + "epoch": 0.022974846504258267, + "grad_norm": 3.587462611763387, + "learning_rate": 1.5303430079155675e-05, + "loss": 0.806, + "step": 290 + }, + { + "epoch": 0.023054070112893642, + "grad_norm": 3.2429608145578976, + "learning_rate": 1.5356200527704484e-05, + "loss": 0.716, + "step": 291 + }, + { + "epoch": 0.023133293721529016, + "grad_norm": 3.7327217962083674, + "learning_rate": 1.5408970976253298e-05, + "loss": 0.8026, + "step": 292 + }, + { + "epoch": 0.023212517330164387, + "grad_norm": 3.7050830424089245, + "learning_rate": 1.5461741424802114e-05, + "loss": 0.7938, + "step": 293 + }, + { + "epoch": 0.023291740938799762, + "grad_norm": 3.4178884118590487, + "learning_rate": 1.5514511873350924e-05, + "loss": 0.7051, + "step": 294 + }, + { + "epoch": 0.023370964547435136, + "grad_norm": 4.149806296582689, + "learning_rate": 1.5567282321899737e-05, + "loss": 0.8055, + "step": 295 + }, + { + "epoch": 0.023450188156070507, + "grad_norm": 3.595358031209693, + "learning_rate": 1.562005277044855e-05, + "loss": 0.7103, + "step": 296 + }, + { + "epoch": 0.023529411764705882, + "grad_norm": 3.8238105633654027, + "learning_rate": 1.5672823218997363e-05, + "loss": 0.7502, + "step": 297 + }, + { + "epoch": 0.023608635373341257, + "grad_norm": 4.105921446221804, + "learning_rate": 1.5725593667546176e-05, + "loss": 0.8152, + "step": 298 + }, + { + "epoch": 0.023687858981976628, + "grad_norm": 3.3725631824917355, + "learning_rate": 1.577836411609499e-05, + "loss": 0.8643, + "step": 299 + }, + { + "epoch": 0.023767082590612002, + "grad_norm": 3.3390397221383106, + "learning_rate": 1.58311345646438e-05, + "loss": 0.845, + "step": 300 + }, + { + "epoch": 0.023846306199247377, + "grad_norm": 3.281381423728863, + "learning_rate": 1.5883905013192612e-05, + "loss": 0.696, + "step": 301 + }, + { + "epoch": 0.023925529807882748, + "grad_norm": 4.143666183958962, + "learning_rate": 1.5936675461741425e-05, + "loss": 0.6892, + "step": 302 + }, + { + "epoch": 0.024004753416518122, + "grad_norm": 3.6469645487470803, + "learning_rate": 1.5989445910290238e-05, + "loss": 0.7484, + "step": 303 + }, + { + "epoch": 0.024083977025153497, + "grad_norm": 3.462031792761476, + "learning_rate": 1.604221635883905e-05, + "loss": 0.6692, + "step": 304 + }, + { + "epoch": 0.024163200633788868, + "grad_norm": 4.382336173610418, + "learning_rate": 1.6094986807387864e-05, + "loss": 0.8901, + "step": 305 + }, + { + "epoch": 0.024242424242424242, + "grad_norm": 3.615706822076657, + "learning_rate": 1.6147757255936677e-05, + "loss": 0.746, + "step": 306 + }, + { + "epoch": 0.024321647851059617, + "grad_norm": 4.0891182726928506, + "learning_rate": 1.620052770448549e-05, + "loss": 0.7599, + "step": 307 + }, + { + "epoch": 0.024400871459694988, + "grad_norm": 3.2599454134387145, + "learning_rate": 1.6253298153034303e-05, + "loss": 0.7387, + "step": 308 + }, + { + "epoch": 0.024480095068330363, + "grad_norm": 3.077261991593952, + "learning_rate": 1.6306068601583113e-05, + "loss": 0.6451, + "step": 309 + }, + { + "epoch": 0.024559318676965737, + "grad_norm": 3.3946669801299554, + "learning_rate": 1.6358839050131926e-05, + "loss": 0.7617, + "step": 310 + }, + { + "epoch": 0.024638542285601108, + "grad_norm": 3.0909202813571754, + "learning_rate": 1.641160949868074e-05, + "loss": 0.7394, + "step": 311 + }, + { + "epoch": 0.024717765894236483, + "grad_norm": 3.2303813414067646, + "learning_rate": 1.6464379947229552e-05, + "loss": 0.6724, + "step": 312 + }, + { + "epoch": 0.024796989502871857, + "grad_norm": 4.148102731792718, + "learning_rate": 1.6517150395778365e-05, + "loss": 0.9289, + "step": 313 + }, + { + "epoch": 0.024876213111507228, + "grad_norm": 3.1480484242615727, + "learning_rate": 1.656992084432718e-05, + "loss": 0.8047, + "step": 314 + }, + { + "epoch": 0.024955436720142603, + "grad_norm": 3.0850680980658396, + "learning_rate": 1.6622691292875988e-05, + "loss": 0.646, + "step": 315 + }, + { + "epoch": 0.025034660328777977, + "grad_norm": 3.847289304454283, + "learning_rate": 1.6675461741424805e-05, + "loss": 0.8803, + "step": 316 + }, + { + "epoch": 0.02511388393741335, + "grad_norm": 3.4955441653574355, + "learning_rate": 1.6728232189973618e-05, + "loss": 0.6668, + "step": 317 + }, + { + "epoch": 0.025193107546048723, + "grad_norm": 3.5042065798133177, + "learning_rate": 1.6781002638522427e-05, + "loss": 0.6678, + "step": 318 + }, + { + "epoch": 0.025272331154684097, + "grad_norm": 3.1172788400995293, + "learning_rate": 1.683377308707124e-05, + "loss": 0.6891, + "step": 319 + }, + { + "epoch": 0.02535155476331947, + "grad_norm": 3.314291280894821, + "learning_rate": 1.6886543535620054e-05, + "loss": 0.6419, + "step": 320 + }, + { + "epoch": 0.025430778371954843, + "grad_norm": 4.01611550806511, + "learning_rate": 1.6939313984168867e-05, + "loss": 0.7323, + "step": 321 + }, + { + "epoch": 0.025510001980590218, + "grad_norm": 3.7084401810504657, + "learning_rate": 1.699208443271768e-05, + "loss": 0.762, + "step": 322 + }, + { + "epoch": 0.02558922558922559, + "grad_norm": 3.753149408505484, + "learning_rate": 1.7044854881266493e-05, + "loss": 0.8163, + "step": 323 + }, + { + "epoch": 0.025668449197860963, + "grad_norm": 4.135777493301567, + "learning_rate": 1.7097625329815303e-05, + "loss": 0.7717, + "step": 324 + }, + { + "epoch": 0.025747672806496334, + "grad_norm": 3.1570838097689085, + "learning_rate": 1.7150395778364116e-05, + "loss": 0.6832, + "step": 325 + }, + { + "epoch": 0.02582689641513171, + "grad_norm": 3.484950768473148, + "learning_rate": 1.7203166226912932e-05, + "loss": 0.7892, + "step": 326 + }, + { + "epoch": 0.025906120023767083, + "grad_norm": 4.095040453604106, + "learning_rate": 1.7255936675461742e-05, + "loss": 0.9254, + "step": 327 + }, + { + "epoch": 0.025985343632402454, + "grad_norm": 3.552826016710888, + "learning_rate": 1.7308707124010555e-05, + "loss": 0.7753, + "step": 328 + }, + { + "epoch": 0.02606456724103783, + "grad_norm": 3.1391953350397426, + "learning_rate": 1.7361477572559368e-05, + "loss": 0.7731, + "step": 329 + }, + { + "epoch": 0.026143790849673203, + "grad_norm": 3.589391543210561, + "learning_rate": 1.741424802110818e-05, + "loss": 0.8643, + "step": 330 + }, + { + "epoch": 0.026223014458308574, + "grad_norm": 3.345659315748946, + "learning_rate": 1.7467018469656994e-05, + "loss": 0.7583, + "step": 331 + }, + { + "epoch": 0.02630223806694395, + "grad_norm": 3.3314584139164296, + "learning_rate": 1.7519788918205807e-05, + "loss": 0.7089, + "step": 332 + }, + { + "epoch": 0.026381461675579324, + "grad_norm": 3.5738923858018365, + "learning_rate": 1.7572559366754617e-05, + "loss": 0.7971, + "step": 333 + }, + { + "epoch": 0.026460685284214695, + "grad_norm": 2.9993215529920443, + "learning_rate": 1.762532981530343e-05, + "loss": 0.6821, + "step": 334 + }, + { + "epoch": 0.02653990889285007, + "grad_norm": 3.2038798869042426, + "learning_rate": 1.7678100263852246e-05, + "loss": 0.6752, + "step": 335 + }, + { + "epoch": 0.026619132501485444, + "grad_norm": 3.926245847094454, + "learning_rate": 1.7730870712401056e-05, + "loss": 0.6815, + "step": 336 + }, + { + "epoch": 0.026698356110120815, + "grad_norm": 4.166435984990503, + "learning_rate": 1.778364116094987e-05, + "loss": 0.8396, + "step": 337 + }, + { + "epoch": 0.02677757971875619, + "grad_norm": 3.32979714666828, + "learning_rate": 1.7836411609498682e-05, + "loss": 0.7585, + "step": 338 + }, + { + "epoch": 0.026856803327391564, + "grad_norm": 3.231145519092424, + "learning_rate": 1.7889182058047495e-05, + "loss": 0.639, + "step": 339 + }, + { + "epoch": 0.026936026936026935, + "grad_norm": 3.9174238979156164, + "learning_rate": 1.794195250659631e-05, + "loss": 0.8961, + "step": 340 + }, + { + "epoch": 0.02701525054466231, + "grad_norm": 3.3408210835803076, + "learning_rate": 1.799472295514512e-05, + "loss": 0.8431, + "step": 341 + }, + { + "epoch": 0.027094474153297684, + "grad_norm": 3.5027731677869878, + "learning_rate": 1.804749340369393e-05, + "loss": 0.7896, + "step": 342 + }, + { + "epoch": 0.027173697761933055, + "grad_norm": 3.2871288694337113, + "learning_rate": 1.8100263852242744e-05, + "loss": 0.6117, + "step": 343 + }, + { + "epoch": 0.02725292137056843, + "grad_norm": 3.350721831688502, + "learning_rate": 1.8153034300791557e-05, + "loss": 0.7742, + "step": 344 + }, + { + "epoch": 0.027332144979203804, + "grad_norm": 3.4371672105624453, + "learning_rate": 1.820580474934037e-05, + "loss": 0.7458, + "step": 345 + }, + { + "epoch": 0.027411368587839175, + "grad_norm": 3.450554693901827, + "learning_rate": 1.8258575197889184e-05, + "loss": 0.6961, + "step": 346 + }, + { + "epoch": 0.02749059219647455, + "grad_norm": 3.6673363372608354, + "learning_rate": 1.8311345646437997e-05, + "loss": 0.6564, + "step": 347 + }, + { + "epoch": 0.027569815805109924, + "grad_norm": 3.2710856783239604, + "learning_rate": 1.836411609498681e-05, + "loss": 0.661, + "step": 348 + }, + { + "epoch": 0.027649039413745295, + "grad_norm": 4.205237087646808, + "learning_rate": 1.8416886543535623e-05, + "loss": 0.8297, + "step": 349 + }, + { + "epoch": 0.02772826302238067, + "grad_norm": 3.2138147795983985, + "learning_rate": 1.8469656992084436e-05, + "loss": 0.6865, + "step": 350 + }, + { + "epoch": 0.027807486631016044, + "grad_norm": 3.8356757431162443, + "learning_rate": 1.8522427440633246e-05, + "loss": 0.7885, + "step": 351 + }, + { + "epoch": 0.027886710239651415, + "grad_norm": 3.2551435313043333, + "learning_rate": 1.857519788918206e-05, + "loss": 0.7511, + "step": 352 + }, + { + "epoch": 0.02796593384828679, + "grad_norm": 3.0983731597894617, + "learning_rate": 1.8627968337730872e-05, + "loss": 0.6247, + "step": 353 + }, + { + "epoch": 0.028045157456922164, + "grad_norm": 3.1731553515873396, + "learning_rate": 1.8680738786279685e-05, + "loss": 0.6205, + "step": 354 + }, + { + "epoch": 0.028124381065557535, + "grad_norm": 3.287247997085444, + "learning_rate": 1.8733509234828498e-05, + "loss": 0.7181, + "step": 355 + }, + { + "epoch": 0.02820360467419291, + "grad_norm": 3.327138725584456, + "learning_rate": 1.878627968337731e-05, + "loss": 0.8072, + "step": 356 + }, + { + "epoch": 0.028282828282828285, + "grad_norm": 3.319797807388505, + "learning_rate": 1.883905013192612e-05, + "loss": 0.8466, + "step": 357 + }, + { + "epoch": 0.028362051891463656, + "grad_norm": 3.570402366889043, + "learning_rate": 1.8891820580474937e-05, + "loss": 0.8399, + "step": 358 + }, + { + "epoch": 0.02844127550009903, + "grad_norm": 3.137571438918123, + "learning_rate": 1.894459102902375e-05, + "loss": 0.7881, + "step": 359 + }, + { + "epoch": 0.0285204991087344, + "grad_norm": 3.4841293423776003, + "learning_rate": 1.899736147757256e-05, + "loss": 0.8527, + "step": 360 + }, + { + "epoch": 0.028599722717369776, + "grad_norm": 3.4820764436145932, + "learning_rate": 1.9050131926121373e-05, + "loss": 0.8243, + "step": 361 + }, + { + "epoch": 0.02867894632600515, + "grad_norm": 3.3164503046184723, + "learning_rate": 1.9102902374670186e-05, + "loss": 0.8558, + "step": 362 + }, + { + "epoch": 0.02875816993464052, + "grad_norm": 2.9865196186958567, + "learning_rate": 1.9155672823219e-05, + "loss": 0.6437, + "step": 363 + }, + { + "epoch": 0.028837393543275896, + "grad_norm": 3.2335490271808904, + "learning_rate": 1.9208443271767812e-05, + "loss": 0.7301, + "step": 364 + }, + { + "epoch": 0.02891661715191127, + "grad_norm": 3.3502128059371152, + "learning_rate": 1.9261213720316625e-05, + "loss": 0.8369, + "step": 365 + }, + { + "epoch": 0.02899584076054664, + "grad_norm": 2.8743265293932776, + "learning_rate": 1.9313984168865435e-05, + "loss": 0.6829, + "step": 366 + }, + { + "epoch": 0.029075064369182016, + "grad_norm": 3.280219616689533, + "learning_rate": 1.9366754617414248e-05, + "loss": 0.6537, + "step": 367 + }, + { + "epoch": 0.02915428797781739, + "grad_norm": 2.732268199224151, + "learning_rate": 1.9419525065963065e-05, + "loss": 0.7224, + "step": 368 + }, + { + "epoch": 0.02923351158645276, + "grad_norm": 2.984363660002938, + "learning_rate": 1.9472295514511874e-05, + "loss": 0.7127, + "step": 369 + }, + { + "epoch": 0.029312735195088136, + "grad_norm": 3.371642454877917, + "learning_rate": 1.9525065963060687e-05, + "loss": 0.7686, + "step": 370 + }, + { + "epoch": 0.02939195880372351, + "grad_norm": 3.1768839335393078, + "learning_rate": 1.95778364116095e-05, + "loss": 0.7357, + "step": 371 + }, + { + "epoch": 0.02947118241235888, + "grad_norm": 2.7923715663376765, + "learning_rate": 1.9630606860158313e-05, + "loss": 0.6656, + "step": 372 + }, + { + "epoch": 0.029550406020994256, + "grad_norm": 4.191541846532922, + "learning_rate": 1.9683377308707127e-05, + "loss": 0.7534, + "step": 373 + }, + { + "epoch": 0.02962962962962963, + "grad_norm": 4.000464072905396, + "learning_rate": 1.973614775725594e-05, + "loss": 0.8135, + "step": 374 + }, + { + "epoch": 0.029708853238265002, + "grad_norm": 3.0150323185962975, + "learning_rate": 1.978891820580475e-05, + "loss": 0.6979, + "step": 375 + }, + { + "epoch": 0.029788076846900376, + "grad_norm": 3.154364618050119, + "learning_rate": 1.9841688654353562e-05, + "loss": 0.738, + "step": 376 + }, + { + "epoch": 0.02986730045553575, + "grad_norm": 2.8574921657055388, + "learning_rate": 1.9894459102902375e-05, + "loss": 0.6646, + "step": 377 + }, + { + "epoch": 0.029946524064171122, + "grad_norm": 4.068817276971112, + "learning_rate": 1.994722955145119e-05, + "loss": 0.7863, + "step": 378 + }, + { + "epoch": 0.030025747672806496, + "grad_norm": 3.801094142270408, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 379 + }, + { + "epoch": 0.03010497128144187, + "grad_norm": 2.9202579325609848, + "learning_rate": 1.999999967077406e-05, + "loss": 0.7807, + "step": 380 + }, + { + "epoch": 0.030184194890077242, + "grad_norm": 3.664901234494972, + "learning_rate": 1.9999998683096255e-05, + "loss": 0.6887, + "step": 381 + }, + { + "epoch": 0.030263418498712617, + "grad_norm": 3.7705786876071548, + "learning_rate": 1.999999703696666e-05, + "loss": 0.8314, + "step": 382 + }, + { + "epoch": 0.03034264210734799, + "grad_norm": 2.9239267082931084, + "learning_rate": 1.999999473238537e-05, + "loss": 0.6929, + "step": 383 + }, + { + "epoch": 0.030421865715983362, + "grad_norm": 3.1488338802910625, + "learning_rate": 1.9999991769352545e-05, + "loss": 0.78, + "step": 384 + }, + { + "epoch": 0.030501089324618737, + "grad_norm": 3.1924989010982188, + "learning_rate": 1.9999988147868384e-05, + "loss": 0.774, + "step": 385 + }, + { + "epoch": 0.03058031293325411, + "grad_norm": 2.9362412624508267, + "learning_rate": 1.9999983867933114e-05, + "loss": 0.6999, + "step": 386 + }, + { + "epoch": 0.030659536541889482, + "grad_norm": 3.05407156990383, + "learning_rate": 1.999997892954703e-05, + "loss": 0.7604, + "step": 387 + }, + { + "epoch": 0.030738760150524857, + "grad_norm": 3.0079815654319813, + "learning_rate": 1.9999973332710443e-05, + "loss": 0.7542, + "step": 388 + }, + { + "epoch": 0.03081798375916023, + "grad_norm": 2.9564707196198143, + "learning_rate": 1.9999967077423732e-05, + "loss": 0.6401, + "step": 389 + }, + { + "epoch": 0.030897207367795602, + "grad_norm": 3.3880826537854043, + "learning_rate": 1.9999960163687307e-05, + "loss": 0.6489, + "step": 390 + }, + { + "epoch": 0.030976430976430977, + "grad_norm": 3.00111825208427, + "learning_rate": 1.999995259150162e-05, + "loss": 0.65, + "step": 391 + }, + { + "epoch": 0.031055654585066348, + "grad_norm": 3.6544433389417113, + "learning_rate": 1.999994436086717e-05, + "loss": 0.7, + "step": 392 + }, + { + "epoch": 0.031134878193701723, + "grad_norm": 3.660774889293522, + "learning_rate": 1.9999935471784508e-05, + "loss": 0.789, + "step": 393 + }, + { + "epoch": 0.031214101802337097, + "grad_norm": 3.026834031119743, + "learning_rate": 1.9999925924254203e-05, + "loss": 0.6067, + "step": 394 + }, + { + "epoch": 0.03129332541097247, + "grad_norm": 3.317335090945264, + "learning_rate": 1.9999915718276898e-05, + "loss": 0.7867, + "step": 395 + }, + { + "epoch": 0.03137254901960784, + "grad_norm": 3.5844819431308514, + "learning_rate": 1.9999904853853256e-05, + "loss": 0.6939, + "step": 396 + }, + { + "epoch": 0.03145177262824322, + "grad_norm": 2.762998963626351, + "learning_rate": 1.9999893330983998e-05, + "loss": 0.5991, + "step": 397 + }, + { + "epoch": 0.03153099623687859, + "grad_norm": 3.3876228445124448, + "learning_rate": 1.999988114966988e-05, + "loss": 0.7375, + "step": 398 + }, + { + "epoch": 0.031610219845513966, + "grad_norm": 4.3883250638148645, + "learning_rate": 1.9999868309911704e-05, + "loss": 0.7754, + "step": 399 + }, + { + "epoch": 0.031689443454149334, + "grad_norm": 3.392568834553705, + "learning_rate": 1.9999854811710317e-05, + "loss": 0.7602, + "step": 400 + }, + { + "epoch": 0.03176866706278471, + "grad_norm": 3.02439206130161, + "learning_rate": 1.9999840655066608e-05, + "loss": 0.8112, + "step": 401 + }, + { + "epoch": 0.03184789067142008, + "grad_norm": 3.1035894348375943, + "learning_rate": 1.9999825839981506e-05, + "loss": 0.7849, + "step": 402 + }, + { + "epoch": 0.03192711428005546, + "grad_norm": 3.3498760431884147, + "learning_rate": 1.9999810366455986e-05, + "loss": 0.7188, + "step": 403 + }, + { + "epoch": 0.03200633788869083, + "grad_norm": 3.0627182954455456, + "learning_rate": 1.9999794234491075e-05, + "loss": 0.7826, + "step": 404 + }, + { + "epoch": 0.03208556149732621, + "grad_norm": 3.2296720837773853, + "learning_rate": 1.9999777444087826e-05, + "loss": 0.7912, + "step": 405 + }, + { + "epoch": 0.032164785105961574, + "grad_norm": 2.922693644758063, + "learning_rate": 1.999975999524735e-05, + "loss": 0.7255, + "step": 406 + }, + { + "epoch": 0.03224400871459695, + "grad_norm": 2.8195517460950814, + "learning_rate": 1.9999741887970795e-05, + "loss": 0.6582, + "step": 407 + }, + { + "epoch": 0.03232323232323232, + "grad_norm": 2.9473629113911604, + "learning_rate": 1.999972312225935e-05, + "loss": 0.7149, + "step": 408 + }, + { + "epoch": 0.0324024559318677, + "grad_norm": 3.3702272349427056, + "learning_rate": 1.999970369811425e-05, + "loss": 0.8295, + "step": 409 + }, + { + "epoch": 0.03248167954050307, + "grad_norm": 3.475859899358589, + "learning_rate": 1.9999683615536784e-05, + "loss": 0.6059, + "step": 410 + }, + { + "epoch": 0.03256090314913844, + "grad_norm": 2.652656758009029, + "learning_rate": 1.9999662874528264e-05, + "loss": 0.6792, + "step": 411 + }, + { + "epoch": 0.032640126757773814, + "grad_norm": 3.1009047844442295, + "learning_rate": 1.999964147509006e-05, + "loss": 0.678, + "step": 412 + }, + { + "epoch": 0.03271935036640919, + "grad_norm": 3.1179207540858416, + "learning_rate": 1.999961941722358e-05, + "loss": 0.7409, + "step": 413 + }, + { + "epoch": 0.03279857397504456, + "grad_norm": 2.8619668085480496, + "learning_rate": 1.9999596700930274e-05, + "loss": 0.6693, + "step": 414 + }, + { + "epoch": 0.03287779758367994, + "grad_norm": 2.962266476851864, + "learning_rate": 1.999957332621164e-05, + "loss": 0.7561, + "step": 415 + }, + { + "epoch": 0.03295702119231531, + "grad_norm": 3.224454599986635, + "learning_rate": 1.999954929306922e-05, + "loss": 0.677, + "step": 416 + }, + { + "epoch": 0.03303624480095068, + "grad_norm": 3.3015275650816043, + "learning_rate": 1.999952460150459e-05, + "loss": 0.8423, + "step": 417 + }, + { + "epoch": 0.033115468409586055, + "grad_norm": 2.855492149164199, + "learning_rate": 1.9999499251519388e-05, + "loss": 0.7511, + "step": 418 + }, + { + "epoch": 0.03319469201822143, + "grad_norm": 3.8401180843939384, + "learning_rate": 1.9999473243115268e-05, + "loss": 0.8389, + "step": 419 + }, + { + "epoch": 0.033273915626856804, + "grad_norm": 2.8469379162486432, + "learning_rate": 1.999944657629395e-05, + "loss": 0.7397, + "step": 420 + }, + { + "epoch": 0.03335313923549218, + "grad_norm": 3.2696244839813335, + "learning_rate": 1.999941925105719e-05, + "loss": 0.7419, + "step": 421 + }, + { + "epoch": 0.03343236284412755, + "grad_norm": 3.1230839546840063, + "learning_rate": 1.9999391267406786e-05, + "loss": 0.8035, + "step": 422 + }, + { + "epoch": 0.03351158645276292, + "grad_norm": 2.9283918023315363, + "learning_rate": 1.9999362625344584e-05, + "loss": 0.6943, + "step": 423 + }, + { + "epoch": 0.033590810061398295, + "grad_norm": 3.1936284231474725, + "learning_rate": 1.9999333324872464e-05, + "loss": 0.9064, + "step": 424 + }, + { + "epoch": 0.03367003367003367, + "grad_norm": 2.8875438705586474, + "learning_rate": 1.9999303365992357e-05, + "loss": 0.7968, + "step": 425 + }, + { + "epoch": 0.033749257278669044, + "grad_norm": 2.400006216231242, + "learning_rate": 1.999927274870624e-05, + "loss": 0.5922, + "step": 426 + }, + { + "epoch": 0.03382848088730442, + "grad_norm": 2.999635943694363, + "learning_rate": 1.9999241473016126e-05, + "loss": 0.7664, + "step": 427 + }, + { + "epoch": 0.03390770449593979, + "grad_norm": 2.958039689462677, + "learning_rate": 1.999920953892407e-05, + "loss": 0.7659, + "step": 428 + }, + { + "epoch": 0.03398692810457516, + "grad_norm": 3.0957704885821955, + "learning_rate": 1.9999176946432183e-05, + "loss": 0.6943, + "step": 429 + }, + { + "epoch": 0.034066151713210535, + "grad_norm": 2.924849074337557, + "learning_rate": 1.9999143695542606e-05, + "loss": 0.744, + "step": 430 + }, + { + "epoch": 0.03414537532184591, + "grad_norm": 3.466886521758863, + "learning_rate": 1.9999109786257528e-05, + "loss": 0.8334, + "step": 431 + }, + { + "epoch": 0.034224598930481284, + "grad_norm": 2.899606871934526, + "learning_rate": 1.9999075218579184e-05, + "loss": 0.722, + "step": 432 + }, + { + "epoch": 0.03430382253911666, + "grad_norm": 2.438949140716664, + "learning_rate": 1.999903999250985e-05, + "loss": 0.6094, + "step": 433 + }, + { + "epoch": 0.03438304614775203, + "grad_norm": 3.1741480519318954, + "learning_rate": 1.9999004108051846e-05, + "loss": 0.721, + "step": 434 + }, + { + "epoch": 0.0344622697563874, + "grad_norm": 2.7943785874454408, + "learning_rate": 1.999896756520753e-05, + "loss": 0.6219, + "step": 435 + }, + { + "epoch": 0.034541493365022775, + "grad_norm": 2.8887590844679036, + "learning_rate": 1.9998930363979315e-05, + "loss": 0.6477, + "step": 436 + }, + { + "epoch": 0.03462071697365815, + "grad_norm": 3.4787657190867733, + "learning_rate": 1.999889250436965e-05, + "loss": 0.726, + "step": 437 + }, + { + "epoch": 0.034699940582293524, + "grad_norm": 3.57284819177238, + "learning_rate": 1.9998853986381018e-05, + "loss": 0.6634, + "step": 438 + }, + { + "epoch": 0.0347791641909289, + "grad_norm": 3.613442654515561, + "learning_rate": 1.9998814810015968e-05, + "loss": 0.6943, + "step": 439 + }, + { + "epoch": 0.034858387799564274, + "grad_norm": 3.218407392773835, + "learning_rate": 1.9998774975277074e-05, + "loss": 0.7278, + "step": 440 + }, + { + "epoch": 0.03493761140819964, + "grad_norm": 2.728775105672768, + "learning_rate": 1.9998734482166954e-05, + "loss": 0.5458, + "step": 441 + }, + { + "epoch": 0.035016835016835016, + "grad_norm": 3.005926546632378, + "learning_rate": 1.9998693330688283e-05, + "loss": 0.7494, + "step": 442 + }, + { + "epoch": 0.03509605862547039, + "grad_norm": 3.7011643987907146, + "learning_rate": 1.9998651520843766e-05, + "loss": 0.7735, + "step": 443 + }, + { + "epoch": 0.035175282234105765, + "grad_norm": 2.9851244415240963, + "learning_rate": 1.999860905263616e-05, + "loss": 0.6974, + "step": 444 + }, + { + "epoch": 0.03525450584274114, + "grad_norm": 3.428898050367164, + "learning_rate": 1.9998565926068253e-05, + "loss": 0.7623, + "step": 445 + }, + { + "epoch": 0.03533372945137651, + "grad_norm": 2.8356099510017088, + "learning_rate": 1.999852214114289e-05, + "loss": 0.6526, + "step": 446 + }, + { + "epoch": 0.03541295306001188, + "grad_norm": 2.594817507939691, + "learning_rate": 1.9998477697862956e-05, + "loss": 0.6711, + "step": 447 + }, + { + "epoch": 0.035492176668647256, + "grad_norm": 3.235172623698129, + "learning_rate": 1.9998432596231373e-05, + "loss": 0.7407, + "step": 448 + }, + { + "epoch": 0.03557140027728263, + "grad_norm": 2.7348183642356796, + "learning_rate": 1.9998386836251116e-05, + "loss": 0.6542, + "step": 449 + }, + { + "epoch": 0.035650623885918005, + "grad_norm": 3.272985917072195, + "learning_rate": 1.9998340417925193e-05, + "loss": 0.6998, + "step": 450 + }, + { + "epoch": 0.03572984749455338, + "grad_norm": 3.8264584256652254, + "learning_rate": 1.9998293341256664e-05, + "loss": 0.797, + "step": 451 + }, + { + "epoch": 0.03580907110318875, + "grad_norm": 3.21843345949715, + "learning_rate": 1.9998245606248627e-05, + "loss": 0.7336, + "step": 452 + }, + { + "epoch": 0.03588829471182412, + "grad_norm": 3.036525917040152, + "learning_rate": 1.999819721290422e-05, + "loss": 0.6946, + "step": 453 + }, + { + "epoch": 0.035967518320459496, + "grad_norm": 2.8182487471269044, + "learning_rate": 1.9998148161226645e-05, + "loss": 0.6617, + "step": 454 + }, + { + "epoch": 0.03604674192909487, + "grad_norm": 2.9361313245946223, + "learning_rate": 1.9998098451219115e-05, + "loss": 0.675, + "step": 455 + }, + { + "epoch": 0.036125965537730245, + "grad_norm": 3.7030365958009726, + "learning_rate": 1.999804808288491e-05, + "loss": 0.796, + "step": 456 + }, + { + "epoch": 0.03620518914636562, + "grad_norm": 3.5489158000641896, + "learning_rate": 1.9997997056227347e-05, + "loss": 0.8261, + "step": 457 + }, + { + "epoch": 0.03628441275500099, + "grad_norm": 2.7238264832859973, + "learning_rate": 1.9997945371249784e-05, + "loss": 0.6541, + "step": 458 + }, + { + "epoch": 0.03636363636363636, + "grad_norm": 3.3916413944293984, + "learning_rate": 1.999789302795563e-05, + "loss": 0.6755, + "step": 459 + }, + { + "epoch": 0.036442859972271736, + "grad_norm": 3.0983398976573184, + "learning_rate": 1.999784002634832e-05, + "loss": 0.7259, + "step": 460 + }, + { + "epoch": 0.03652208358090711, + "grad_norm": 2.8820099131603234, + "learning_rate": 1.9997786366431354e-05, + "loss": 0.7672, + "step": 461 + }, + { + "epoch": 0.036601307189542485, + "grad_norm": 2.640988448612532, + "learning_rate": 1.9997732048208264e-05, + "loss": 0.6634, + "step": 462 + }, + { + "epoch": 0.03668053079817786, + "grad_norm": 3.072518897736735, + "learning_rate": 1.9997677071682623e-05, + "loss": 0.6494, + "step": 463 + }, + { + "epoch": 0.03675975440681323, + "grad_norm": 2.9755891729458304, + "learning_rate": 1.9997621436858053e-05, + "loss": 0.7039, + "step": 464 + }, + { + "epoch": 0.0368389780154486, + "grad_norm": 2.6731496434863637, + "learning_rate": 1.9997565143738216e-05, + "loss": 0.6326, + "step": 465 + }, + { + "epoch": 0.03691820162408398, + "grad_norm": 3.013420128072819, + "learning_rate": 1.999750819232682e-05, + "loss": 0.7497, + "step": 466 + }, + { + "epoch": 0.03699742523271935, + "grad_norm": 2.8700835063291374, + "learning_rate": 1.9997450582627614e-05, + "loss": 0.6467, + "step": 467 + }, + { + "epoch": 0.037076648841354726, + "grad_norm": 2.441887982804938, + "learning_rate": 1.9997392314644392e-05, + "loss": 0.5403, + "step": 468 + }, + { + "epoch": 0.0371558724499901, + "grad_norm": 2.627724197173415, + "learning_rate": 1.999733338838099e-05, + "loss": 0.5625, + "step": 469 + }, + { + "epoch": 0.03723509605862547, + "grad_norm": 3.2004314480087097, + "learning_rate": 1.999727380384129e-05, + "loss": 0.7292, + "step": 470 + }, + { + "epoch": 0.03731431966726084, + "grad_norm": 2.6246971490359643, + "learning_rate": 1.999721356102921e-05, + "loss": 0.5749, + "step": 471 + }, + { + "epoch": 0.03739354327589622, + "grad_norm": 2.8178098789578265, + "learning_rate": 1.9997152659948727e-05, + "loss": 0.6766, + "step": 472 + }, + { + "epoch": 0.03747276688453159, + "grad_norm": 2.751576181721114, + "learning_rate": 1.9997091100603842e-05, + "loss": 0.751, + "step": 473 + }, + { + "epoch": 0.037551990493166966, + "grad_norm": 2.8643044061564487, + "learning_rate": 1.999702888299861e-05, + "loss": 0.7831, + "step": 474 + }, + { + "epoch": 0.03763121410180234, + "grad_norm": 3.117882607758987, + "learning_rate": 1.9996966007137125e-05, + "loss": 0.7705, + "step": 475 + }, + { + "epoch": 0.03771043771043771, + "grad_norm": 2.9402830765122383, + "learning_rate": 1.9996902473023537e-05, + "loss": 0.6483, + "step": 476 + }, + { + "epoch": 0.03778966131907308, + "grad_norm": 3.282518962096688, + "learning_rate": 1.999683828066202e-05, + "loss": 0.6724, + "step": 477 + }, + { + "epoch": 0.03786888492770846, + "grad_norm": 3.2056871207163846, + "learning_rate": 1.9996773430056806e-05, + "loss": 0.7173, + "step": 478 + }, + { + "epoch": 0.03794810853634383, + "grad_norm": 2.9749120725141576, + "learning_rate": 1.999670792121216e-05, + "loss": 0.6135, + "step": 479 + }, + { + "epoch": 0.038027332144979206, + "grad_norm": 2.9635473200006155, + "learning_rate": 1.99966417541324e-05, + "loss": 0.6571, + "step": 480 + }, + { + "epoch": 0.038106555753614574, + "grad_norm": 2.928957653416349, + "learning_rate": 1.9996574928821883e-05, + "loss": 0.5789, + "step": 481 + }, + { + "epoch": 0.03818577936224995, + "grad_norm": 2.98391492816687, + "learning_rate": 1.9996507445285003e-05, + "loss": 0.7262, + "step": 482 + }, + { + "epoch": 0.03826500297088532, + "grad_norm": 2.800091720516489, + "learning_rate": 1.999643930352621e-05, + "loss": 0.633, + "step": 483 + }, + { + "epoch": 0.0383442265795207, + "grad_norm": 3.2330381177522893, + "learning_rate": 1.999637050354999e-05, + "loss": 0.6621, + "step": 484 + }, + { + "epoch": 0.03842345018815607, + "grad_norm": 2.552098027501547, + "learning_rate": 1.9996301045360874e-05, + "loss": 0.6485, + "step": 485 + }, + { + "epoch": 0.038502673796791446, + "grad_norm": 2.807944468047814, + "learning_rate": 1.999623092896343e-05, + "loss": 0.6129, + "step": 486 + }, + { + "epoch": 0.038581897405426814, + "grad_norm": 4.063652502957622, + "learning_rate": 1.9996160154362275e-05, + "loss": 0.835, + "step": 487 + }, + { + "epoch": 0.03866112101406219, + "grad_norm": 2.661945008145644, + "learning_rate": 1.9996088721562076e-05, + "loss": 0.6821, + "step": 488 + }, + { + "epoch": 0.03874034462269756, + "grad_norm": 2.6965061288315693, + "learning_rate": 1.9996016630567535e-05, + "loss": 0.6292, + "step": 489 + }, + { + "epoch": 0.03881956823133294, + "grad_norm": 3.0953770748312848, + "learning_rate": 1.9995943881383393e-05, + "loss": 0.6311, + "step": 490 + }, + { + "epoch": 0.03889879183996831, + "grad_norm": 2.785452994251254, + "learning_rate": 1.9995870474014444e-05, + "loss": 0.6612, + "step": 491 + }, + { + "epoch": 0.03897801544860369, + "grad_norm": 2.9224780599158673, + "learning_rate": 1.9995796408465523e-05, + "loss": 0.6748, + "step": 492 + }, + { + "epoch": 0.039057239057239054, + "grad_norm": 4.149683108294913, + "learning_rate": 1.9995721684741505e-05, + "loss": 0.8614, + "step": 493 + }, + { + "epoch": 0.03913646266587443, + "grad_norm": 3.1582695477828517, + "learning_rate": 1.9995646302847307e-05, + "loss": 0.7074, + "step": 494 + }, + { + "epoch": 0.0392156862745098, + "grad_norm": 2.7600043242543313, + "learning_rate": 1.9995570262787903e-05, + "loss": 0.6541, + "step": 495 + }, + { + "epoch": 0.03929490988314518, + "grad_norm": 3.1489370384526953, + "learning_rate": 1.9995493564568286e-05, + "loss": 0.7382, + "step": 496 + }, + { + "epoch": 0.03937413349178055, + "grad_norm": 2.8524636809738855, + "learning_rate": 1.9995416208193518e-05, + "loss": 0.7343, + "step": 497 + }, + { + "epoch": 0.03945335710041593, + "grad_norm": 2.958917817282755, + "learning_rate": 1.999533819366868e-05, + "loss": 0.6717, + "step": 498 + }, + { + "epoch": 0.039532580709051295, + "grad_norm": 2.9267099722503582, + "learning_rate": 1.9995259520998927e-05, + "loss": 0.7438, + "step": 499 + }, + { + "epoch": 0.03961180431768667, + "grad_norm": 2.695233361159774, + "learning_rate": 1.9995180190189424e-05, + "loss": 0.6826, + "step": 500 + }, + { + "epoch": 0.039691027926322044, + "grad_norm": 2.5503050358416206, + "learning_rate": 1.9995100201245397e-05, + "loss": 0.6195, + "step": 501 + }, + { + "epoch": 0.03977025153495742, + "grad_norm": 3.1679747148764723, + "learning_rate": 1.999501955417212e-05, + "loss": 0.5611, + "step": 502 + }, + { + "epoch": 0.03984947514359279, + "grad_norm": 2.673508667082099, + "learning_rate": 1.999493824897489e-05, + "loss": 0.6866, + "step": 503 + }, + { + "epoch": 0.03992869875222817, + "grad_norm": 2.712319456941955, + "learning_rate": 1.9994856285659073e-05, + "loss": 0.7074, + "step": 504 + }, + { + "epoch": 0.040007922360863535, + "grad_norm": 3.1805104047232295, + "learning_rate": 1.9994773664230064e-05, + "loss": 0.6495, + "step": 505 + }, + { + "epoch": 0.04008714596949891, + "grad_norm": 3.0759353713991215, + "learning_rate": 1.99946903846933e-05, + "loss": 0.7408, + "step": 506 + }, + { + "epoch": 0.040166369578134284, + "grad_norm": 3.7609544068127594, + "learning_rate": 1.9994606447054265e-05, + "loss": 0.7069, + "step": 507 + }, + { + "epoch": 0.04024559318676966, + "grad_norm": 3.1347559824941316, + "learning_rate": 1.999452185131849e-05, + "loss": 0.8218, + "step": 508 + }, + { + "epoch": 0.04032481679540503, + "grad_norm": 2.926264130145264, + "learning_rate": 1.9994436597491537e-05, + "loss": 0.7224, + "step": 509 + }, + { + "epoch": 0.04040404040404041, + "grad_norm": 3.2332216646277736, + "learning_rate": 1.9994350685579024e-05, + "loss": 0.7147, + "step": 510 + }, + { + "epoch": 0.040483264012675775, + "grad_norm": 2.999816638567332, + "learning_rate": 1.999426411558661e-05, + "loss": 0.7248, + "step": 511 + }, + { + "epoch": 0.04056248762131115, + "grad_norm": 2.7011088575440314, + "learning_rate": 1.9994176887519994e-05, + "loss": 0.6763, + "step": 512 + }, + { + "epoch": 0.040641711229946524, + "grad_norm": 2.875825212497789, + "learning_rate": 1.9994089001384918e-05, + "loss": 0.652, + "step": 513 + }, + { + "epoch": 0.0407209348385819, + "grad_norm": 2.7613176935656845, + "learning_rate": 1.9994000457187167e-05, + "loss": 0.6271, + "step": 514 + }, + { + "epoch": 0.04080015844721727, + "grad_norm": 3.0338670732042847, + "learning_rate": 1.999391125493258e-05, + "loss": 0.6945, + "step": 515 + }, + { + "epoch": 0.04087938205585264, + "grad_norm": 2.9271041032335794, + "learning_rate": 1.9993821394627018e-05, + "loss": 0.7362, + "step": 516 + }, + { + "epoch": 0.040958605664488015, + "grad_norm": 2.764063485558089, + "learning_rate": 1.9993730876276407e-05, + "loss": 0.6527, + "step": 517 + }, + { + "epoch": 0.04103782927312339, + "grad_norm": 3.2356110659940462, + "learning_rate": 1.9993639699886707e-05, + "loss": 0.8494, + "step": 518 + }, + { + "epoch": 0.041117052881758764, + "grad_norm": 2.711864109774637, + "learning_rate": 1.9993547865463916e-05, + "loss": 0.7577, + "step": 519 + }, + { + "epoch": 0.04119627649039414, + "grad_norm": 2.7076231482128525, + "learning_rate": 1.9993455373014087e-05, + "loss": 0.7267, + "step": 520 + }, + { + "epoch": 0.04127550009902951, + "grad_norm": 2.522840786648438, + "learning_rate": 1.99933622225433e-05, + "loss": 0.6705, + "step": 521 + }, + { + "epoch": 0.04135472370766488, + "grad_norm": 2.4498342280290197, + "learning_rate": 1.9993268414057704e-05, + "loss": 0.6374, + "step": 522 + }, + { + "epoch": 0.041433947316300256, + "grad_norm": 2.779138934794209, + "learning_rate": 1.9993173947563466e-05, + "loss": 0.6527, + "step": 523 + }, + { + "epoch": 0.04151317092493563, + "grad_norm": 3.209255526032306, + "learning_rate": 1.9993078823066804e-05, + "loss": 0.7538, + "step": 524 + }, + { + "epoch": 0.041592394533571005, + "grad_norm": 2.7573208701047967, + "learning_rate": 1.9992983040573986e-05, + "loss": 0.7038, + "step": 525 + }, + { + "epoch": 0.04167161814220638, + "grad_norm": 3.277805689495167, + "learning_rate": 1.9992886600091318e-05, + "loss": 0.6516, + "step": 526 + }, + { + "epoch": 0.041750841750841754, + "grad_norm": 3.1060003358891355, + "learning_rate": 1.9992789501625155e-05, + "loss": 0.7219, + "step": 527 + }, + { + "epoch": 0.04183006535947712, + "grad_norm": 3.558104529708639, + "learning_rate": 1.9992691745181882e-05, + "loss": 0.787, + "step": 528 + }, + { + "epoch": 0.041909288968112496, + "grad_norm": 3.511454003505485, + "learning_rate": 1.9992593330767938e-05, + "loss": 0.6802, + "step": 529 + }, + { + "epoch": 0.04198851257674787, + "grad_norm": 2.8584552202792386, + "learning_rate": 1.9992494258389805e-05, + "loss": 0.67, + "step": 530 + }, + { + "epoch": 0.042067736185383245, + "grad_norm": 2.844480677624732, + "learning_rate": 1.9992394528054006e-05, + "loss": 0.6093, + "step": 531 + }, + { + "epoch": 0.04214695979401862, + "grad_norm": 2.6071483043285983, + "learning_rate": 1.9992294139767106e-05, + "loss": 0.5991, + "step": 532 + }, + { + "epoch": 0.042226183402653994, + "grad_norm": 2.7460919041493788, + "learning_rate": 1.999219309353572e-05, + "loss": 0.6853, + "step": 533 + }, + { + "epoch": 0.04230540701128936, + "grad_norm": 2.5187884497002933, + "learning_rate": 1.9992091389366497e-05, + "loss": 0.6424, + "step": 534 + }, + { + "epoch": 0.042384630619924736, + "grad_norm": 2.897588748423445, + "learning_rate": 1.9991989027266134e-05, + "loss": 0.5928, + "step": 535 + }, + { + "epoch": 0.04246385422856011, + "grad_norm": 2.696778484381764, + "learning_rate": 1.9991886007241375e-05, + "loss": 0.697, + "step": 536 + }, + { + "epoch": 0.042543077837195485, + "grad_norm": 3.1141253510387403, + "learning_rate": 1.9991782329298998e-05, + "loss": 0.638, + "step": 537 + }, + { + "epoch": 0.04262230144583086, + "grad_norm": 2.7855091294178185, + "learning_rate": 1.9991677993445832e-05, + "loss": 0.7393, + "step": 538 + }, + { + "epoch": 0.042701525054466234, + "grad_norm": 3.211922209710532, + "learning_rate": 1.9991572999688746e-05, + "loss": 0.7502, + "step": 539 + }, + { + "epoch": 0.0427807486631016, + "grad_norm": 2.7847666934413233, + "learning_rate": 1.9991467348034653e-05, + "loss": 0.7073, + "step": 540 + }, + { + "epoch": 0.042859972271736976, + "grad_norm": 3.1466928505805254, + "learning_rate": 1.9991361038490515e-05, + "loss": 0.7464, + "step": 541 + }, + { + "epoch": 0.04293919588037235, + "grad_norm": 2.9109729397270394, + "learning_rate": 1.9991254071063327e-05, + "loss": 0.6223, + "step": 542 + }, + { + "epoch": 0.043018419489007725, + "grad_norm": 2.9595310235056376, + "learning_rate": 1.9991146445760133e-05, + "loss": 0.7574, + "step": 543 + }, + { + "epoch": 0.0430976430976431, + "grad_norm": 2.578088025546306, + "learning_rate": 1.9991038162588018e-05, + "loss": 0.729, + "step": 544 + }, + { + "epoch": 0.043176866706278474, + "grad_norm": 2.5866156442876784, + "learning_rate": 1.9990929221554117e-05, + "loss": 0.5977, + "step": 545 + }, + { + "epoch": 0.04325609031491384, + "grad_norm": 2.615899734064934, + "learning_rate": 1.99908196226656e-05, + "loss": 0.7644, + "step": 546 + }, + { + "epoch": 0.04333531392354922, + "grad_norm": 3.012293580248462, + "learning_rate": 1.9990709365929678e-05, + "loss": 0.5903, + "step": 547 + }, + { + "epoch": 0.04341453753218459, + "grad_norm": 2.6347402725839952, + "learning_rate": 1.999059845135362e-05, + "loss": 0.7533, + "step": 548 + }, + { + "epoch": 0.043493761140819966, + "grad_norm": 2.804277972290371, + "learning_rate": 1.9990486878944727e-05, + "loss": 0.7683, + "step": 549 + }, + { + "epoch": 0.04357298474945534, + "grad_norm": 3.298628451797767, + "learning_rate": 1.9990374648710343e-05, + "loss": 0.7281, + "step": 550 + }, + { + "epoch": 0.04365220835809071, + "grad_norm": 2.5461350179164435, + "learning_rate": 1.9990261760657858e-05, + "loss": 0.5977, + "step": 551 + }, + { + "epoch": 0.04373143196672608, + "grad_norm": 2.878080238406779, + "learning_rate": 1.9990148214794713e-05, + "loss": 0.6163, + "step": 552 + }, + { + "epoch": 0.04381065557536146, + "grad_norm": 2.8437619434504833, + "learning_rate": 1.999003401112837e-05, + "loss": 0.5998, + "step": 553 + }, + { + "epoch": 0.04388987918399683, + "grad_norm": 3.257285964911966, + "learning_rate": 1.9989919149666356e-05, + "loss": 0.7809, + "step": 554 + }, + { + "epoch": 0.043969102792632206, + "grad_norm": 2.6453604089495237, + "learning_rate": 1.998980363041624e-05, + "loss": 0.6726, + "step": 555 + }, + { + "epoch": 0.04404832640126758, + "grad_norm": 2.9857843193323936, + "learning_rate": 1.9989687453385617e-05, + "loss": 0.6915, + "step": 556 + }, + { + "epoch": 0.04412755000990295, + "grad_norm": 2.574734905677725, + "learning_rate": 1.9989570618582145e-05, + "loss": 0.5921, + "step": 557 + }, + { + "epoch": 0.04420677361853832, + "grad_norm": 2.9648203643839453, + "learning_rate": 1.9989453126013515e-05, + "loss": 0.6709, + "step": 558 + }, + { + "epoch": 0.0442859972271737, + "grad_norm": 2.9420517654172564, + "learning_rate": 1.9989334975687462e-05, + "loss": 0.6243, + "step": 559 + }, + { + "epoch": 0.04436522083580907, + "grad_norm": 2.7075965434973446, + "learning_rate": 1.9989216167611766e-05, + "loss": 0.6519, + "step": 560 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 3.4922142754295864, + "learning_rate": 1.998909670179425e-05, + "loss": 0.7702, + "step": 561 + }, + { + "epoch": 0.04452366805307982, + "grad_norm": 2.471925624577639, + "learning_rate": 1.9988976578242785e-05, + "loss": 0.5786, + "step": 562 + }, + { + "epoch": 0.04460289166171519, + "grad_norm": 2.6487943441010824, + "learning_rate": 1.9988855796965275e-05, + "loss": 0.5797, + "step": 563 + }, + { + "epoch": 0.04468211527035056, + "grad_norm": 2.947701025591218, + "learning_rate": 1.998873435796967e-05, + "loss": 0.6425, + "step": 564 + }, + { + "epoch": 0.04476133887898594, + "grad_norm": 2.9218173867935384, + "learning_rate": 1.9988612261263972e-05, + "loss": 0.6799, + "step": 565 + }, + { + "epoch": 0.04484056248762131, + "grad_norm": 3.177205720590406, + "learning_rate": 1.9988489506856218e-05, + "loss": 0.7271, + "step": 566 + }, + { + "epoch": 0.044919786096256686, + "grad_norm": 2.2078488201996302, + "learning_rate": 1.9988366094754493e-05, + "loss": 0.5412, + "step": 567 + }, + { + "epoch": 0.04499900970489206, + "grad_norm": 2.777484065929777, + "learning_rate": 1.9988242024966924e-05, + "loss": 0.7146, + "step": 568 + }, + { + "epoch": 0.04507823331352743, + "grad_norm": 2.6607829463343857, + "learning_rate": 1.9988117297501674e-05, + "loss": 0.5175, + "step": 569 + }, + { + "epoch": 0.0451574569221628, + "grad_norm": 3.1262530114276723, + "learning_rate": 1.998799191236696e-05, + "loss": 0.5948, + "step": 570 + }, + { + "epoch": 0.04523668053079818, + "grad_norm": 2.6460522397025845, + "learning_rate": 1.998786586957104e-05, + "loss": 0.58, + "step": 571 + }, + { + "epoch": 0.04531590413943355, + "grad_norm": 2.338051691139102, + "learning_rate": 1.998773916912221e-05, + "loss": 0.5258, + "step": 572 + }, + { + "epoch": 0.04539512774806893, + "grad_norm": 3.1732048495489216, + "learning_rate": 1.9987611811028814e-05, + "loss": 0.7305, + "step": 573 + }, + { + "epoch": 0.0454743513567043, + "grad_norm": 3.2720050513201784, + "learning_rate": 1.9987483795299236e-05, + "loss": 0.7008, + "step": 574 + }, + { + "epoch": 0.04555357496533967, + "grad_norm": 2.6673925623928763, + "learning_rate": 1.9987355121941907e-05, + "loss": 0.5623, + "step": 575 + }, + { + "epoch": 0.04563279857397504, + "grad_norm": 2.9195876692283096, + "learning_rate": 1.99872257909653e-05, + "loss": 0.6321, + "step": 576 + }, + { + "epoch": 0.04571202218261042, + "grad_norm": 2.7217386213900614, + "learning_rate": 1.9987095802377933e-05, + "loss": 0.6518, + "step": 577 + }, + { + "epoch": 0.04579124579124579, + "grad_norm": 3.2974273164470427, + "learning_rate": 1.9986965156188357e-05, + "loss": 0.7561, + "step": 578 + }, + { + "epoch": 0.04587046939988117, + "grad_norm": 2.9400032161999388, + "learning_rate": 1.9986833852405183e-05, + "loss": 0.6642, + "step": 579 + }, + { + "epoch": 0.045949693008516534, + "grad_norm": 2.7068679155876376, + "learning_rate": 1.9986701891037053e-05, + "loss": 0.6378, + "step": 580 + }, + { + "epoch": 0.04602891661715191, + "grad_norm": 3.9742121301651583, + "learning_rate": 1.9986569272092656e-05, + "loss": 0.7511, + "step": 581 + }, + { + "epoch": 0.046108140225787284, + "grad_norm": 3.242659719526242, + "learning_rate": 1.9986435995580725e-05, + "loss": 0.761, + "step": 582 + }, + { + "epoch": 0.04618736383442266, + "grad_norm": 2.895531543417738, + "learning_rate": 1.9986302061510036e-05, + "loss": 0.6983, + "step": 583 + }, + { + "epoch": 0.04626658744305803, + "grad_norm": 2.8702758756080415, + "learning_rate": 1.9986167469889405e-05, + "loss": 0.6306, + "step": 584 + }, + { + "epoch": 0.04634581105169341, + "grad_norm": 2.1347346059102748, + "learning_rate": 1.9986032220727698e-05, + "loss": 0.6373, + "step": 585 + }, + { + "epoch": 0.046425034660328775, + "grad_norm": 2.8633299012680866, + "learning_rate": 1.9985896314033816e-05, + "loss": 0.6229, + "step": 586 + }, + { + "epoch": 0.04650425826896415, + "grad_norm": 3.186312914456255, + "learning_rate": 1.9985759749816715e-05, + "loss": 0.7059, + "step": 587 + }, + { + "epoch": 0.046583481877599524, + "grad_norm": 3.238427936423136, + "learning_rate": 1.9985622528085382e-05, + "loss": 0.8296, + "step": 588 + }, + { + "epoch": 0.0466627054862349, + "grad_norm": 2.4922093733575577, + "learning_rate": 1.9985484648848854e-05, + "loss": 0.6537, + "step": 589 + }, + { + "epoch": 0.04674192909487027, + "grad_norm": 2.7315794081336238, + "learning_rate": 1.9985346112116207e-05, + "loss": 0.6153, + "step": 590 + }, + { + "epoch": 0.04682115270350565, + "grad_norm": 2.9502974462280163, + "learning_rate": 1.9985206917896563e-05, + "loss": 0.7039, + "step": 591 + }, + { + "epoch": 0.046900376312141015, + "grad_norm": 2.7589366963533455, + "learning_rate": 1.9985067066199093e-05, + "loss": 0.712, + "step": 592 + }, + { + "epoch": 0.04697959992077639, + "grad_norm": 2.6013313259748894, + "learning_rate": 1.9984926557033003e-05, + "loss": 0.5692, + "step": 593 + }, + { + "epoch": 0.047058823529411764, + "grad_norm": 2.539902134310688, + "learning_rate": 1.998478539040754e-05, + "loss": 0.6376, + "step": 594 + }, + { + "epoch": 0.04713804713804714, + "grad_norm": 2.8541696879373997, + "learning_rate": 1.9984643566332005e-05, + "loss": 0.6713, + "step": 595 + }, + { + "epoch": 0.04721727074668251, + "grad_norm": 2.4711420630229806, + "learning_rate": 1.9984501084815734e-05, + "loss": 0.648, + "step": 596 + }, + { + "epoch": 0.04729649435531789, + "grad_norm": 2.5613465734917105, + "learning_rate": 1.9984357945868106e-05, + "loss": 0.5956, + "step": 597 + }, + { + "epoch": 0.047375717963953255, + "grad_norm": 2.8631944466667556, + "learning_rate": 1.998421414949855e-05, + "loss": 0.6392, + "step": 598 + }, + { + "epoch": 0.04745494157258863, + "grad_norm": 3.0172239664770606, + "learning_rate": 1.9984069695716534e-05, + "loss": 0.7078, + "step": 599 + }, + { + "epoch": 0.047534165181224004, + "grad_norm": 3.0444262679152603, + "learning_rate": 1.998392458453157e-05, + "loss": 0.6846, + "step": 600 + }, + { + "epoch": 0.04761338878985938, + "grad_norm": 3.0072222650922678, + "learning_rate": 1.998377881595321e-05, + "loss": 0.79, + "step": 601 + }, + { + "epoch": 0.04769261239849475, + "grad_norm": 3.5902124668081044, + "learning_rate": 1.9983632389991056e-05, + "loss": 0.7086, + "step": 602 + }, + { + "epoch": 0.04777183600713013, + "grad_norm": 3.1212771496113625, + "learning_rate": 1.9983485306654745e-05, + "loss": 0.6768, + "step": 603 + }, + { + "epoch": 0.047851059615765495, + "grad_norm": 2.616003039410528, + "learning_rate": 1.9983337565953968e-05, + "loss": 0.6232, + "step": 604 + }, + { + "epoch": 0.04793028322440087, + "grad_norm": 2.591166475711505, + "learning_rate": 1.9983189167898446e-05, + "loss": 0.5417, + "step": 605 + }, + { + "epoch": 0.048009506833036245, + "grad_norm": 2.779868819625606, + "learning_rate": 1.998304011249795e-05, + "loss": 0.6163, + "step": 606 + }, + { + "epoch": 0.04808873044167162, + "grad_norm": 2.829172799196641, + "learning_rate": 1.9982890399762303e-05, + "loss": 0.6054, + "step": 607 + }, + { + "epoch": 0.048167954050306994, + "grad_norm": 2.809769704582584, + "learning_rate": 1.9982740029701356e-05, + "loss": 0.6892, + "step": 608 + }, + { + "epoch": 0.04824717765894237, + "grad_norm": 2.605794862233877, + "learning_rate": 1.998258900232501e-05, + "loss": 0.5969, + "step": 609 + }, + { + "epoch": 0.048326401267577736, + "grad_norm": 2.7851336729890046, + "learning_rate": 1.9982437317643218e-05, + "loss": 0.6597, + "step": 610 + }, + { + "epoch": 0.04840562487621311, + "grad_norm": 2.5018389523938454, + "learning_rate": 1.9982284975665952e-05, + "loss": 0.5533, + "step": 611 + }, + { + "epoch": 0.048484848484848485, + "grad_norm": 2.777958124265028, + "learning_rate": 1.998213197640326e-05, + "loss": 0.7352, + "step": 612 + }, + { + "epoch": 0.04856407209348386, + "grad_norm": 2.2012757624853174, + "learning_rate": 1.9981978319865204e-05, + "loss": 0.5475, + "step": 613 + }, + { + "epoch": 0.048643295702119234, + "grad_norm": 2.704370088430054, + "learning_rate": 1.9981824006061904e-05, + "loss": 0.6201, + "step": 614 + }, + { + "epoch": 0.0487225193107546, + "grad_norm": 2.5669896657704405, + "learning_rate": 1.998166903500353e-05, + "loss": 0.6223, + "step": 615 + }, + { + "epoch": 0.048801742919389976, + "grad_norm": 2.323261161206825, + "learning_rate": 1.998151340670027e-05, + "loss": 0.6524, + "step": 616 + }, + { + "epoch": 0.04888096652802535, + "grad_norm": 2.8889437618273837, + "learning_rate": 1.9981357121162385e-05, + "loss": 0.6029, + "step": 617 + }, + { + "epoch": 0.048960190136660725, + "grad_norm": 2.402792617113219, + "learning_rate": 1.998120017840016e-05, + "loss": 0.609, + "step": 618 + }, + { + "epoch": 0.0490394137452961, + "grad_norm": 2.6508978559231022, + "learning_rate": 1.998104257842393e-05, + "loss": 0.5763, + "step": 619 + }, + { + "epoch": 0.049118637353931474, + "grad_norm": 2.4978265070674053, + "learning_rate": 1.9980884321244072e-05, + "loss": 0.6082, + "step": 620 + }, + { + "epoch": 0.04919786096256684, + "grad_norm": 2.7797387302993912, + "learning_rate": 1.9980725406871007e-05, + "loss": 0.6711, + "step": 621 + }, + { + "epoch": 0.049277084571202216, + "grad_norm": 3.043533496043816, + "learning_rate": 1.9980565835315196e-05, + "loss": 0.7228, + "step": 622 + }, + { + "epoch": 0.04935630817983759, + "grad_norm": 2.536032972949813, + "learning_rate": 1.9980405606587148e-05, + "loss": 0.5813, + "step": 623 + }, + { + "epoch": 0.049435531788472965, + "grad_norm": 2.742709408107949, + "learning_rate": 1.9980244720697417e-05, + "loss": 0.6454, + "step": 624 + }, + { + "epoch": 0.04951475539710834, + "grad_norm": 2.7586212198110602, + "learning_rate": 1.9980083177656588e-05, + "loss": 0.6294, + "step": 625 + }, + { + "epoch": 0.049593979005743714, + "grad_norm": 2.670816947943694, + "learning_rate": 1.9979920977475306e-05, + "loss": 0.678, + "step": 626 + }, + { + "epoch": 0.04967320261437908, + "grad_norm": 2.6129776162312037, + "learning_rate": 1.9979758120164248e-05, + "loss": 0.5139, + "step": 627 + }, + { + "epoch": 0.049752426223014456, + "grad_norm": 3.464283442736191, + "learning_rate": 1.997959460573414e-05, + "loss": 0.752, + "step": 628 + }, + { + "epoch": 0.04983164983164983, + "grad_norm": 2.468377768061911, + "learning_rate": 1.9979430434195742e-05, + "loss": 0.6196, + "step": 629 + }, + { + "epoch": 0.049910873440285206, + "grad_norm": 2.5856202402355626, + "learning_rate": 1.9979265605559868e-05, + "loss": 0.5284, + "step": 630 + }, + { + "epoch": 0.04999009704892058, + "grad_norm": 2.8478163456612466, + "learning_rate": 1.997910011983737e-05, + "loss": 0.656, + "step": 631 + }, + { + "epoch": 0.050069320657555955, + "grad_norm": 2.66801621363886, + "learning_rate": 1.997893397703915e-05, + "loss": 0.7702, + "step": 632 + }, + { + "epoch": 0.05014854426619132, + "grad_norm": 2.6782434329778253, + "learning_rate": 1.997876717717614e-05, + "loss": 0.6588, + "step": 633 + }, + { + "epoch": 0.0502277678748267, + "grad_norm": 2.8357230739787807, + "learning_rate": 1.9978599720259325e-05, + "loss": 0.636, + "step": 634 + }, + { + "epoch": 0.05030699148346207, + "grad_norm": 2.67219616582363, + "learning_rate": 1.9978431606299736e-05, + "loss": 0.6349, + "step": 635 + }, + { + "epoch": 0.050386215092097446, + "grad_norm": 2.7799780644999563, + "learning_rate": 1.9978262835308437e-05, + "loss": 0.604, + "step": 636 + }, + { + "epoch": 0.05046543870073282, + "grad_norm": 2.6727525862396773, + "learning_rate": 1.997809340729654e-05, + "loss": 0.6337, + "step": 637 + }, + { + "epoch": 0.050544662309368195, + "grad_norm": 2.910480479175523, + "learning_rate": 1.9977923322275206e-05, + "loss": 0.7388, + "step": 638 + }, + { + "epoch": 0.05062388591800356, + "grad_norm": 2.600217686259559, + "learning_rate": 1.997775258025563e-05, + "loss": 0.696, + "step": 639 + }, + { + "epoch": 0.05070310952663894, + "grad_norm": 2.627565437899395, + "learning_rate": 1.997758118124906e-05, + "loss": 0.6522, + "step": 640 + }, + { + "epoch": 0.05078233313527431, + "grad_norm": 2.8081736199031337, + "learning_rate": 1.997740912526678e-05, + "loss": 0.5927, + "step": 641 + }, + { + "epoch": 0.050861556743909686, + "grad_norm": 2.552202695982183, + "learning_rate": 1.9977236412320112e-05, + "loss": 0.6015, + "step": 642 + }, + { + "epoch": 0.05094078035254506, + "grad_norm": 2.8910998372845014, + "learning_rate": 1.9977063042420438e-05, + "loss": 0.7683, + "step": 643 + }, + { + "epoch": 0.051020003961180435, + "grad_norm": 2.9512496299669198, + "learning_rate": 1.9976889015579167e-05, + "loss": 0.729, + "step": 644 + }, + { + "epoch": 0.0510992275698158, + "grad_norm": 2.5518080864931387, + "learning_rate": 1.997671433180776e-05, + "loss": 0.6124, + "step": 645 + }, + { + "epoch": 0.05117845117845118, + "grad_norm": 2.37320117472414, + "learning_rate": 1.997653899111772e-05, + "loss": 0.5442, + "step": 646 + }, + { + "epoch": 0.05125767478708655, + "grad_norm": 2.795261688628885, + "learning_rate": 1.9976362993520587e-05, + "loss": 0.6342, + "step": 647 + }, + { + "epoch": 0.051336898395721926, + "grad_norm": 3.4267679656696703, + "learning_rate": 1.9976186339027958e-05, + "loss": 0.7155, + "step": 648 + }, + { + "epoch": 0.0514161220043573, + "grad_norm": 3.1752769459428363, + "learning_rate": 1.9976009027651463e-05, + "loss": 0.7265, + "step": 649 + }, + { + "epoch": 0.05149534561299267, + "grad_norm": 2.8347118851828905, + "learning_rate": 1.9975831059402774e-05, + "loss": 0.7014, + "step": 650 + }, + { + "epoch": 0.05157456922162804, + "grad_norm": 2.6490634250358607, + "learning_rate": 1.9975652434293607e-05, + "loss": 0.6357, + "step": 651 + }, + { + "epoch": 0.05165379283026342, + "grad_norm": 2.3694212990980352, + "learning_rate": 1.9975473152335726e-05, + "loss": 0.5775, + "step": 652 + }, + { + "epoch": 0.05173301643889879, + "grad_norm": 2.3910356841694957, + "learning_rate": 1.9975293213540942e-05, + "loss": 0.5648, + "step": 653 + }, + { + "epoch": 0.05181224004753417, + "grad_norm": 2.764664832140692, + "learning_rate": 1.9975112617921097e-05, + "loss": 0.6438, + "step": 654 + }, + { + "epoch": 0.05189146365616954, + "grad_norm": 2.4686374239659363, + "learning_rate": 1.997493136548808e-05, + "loss": 0.6736, + "step": 655 + }, + { + "epoch": 0.05197068726480491, + "grad_norm": 2.7663992114573945, + "learning_rate": 1.9974749456253834e-05, + "loss": 0.6061, + "step": 656 + }, + { + "epoch": 0.05204991087344028, + "grad_norm": 2.811873503676391, + "learning_rate": 1.9974566890230327e-05, + "loss": 0.6387, + "step": 657 + }, + { + "epoch": 0.05212913448207566, + "grad_norm": 2.521915684645968, + "learning_rate": 1.9974383667429585e-05, + "loss": 0.544, + "step": 658 + }, + { + "epoch": 0.05220835809071103, + "grad_norm": 3.0204963682286685, + "learning_rate": 1.9974199787863674e-05, + "loss": 0.7979, + "step": 659 + }, + { + "epoch": 0.05228758169934641, + "grad_norm": 2.4868389956062953, + "learning_rate": 1.99740152515447e-05, + "loss": 0.5599, + "step": 660 + }, + { + "epoch": 0.05236680530798178, + "grad_norm": 2.3646509533453766, + "learning_rate": 1.9973830058484813e-05, + "loss": 0.5704, + "step": 661 + }, + { + "epoch": 0.05244602891661715, + "grad_norm": 2.679785776657189, + "learning_rate": 1.9973644208696208e-05, + "loss": 0.6531, + "step": 662 + }, + { + "epoch": 0.052525252525252523, + "grad_norm": 2.5504100604708824, + "learning_rate": 1.9973457702191123e-05, + "loss": 0.6136, + "step": 663 + }, + { + "epoch": 0.0526044761338879, + "grad_norm": 2.4527852886555537, + "learning_rate": 1.9973270538981835e-05, + "loss": 0.6394, + "step": 664 + }, + { + "epoch": 0.05268369974252327, + "grad_norm": 2.633989018059531, + "learning_rate": 1.9973082719080673e-05, + "loss": 0.635, + "step": 665 + }, + { + "epoch": 0.05276292335115865, + "grad_norm": 2.6275397938688188, + "learning_rate": 1.9972894242499997e-05, + "loss": 0.6583, + "step": 666 + }, + { + "epoch": 0.05284214695979402, + "grad_norm": 2.6031168190738745, + "learning_rate": 1.9972705109252227e-05, + "loss": 0.4877, + "step": 667 + }, + { + "epoch": 0.05292137056842939, + "grad_norm": 2.7618723427671794, + "learning_rate": 1.997251531934981e-05, + "loss": 0.6604, + "step": 668 + }, + { + "epoch": 0.053000594177064764, + "grad_norm": 2.4246126961990746, + "learning_rate": 1.997232487280524e-05, + "loss": 0.6461, + "step": 669 + }, + { + "epoch": 0.05307981778570014, + "grad_norm": 2.71269251734555, + "learning_rate": 1.9972133769631065e-05, + "loss": 0.5875, + "step": 670 + }, + { + "epoch": 0.05315904139433551, + "grad_norm": 2.842160240744585, + "learning_rate": 1.9971942009839862e-05, + "loss": 0.5902, + "step": 671 + }, + { + "epoch": 0.05323826500297089, + "grad_norm": 2.5058083969029608, + "learning_rate": 1.997174959344426e-05, + "loss": 0.5104, + "step": 672 + }, + { + "epoch": 0.05331748861160626, + "grad_norm": 3.095755714523418, + "learning_rate": 1.9971556520456928e-05, + "loss": 0.7196, + "step": 673 + }, + { + "epoch": 0.05339671222024163, + "grad_norm": 2.6440602877302766, + "learning_rate": 1.997136279089058e-05, + "loss": 0.5158, + "step": 674 + }, + { + "epoch": 0.053475935828877004, + "grad_norm": 2.380378443290473, + "learning_rate": 1.9971168404757972e-05, + "loss": 0.5321, + "step": 675 + }, + { + "epoch": 0.05355515943751238, + "grad_norm": 2.943670103029875, + "learning_rate": 1.99709733620719e-05, + "loss": 0.5452, + "step": 676 + }, + { + "epoch": 0.05363438304614775, + "grad_norm": 2.698056488057479, + "learning_rate": 1.9970777662845212e-05, + "loss": 0.5922, + "step": 677 + }, + { + "epoch": 0.05371360665478313, + "grad_norm": 2.485391823486526, + "learning_rate": 1.997058130709079e-05, + "loss": 0.6386, + "step": 678 + }, + { + "epoch": 0.0537928302634185, + "grad_norm": 2.371974473127211, + "learning_rate": 1.9970384294821565e-05, + "loss": 0.6121, + "step": 679 + }, + { + "epoch": 0.05387205387205387, + "grad_norm": 2.287721938999729, + "learning_rate": 1.9970186626050507e-05, + "loss": 0.5171, + "step": 680 + }, + { + "epoch": 0.053951277480689244, + "grad_norm": 2.6430969086515033, + "learning_rate": 1.9969988300790636e-05, + "loss": 0.6118, + "step": 681 + }, + { + "epoch": 0.05403050108932462, + "grad_norm": 2.741916159906486, + "learning_rate": 1.9969789319055007e-05, + "loss": 0.5779, + "step": 682 + }, + { + "epoch": 0.05410972469795999, + "grad_norm": 3.158648452548921, + "learning_rate": 1.996958968085672e-05, + "loss": 0.6893, + "step": 683 + }, + { + "epoch": 0.05418894830659537, + "grad_norm": 3.020262211509068, + "learning_rate": 1.9969389386208927e-05, + "loss": 0.5984, + "step": 684 + }, + { + "epoch": 0.054268171915230735, + "grad_norm": 2.5100076680365424, + "learning_rate": 1.9969188435124812e-05, + "loss": 0.6247, + "step": 685 + }, + { + "epoch": 0.05434739552386611, + "grad_norm": 2.4949733461128303, + "learning_rate": 1.9968986827617603e-05, + "loss": 0.6169, + "step": 686 + }, + { + "epoch": 0.054426619132501484, + "grad_norm": 2.429748799371095, + "learning_rate": 1.9968784563700586e-05, + "loss": 0.6555, + "step": 687 + }, + { + "epoch": 0.05450584274113686, + "grad_norm": 2.3898313595164744, + "learning_rate": 1.9968581643387065e-05, + "loss": 0.5174, + "step": 688 + }, + { + "epoch": 0.054585066349772234, + "grad_norm": 2.637504743240127, + "learning_rate": 1.9968378066690414e-05, + "loss": 0.6571, + "step": 689 + }, + { + "epoch": 0.05466428995840761, + "grad_norm": 2.6575214702891583, + "learning_rate": 1.996817383362403e-05, + "loss": 0.5789, + "step": 690 + }, + { + "epoch": 0.054743513567042976, + "grad_norm": 2.786459308227437, + "learning_rate": 1.996796894420136e-05, + "loss": 0.7131, + "step": 691 + }, + { + "epoch": 0.05482273717567835, + "grad_norm": 2.4692945603362593, + "learning_rate": 1.9967763398435904e-05, + "loss": 0.5474, + "step": 692 + }, + { + "epoch": 0.054901960784313725, + "grad_norm": 2.134773643144399, + "learning_rate": 1.9967557196341184e-05, + "loss": 0.5043, + "step": 693 + }, + { + "epoch": 0.0549811843929491, + "grad_norm": 2.3494652818548194, + "learning_rate": 1.996735033793079e-05, + "loss": 0.5797, + "step": 694 + }, + { + "epoch": 0.055060408001584474, + "grad_norm": 2.301315502766717, + "learning_rate": 1.996714282321833e-05, + "loss": 0.5363, + "step": 695 + }, + { + "epoch": 0.05513963161021985, + "grad_norm": 2.646761240488808, + "learning_rate": 1.9966934652217477e-05, + "loss": 0.6053, + "step": 696 + }, + { + "epoch": 0.055218855218855216, + "grad_norm": 2.415552203020133, + "learning_rate": 1.9966725824941933e-05, + "loss": 0.5301, + "step": 697 + }, + { + "epoch": 0.05529807882749059, + "grad_norm": 3.0458682362950897, + "learning_rate": 1.9966516341405452e-05, + "loss": 0.6386, + "step": 698 + }, + { + "epoch": 0.055377302436125965, + "grad_norm": 2.7936587781734903, + "learning_rate": 1.9966306201621826e-05, + "loss": 0.7439, + "step": 699 + }, + { + "epoch": 0.05545652604476134, + "grad_norm": 2.8706291071527685, + "learning_rate": 1.996609540560489e-05, + "loss": 0.6652, + "step": 700 + }, + { + "epoch": 0.055535749653396714, + "grad_norm": 2.3161894148621514, + "learning_rate": 1.9965883953368527e-05, + "loss": 0.5688, + "step": 701 + }, + { + "epoch": 0.05561497326203209, + "grad_norm": 2.8461840065841497, + "learning_rate": 1.9965671844926656e-05, + "loss": 0.5667, + "step": 702 + }, + { + "epoch": 0.055694196870667456, + "grad_norm": 2.9600619125581993, + "learning_rate": 1.9965459080293247e-05, + "loss": 0.6043, + "step": 703 + }, + { + "epoch": 0.05577342047930283, + "grad_norm": 2.397708845722197, + "learning_rate": 1.9965245659482312e-05, + "loss": 0.5245, + "step": 704 + }, + { + "epoch": 0.055852644087938205, + "grad_norm": 2.405841582394235, + "learning_rate": 1.9965031582507896e-05, + "loss": 0.5556, + "step": 705 + }, + { + "epoch": 0.05593186769657358, + "grad_norm": 3.100420003235673, + "learning_rate": 1.99648168493841e-05, + "loss": 0.6072, + "step": 706 + }, + { + "epoch": 0.056011091305208954, + "grad_norm": 2.889026280113388, + "learning_rate": 1.996460146012506e-05, + "loss": 0.7871, + "step": 707 + }, + { + "epoch": 0.05609031491384433, + "grad_norm": 2.3461840103487672, + "learning_rate": 1.996438541474496e-05, + "loss": 0.6629, + "step": 708 + }, + { + "epoch": 0.056169538522479696, + "grad_norm": 2.923544197881536, + "learning_rate": 1.996416871325803e-05, + "loss": 0.7147, + "step": 709 + }, + { + "epoch": 0.05624876213111507, + "grad_norm": 2.3962285535277017, + "learning_rate": 1.9963951355678533e-05, + "loss": 0.5197, + "step": 710 + }, + { + "epoch": 0.056327985739750445, + "grad_norm": 3.027128432204251, + "learning_rate": 1.996373334202078e-05, + "loss": 0.7684, + "step": 711 + }, + { + "epoch": 0.05640720934838582, + "grad_norm": 2.477609611604304, + "learning_rate": 1.9963514672299135e-05, + "loss": 0.6056, + "step": 712 + }, + { + "epoch": 0.056486432957021195, + "grad_norm": 2.3765566687849207, + "learning_rate": 1.9963295346527984e-05, + "loss": 0.5918, + "step": 713 + }, + { + "epoch": 0.05656565656565657, + "grad_norm": 2.939459877213976, + "learning_rate": 1.996307536472178e-05, + "loss": 0.6208, + "step": 714 + }, + { + "epoch": 0.05664488017429194, + "grad_norm": 2.9043336498673287, + "learning_rate": 1.9962854726894997e-05, + "loss": 0.6694, + "step": 715 + }, + { + "epoch": 0.05672410378292731, + "grad_norm": 2.4184683194597203, + "learning_rate": 1.9962633433062174e-05, + "loss": 0.5481, + "step": 716 + }, + { + "epoch": 0.056803327391562686, + "grad_norm": 2.4163807137940245, + "learning_rate": 1.996241148323787e-05, + "loss": 0.5587, + "step": 717 + }, + { + "epoch": 0.05688255100019806, + "grad_norm": 2.891850644404592, + "learning_rate": 1.996218887743671e-05, + "loss": 0.6685, + "step": 718 + }, + { + "epoch": 0.056961774608833435, + "grad_norm": 2.651254931314016, + "learning_rate": 1.996196561567335e-05, + "loss": 0.653, + "step": 719 + }, + { + "epoch": 0.0570409982174688, + "grad_norm": 2.57945858033426, + "learning_rate": 1.996174169796248e-05, + "loss": 0.5611, + "step": 720 + }, + { + "epoch": 0.05712022182610418, + "grad_norm": 2.2201838539606813, + "learning_rate": 1.996151712431886e-05, + "loss": 0.4784, + "step": 721 + }, + { + "epoch": 0.05719944543473955, + "grad_norm": 2.5032642726550627, + "learning_rate": 1.9961291894757267e-05, + "loss": 0.6104, + "step": 722 + }, + { + "epoch": 0.057278669043374926, + "grad_norm": 2.5685714936150292, + "learning_rate": 1.9961066009292532e-05, + "loss": 0.6313, + "step": 723 + }, + { + "epoch": 0.0573578926520103, + "grad_norm": 2.8109719313699215, + "learning_rate": 1.9960839467939534e-05, + "loss": 0.5291, + "step": 724 + }, + { + "epoch": 0.057437116260645675, + "grad_norm": 2.7771055788225065, + "learning_rate": 1.996061227071318e-05, + "loss": 0.6983, + "step": 725 + }, + { + "epoch": 0.05751633986928104, + "grad_norm": 2.0332474807315934, + "learning_rate": 1.996038441762844e-05, + "loss": 0.5158, + "step": 726 + }, + { + "epoch": 0.05759556347791642, + "grad_norm": 2.531385531966582, + "learning_rate": 1.9960155908700306e-05, + "loss": 0.3911, + "step": 727 + }, + { + "epoch": 0.05767478708655179, + "grad_norm": 2.5610767100035656, + "learning_rate": 1.9959926743943836e-05, + "loss": 0.7105, + "step": 728 + }, + { + "epoch": 0.057754010695187166, + "grad_norm": 2.4959407649057144, + "learning_rate": 1.9959696923374113e-05, + "loss": 0.5186, + "step": 729 + }, + { + "epoch": 0.05783323430382254, + "grad_norm": 2.509142926211617, + "learning_rate": 1.995946644700627e-05, + "loss": 0.4419, + "step": 730 + }, + { + "epoch": 0.057912457912457915, + "grad_norm": 2.222831618451293, + "learning_rate": 1.9959235314855485e-05, + "loss": 0.6019, + "step": 731 + }, + { + "epoch": 0.05799168152109328, + "grad_norm": 2.889053167249598, + "learning_rate": 1.9959003526936972e-05, + "loss": 0.6789, + "step": 732 + }, + { + "epoch": 0.05807090512972866, + "grad_norm": 2.5681290397033747, + "learning_rate": 1.9958771083266e-05, + "loss": 0.4969, + "step": 733 + }, + { + "epoch": 0.05815012873836403, + "grad_norm": 3.39418674973344, + "learning_rate": 1.995853798385787e-05, + "loss": 0.7614, + "step": 734 + }, + { + "epoch": 0.058229352346999406, + "grad_norm": 2.5749721111731576, + "learning_rate": 1.9958304228727928e-05, + "loss": 0.5518, + "step": 735 + }, + { + "epoch": 0.05830857595563478, + "grad_norm": 2.0726530624526394, + "learning_rate": 1.995806981789157e-05, + "loss": 0.5375, + "step": 736 + }, + { + "epoch": 0.058387799564270156, + "grad_norm": 2.3779351058228593, + "learning_rate": 1.9957834751364232e-05, + "loss": 0.6003, + "step": 737 + }, + { + "epoch": 0.05846702317290552, + "grad_norm": 2.4830709763837526, + "learning_rate": 1.995759902916139e-05, + "loss": 0.6315, + "step": 738 + }, + { + "epoch": 0.0585462467815409, + "grad_norm": 2.7198984614433095, + "learning_rate": 1.995736265129856e-05, + "loss": 0.5255, + "step": 739 + }, + { + "epoch": 0.05862547039017627, + "grad_norm": 2.491938822626573, + "learning_rate": 1.9957125617791314e-05, + "loss": 0.6831, + "step": 740 + }, + { + "epoch": 0.05870469399881165, + "grad_norm": 2.5592724079088436, + "learning_rate": 1.995688792865526e-05, + "loss": 0.5711, + "step": 741 + }, + { + "epoch": 0.05878391760744702, + "grad_norm": 2.334868396839467, + "learning_rate": 1.995664958390604e-05, + "loss": 0.7081, + "step": 742 + }, + { + "epoch": 0.058863141216082396, + "grad_norm": 2.1865386690707487, + "learning_rate": 1.995641058355936e-05, + "loss": 0.489, + "step": 743 + }, + { + "epoch": 0.05894236482471776, + "grad_norm": 2.3231421524398934, + "learning_rate": 1.9956170927630946e-05, + "loss": 0.4743, + "step": 744 + }, + { + "epoch": 0.05902158843335314, + "grad_norm": 2.494813284093219, + "learning_rate": 1.9955930616136582e-05, + "loss": 0.6098, + "step": 745 + }, + { + "epoch": 0.05910081204198851, + "grad_norm": 2.599401725454122, + "learning_rate": 1.995568964909209e-05, + "loss": 0.623, + "step": 746 + }, + { + "epoch": 0.05918003565062389, + "grad_norm": 2.3179147089299192, + "learning_rate": 1.995544802651334e-05, + "loss": 0.584, + "step": 747 + }, + { + "epoch": 0.05925925925925926, + "grad_norm": 2.691895102421814, + "learning_rate": 1.995520574841624e-05, + "loss": 0.5988, + "step": 748 + }, + { + "epoch": 0.059338482867894636, + "grad_norm": 3.3258773585574244, + "learning_rate": 1.9954962814816744e-05, + "loss": 0.6456, + "step": 749 + }, + { + "epoch": 0.059417706476530004, + "grad_norm": 2.6380093104467717, + "learning_rate": 1.9954719225730847e-05, + "loss": 0.5185, + "step": 750 + }, + { + "epoch": 0.05949693008516538, + "grad_norm": 3.212343708527098, + "learning_rate": 1.995447498117459e-05, + "loss": 0.551, + "step": 751 + }, + { + "epoch": 0.05957615369380075, + "grad_norm": 2.697230578361117, + "learning_rate": 1.9954230081164047e-05, + "loss": 0.5654, + "step": 752 + }, + { + "epoch": 0.05965537730243613, + "grad_norm": 2.9450702913170868, + "learning_rate": 1.9953984525715354e-05, + "loss": 0.6749, + "step": 753 + }, + { + "epoch": 0.0597346009110715, + "grad_norm": 2.55178518989023, + "learning_rate": 1.9953738314844676e-05, + "loss": 0.572, + "step": 754 + }, + { + "epoch": 0.05981382451970687, + "grad_norm": 3.200334715793516, + "learning_rate": 1.9953491448568222e-05, + "loss": 0.66, + "step": 755 + }, + { + "epoch": 0.059893048128342244, + "grad_norm": 3.345363979215846, + "learning_rate": 1.9953243926902254e-05, + "loss": 0.6548, + "step": 756 + }, + { + "epoch": 0.05997227173697762, + "grad_norm": 2.5378054199272144, + "learning_rate": 1.995299574986306e-05, + "loss": 0.6511, + "step": 757 + }, + { + "epoch": 0.06005149534561299, + "grad_norm": 2.452376974233593, + "learning_rate": 1.9952746917466988e-05, + "loss": 0.6255, + "step": 758 + }, + { + "epoch": 0.06013071895424837, + "grad_norm": 2.7109833783005177, + "learning_rate": 1.9952497429730423e-05, + "loss": 0.6317, + "step": 759 + }, + { + "epoch": 0.06020994256288374, + "grad_norm": 3.591650209262339, + "learning_rate": 1.9952247286669787e-05, + "loss": 0.6245, + "step": 760 + }, + { + "epoch": 0.06028916617151911, + "grad_norm": 2.663116080398295, + "learning_rate": 1.995199648830156e-05, + "loss": 0.5889, + "step": 761 + }, + { + "epoch": 0.060368389780154484, + "grad_norm": 2.355172346699632, + "learning_rate": 1.9951745034642245e-05, + "loss": 0.6441, + "step": 762 + }, + { + "epoch": 0.06044761338878986, + "grad_norm": 2.7028345660740127, + "learning_rate": 1.995149292570841e-05, + "loss": 0.7464, + "step": 763 + }, + { + "epoch": 0.06052683699742523, + "grad_norm": 2.340022724644699, + "learning_rate": 1.9951240161516643e-05, + "loss": 0.5095, + "step": 764 + }, + { + "epoch": 0.06060606060606061, + "grad_norm": 2.714666785154811, + "learning_rate": 1.9950986742083594e-05, + "loss": 0.7877, + "step": 765 + }, + { + "epoch": 0.06068528421469598, + "grad_norm": 2.580043567565731, + "learning_rate": 1.9950732667425953e-05, + "loss": 0.7249, + "step": 766 + }, + { + "epoch": 0.06076450782333135, + "grad_norm": 2.752091896509089, + "learning_rate": 1.9950477937560442e-05, + "loss": 0.6021, + "step": 767 + }, + { + "epoch": 0.060843731431966724, + "grad_norm": 2.8974863989681463, + "learning_rate": 1.995022255250384e-05, + "loss": 0.6006, + "step": 768 + }, + { + "epoch": 0.0609229550406021, + "grad_norm": 2.607699970135225, + "learning_rate": 1.9949966512272964e-05, + "loss": 0.6395, + "step": 769 + }, + { + "epoch": 0.06100217864923747, + "grad_norm": 2.615834203622895, + "learning_rate": 1.994970981688466e-05, + "loss": 0.6573, + "step": 770 + }, + { + "epoch": 0.06108140225787285, + "grad_norm": 2.9657669350241207, + "learning_rate": 1.9949452466355847e-05, + "loss": 0.5741, + "step": 771 + }, + { + "epoch": 0.06116062586650822, + "grad_norm": 2.5923109185032245, + "learning_rate": 1.9949194460703462e-05, + "loss": 0.5857, + "step": 772 + }, + { + "epoch": 0.06123984947514359, + "grad_norm": 2.561067586714167, + "learning_rate": 1.9948935799944492e-05, + "loss": 0.627, + "step": 773 + }, + { + "epoch": 0.061319073083778965, + "grad_norm": 2.581691505754139, + "learning_rate": 1.994867648409597e-05, + "loss": 0.5704, + "step": 774 + }, + { + "epoch": 0.06139829669241434, + "grad_norm": 2.734930223592304, + "learning_rate": 1.9948416513174976e-05, + "loss": 0.6628, + "step": 775 + }, + { + "epoch": 0.061477520301049714, + "grad_norm": 2.3722126461915667, + "learning_rate": 1.994815588719862e-05, + "loss": 0.5661, + "step": 776 + }, + { + "epoch": 0.06155674390968509, + "grad_norm": 2.7298165839923345, + "learning_rate": 1.9947894606184065e-05, + "loss": 0.6022, + "step": 777 + }, + { + "epoch": 0.06163596751832046, + "grad_norm": 2.750632686742977, + "learning_rate": 1.9947632670148517e-05, + "loss": 0.663, + "step": 778 + }, + { + "epoch": 0.06171519112695583, + "grad_norm": 2.6867070031547122, + "learning_rate": 1.9947370079109224e-05, + "loss": 0.6674, + "step": 779 + }, + { + "epoch": 0.061794414735591205, + "grad_norm": 2.981532808935078, + "learning_rate": 1.9947106833083474e-05, + "loss": 0.6409, + "step": 780 + }, + { + "epoch": 0.06187363834422658, + "grad_norm": 2.6442669257600557, + "learning_rate": 1.9946842932088603e-05, + "loss": 0.6983, + "step": 781 + }, + { + "epoch": 0.061952861952861954, + "grad_norm": 2.3548220132098945, + "learning_rate": 1.9946578376141985e-05, + "loss": 0.6266, + "step": 782 + }, + { + "epoch": 0.06203208556149733, + "grad_norm": 2.358681961536268, + "learning_rate": 1.9946313165261042e-05, + "loss": 0.4969, + "step": 783 + }, + { + "epoch": 0.062111309170132696, + "grad_norm": 2.2581926458343697, + "learning_rate": 1.9946047299463234e-05, + "loss": 0.4531, + "step": 784 + }, + { + "epoch": 0.06219053277876807, + "grad_norm": 2.962108618442753, + "learning_rate": 1.994578077876607e-05, + "loss": 0.6063, + "step": 785 + }, + { + "epoch": 0.062269756387403445, + "grad_norm": 3.110260041430944, + "learning_rate": 1.9945513603187096e-05, + "loss": 0.6105, + "step": 786 + }, + { + "epoch": 0.06234897999603882, + "grad_norm": 2.346190564223158, + "learning_rate": 1.994524577274391e-05, + "loss": 0.5769, + "step": 787 + }, + { + "epoch": 0.062428203604674194, + "grad_norm": 2.5600674199496574, + "learning_rate": 1.994497728745414e-05, + "loss": 0.6509, + "step": 788 + }, + { + "epoch": 0.06250742721330957, + "grad_norm": 2.431098877667765, + "learning_rate": 1.9944708147335466e-05, + "loss": 0.5991, + "step": 789 + }, + { + "epoch": 0.06258665082194494, + "grad_norm": 3.2183602707719987, + "learning_rate": 1.9944438352405614e-05, + "loss": 0.6811, + "step": 790 + }, + { + "epoch": 0.06266587443058032, + "grad_norm": 2.3598556923531757, + "learning_rate": 1.9944167902682345e-05, + "loss": 0.5922, + "step": 791 + }, + { + "epoch": 0.06274509803921569, + "grad_norm": 2.43703512731962, + "learning_rate": 1.994389679818347e-05, + "loss": 0.5734, + "step": 792 + }, + { + "epoch": 0.06282432164785105, + "grad_norm": 2.9309176599034057, + "learning_rate": 1.9943625038926834e-05, + "loss": 0.6582, + "step": 793 + }, + { + "epoch": 0.06290354525648643, + "grad_norm": 2.161936410664361, + "learning_rate": 1.9943352624930336e-05, + "loss": 0.567, + "step": 794 + }, + { + "epoch": 0.0629827688651218, + "grad_norm": 2.3171996758184026, + "learning_rate": 1.9943079556211915e-05, + "loss": 0.6401, + "step": 795 + }, + { + "epoch": 0.06306199247375718, + "grad_norm": 2.6941908990766374, + "learning_rate": 1.9942805832789548e-05, + "loss": 0.5644, + "step": 796 + }, + { + "epoch": 0.06314121608239255, + "grad_norm": 2.902003772442965, + "learning_rate": 1.9942531454681254e-05, + "loss": 0.5836, + "step": 797 + }, + { + "epoch": 0.06322043969102793, + "grad_norm": 2.6670085133384935, + "learning_rate": 1.994225642190511e-05, + "loss": 0.6748, + "step": 798 + }, + { + "epoch": 0.0632996632996633, + "grad_norm": 3.0540072374606573, + "learning_rate": 1.9941980734479214e-05, + "loss": 0.7282, + "step": 799 + }, + { + "epoch": 0.06337888690829867, + "grad_norm": 2.753682430792006, + "learning_rate": 1.994170439242173e-05, + "loss": 0.5858, + "step": 800 + }, + { + "epoch": 0.06345811051693405, + "grad_norm": 2.8482096534878725, + "learning_rate": 1.9941427395750844e-05, + "loss": 0.703, + "step": 801 + }, + { + "epoch": 0.06353733412556942, + "grad_norm": 2.412255560545647, + "learning_rate": 1.99411497444848e-05, + "loss": 0.5168, + "step": 802 + }, + { + "epoch": 0.0636165577342048, + "grad_norm": 2.3058684501414577, + "learning_rate": 1.994087143864188e-05, + "loss": 0.5101, + "step": 803 + }, + { + "epoch": 0.06369578134284017, + "grad_norm": 2.3312432210310003, + "learning_rate": 1.994059247824041e-05, + "loss": 0.6279, + "step": 804 + }, + { + "epoch": 0.06377500495147553, + "grad_norm": 2.4541675562418814, + "learning_rate": 1.994031286329875e-05, + "loss": 0.4586, + "step": 805 + }, + { + "epoch": 0.06385422856011091, + "grad_norm": 1.861930279281903, + "learning_rate": 1.9940032593835324e-05, + "loss": 0.499, + "step": 806 + }, + { + "epoch": 0.06393345216874628, + "grad_norm": 2.5420727710667137, + "learning_rate": 1.993975166986858e-05, + "loss": 0.6231, + "step": 807 + }, + { + "epoch": 0.06401267577738166, + "grad_norm": 2.488083310242034, + "learning_rate": 1.9939470091417012e-05, + "loss": 0.5683, + "step": 808 + }, + { + "epoch": 0.06409189938601703, + "grad_norm": 2.361916836508993, + "learning_rate": 1.9939187858499166e-05, + "loss": 0.6398, + "step": 809 + }, + { + "epoch": 0.06417112299465241, + "grad_norm": 2.6141111543939446, + "learning_rate": 1.9938904971133626e-05, + "loss": 0.5686, + "step": 810 + }, + { + "epoch": 0.06425034660328778, + "grad_norm": 3.0303566545321767, + "learning_rate": 1.9938621429339012e-05, + "loss": 0.5517, + "step": 811 + }, + { + "epoch": 0.06432957021192315, + "grad_norm": 2.524730690336445, + "learning_rate": 1.9938337233134e-05, + "loss": 0.4779, + "step": 812 + }, + { + "epoch": 0.06440879382055853, + "grad_norm": 2.6543283733083998, + "learning_rate": 1.9938052382537304e-05, + "loss": 0.5784, + "step": 813 + }, + { + "epoch": 0.0644880174291939, + "grad_norm": 2.3359737642548395, + "learning_rate": 1.9937766877567676e-05, + "loss": 0.622, + "step": 814 + }, + { + "epoch": 0.06456724103782928, + "grad_norm": 2.53017447133631, + "learning_rate": 1.9937480718243914e-05, + "loss": 0.6311, + "step": 815 + }, + { + "epoch": 0.06464646464646465, + "grad_norm": 2.709306314384742, + "learning_rate": 1.9937193904584865e-05, + "loss": 0.6351, + "step": 816 + }, + { + "epoch": 0.06472568825510001, + "grad_norm": 2.366553536621406, + "learning_rate": 1.9936906436609413e-05, + "loss": 0.5853, + "step": 817 + }, + { + "epoch": 0.0648049118637354, + "grad_norm": 3.01701329921683, + "learning_rate": 1.9936618314336486e-05, + "loss": 0.5695, + "step": 818 + }, + { + "epoch": 0.06488413547237076, + "grad_norm": 2.388744593155798, + "learning_rate": 1.9936329537785054e-05, + "loss": 0.4993, + "step": 819 + }, + { + "epoch": 0.06496335908100614, + "grad_norm": 2.310655509082259, + "learning_rate": 1.9936040106974132e-05, + "loss": 0.5861, + "step": 820 + }, + { + "epoch": 0.06504258268964151, + "grad_norm": 2.336793716650803, + "learning_rate": 1.9935750021922778e-05, + "loss": 0.5873, + "step": 821 + }, + { + "epoch": 0.06512180629827688, + "grad_norm": 2.114529169594679, + "learning_rate": 1.993545928265009e-05, + "loss": 0.5935, + "step": 822 + }, + { + "epoch": 0.06520102990691226, + "grad_norm": 2.692553344124309, + "learning_rate": 1.993516788917522e-05, + "loss": 0.6219, + "step": 823 + }, + { + "epoch": 0.06528025351554763, + "grad_norm": 2.389893720618022, + "learning_rate": 1.9934875841517346e-05, + "loss": 0.5645, + "step": 824 + }, + { + "epoch": 0.06535947712418301, + "grad_norm": 2.484716621739794, + "learning_rate": 1.9934583139695703e-05, + "loss": 0.5553, + "step": 825 + }, + { + "epoch": 0.06543870073281838, + "grad_norm": 3.0462642760778618, + "learning_rate": 1.9934289783729564e-05, + "loss": 0.6167, + "step": 826 + }, + { + "epoch": 0.06551792434145376, + "grad_norm": 2.2452716816803076, + "learning_rate": 1.993399577363824e-05, + "loss": 0.5349, + "step": 827 + }, + { + "epoch": 0.06559714795008913, + "grad_norm": 2.8885614555168337, + "learning_rate": 1.9933701109441093e-05, + "loss": 0.6317, + "step": 828 + }, + { + "epoch": 0.0656763715587245, + "grad_norm": 2.6602159378281987, + "learning_rate": 1.993340579115753e-05, + "loss": 0.6245, + "step": 829 + }, + { + "epoch": 0.06575559516735988, + "grad_norm": 2.6165913358272075, + "learning_rate": 1.993310981880699e-05, + "loss": 0.6299, + "step": 830 + }, + { + "epoch": 0.06583481877599524, + "grad_norm": 2.371449325243517, + "learning_rate": 1.9932813192408964e-05, + "loss": 0.5249, + "step": 831 + }, + { + "epoch": 0.06591404238463063, + "grad_norm": 2.357942686977399, + "learning_rate": 1.9932515911982983e-05, + "loss": 0.6708, + "step": 832 + }, + { + "epoch": 0.06599326599326599, + "grad_norm": 2.880553589653365, + "learning_rate": 1.993221797754862e-05, + "loss": 0.653, + "step": 833 + }, + { + "epoch": 0.06607248960190136, + "grad_norm": 2.310695758148777, + "learning_rate": 1.9931919389125496e-05, + "loss": 0.5041, + "step": 834 + }, + { + "epoch": 0.06615171321053674, + "grad_norm": 2.4386376078799747, + "learning_rate": 1.9931620146733264e-05, + "loss": 0.5969, + "step": 835 + }, + { + "epoch": 0.06623093681917211, + "grad_norm": 2.400077208564521, + "learning_rate": 1.993132025039164e-05, + "loss": 0.5913, + "step": 836 + }, + { + "epoch": 0.06631016042780749, + "grad_norm": 2.3442029871090146, + "learning_rate": 1.9931019700120363e-05, + "loss": 0.5431, + "step": 837 + }, + { + "epoch": 0.06638938403644286, + "grad_norm": 2.2883192943085717, + "learning_rate": 1.9930718495939222e-05, + "loss": 0.5296, + "step": 838 + }, + { + "epoch": 0.06646860764507824, + "grad_norm": 2.346789549641395, + "learning_rate": 1.9930416637868053e-05, + "loss": 0.4923, + "step": 839 + }, + { + "epoch": 0.06654783125371361, + "grad_norm": 2.614121672017315, + "learning_rate": 1.993011412592673e-05, + "loss": 0.6086, + "step": 840 + }, + { + "epoch": 0.06662705486234898, + "grad_norm": 3.0534538592411766, + "learning_rate": 1.992981096013517e-05, + "loss": 0.6922, + "step": 841 + }, + { + "epoch": 0.06670627847098436, + "grad_norm": 3.017815351014294, + "learning_rate": 1.9929507140513342e-05, + "loss": 0.6948, + "step": 842 + }, + { + "epoch": 0.06678550207961972, + "grad_norm": 2.360489736437008, + "learning_rate": 1.9929202667081246e-05, + "loss": 0.5377, + "step": 843 + }, + { + "epoch": 0.0668647256882551, + "grad_norm": 2.5539470848512007, + "learning_rate": 1.9928897539858926e-05, + "loss": 0.5094, + "step": 844 + }, + { + "epoch": 0.06694394929689047, + "grad_norm": 2.2777283063451534, + "learning_rate": 1.992859175886648e-05, + "loss": 0.5961, + "step": 845 + }, + { + "epoch": 0.06702317290552584, + "grad_norm": 2.0505572977239828, + "learning_rate": 1.9928285324124038e-05, + "loss": 0.5134, + "step": 846 + }, + { + "epoch": 0.06710239651416122, + "grad_norm": 2.5129650303866047, + "learning_rate": 1.9927978235651782e-05, + "loss": 0.5376, + "step": 847 + }, + { + "epoch": 0.06718162012279659, + "grad_norm": 3.221723448132506, + "learning_rate": 1.992767049346993e-05, + "loss": 0.6556, + "step": 848 + }, + { + "epoch": 0.06726084373143197, + "grad_norm": 2.76830146639477, + "learning_rate": 1.9927362097598746e-05, + "loss": 0.6053, + "step": 849 + }, + { + "epoch": 0.06734006734006734, + "grad_norm": 2.233404358137078, + "learning_rate": 1.9927053048058534e-05, + "loss": 0.5713, + "step": 850 + }, + { + "epoch": 0.06741929094870272, + "grad_norm": 2.9886397534117837, + "learning_rate": 1.9926743344869645e-05, + "loss": 0.5631, + "step": 851 + }, + { + "epoch": 0.06749851455733809, + "grad_norm": 2.7133581210801947, + "learning_rate": 1.992643298805247e-05, + "loss": 0.63, + "step": 852 + }, + { + "epoch": 0.06757773816597346, + "grad_norm": 2.7880183431147634, + "learning_rate": 1.9926121977627447e-05, + "loss": 0.5981, + "step": 853 + }, + { + "epoch": 0.06765696177460884, + "grad_norm": 2.283195629083946, + "learning_rate": 1.9925810313615052e-05, + "loss": 0.5193, + "step": 854 + }, + { + "epoch": 0.0677361853832442, + "grad_norm": 2.4685472107825013, + "learning_rate": 1.9925497996035807e-05, + "loss": 0.608, + "step": 855 + }, + { + "epoch": 0.06781540899187959, + "grad_norm": 2.340252814536461, + "learning_rate": 1.992518502491028e-05, + "loss": 0.6015, + "step": 856 + }, + { + "epoch": 0.06789463260051495, + "grad_norm": 2.670901520367536, + "learning_rate": 1.9924871400259074e-05, + "loss": 0.617, + "step": 857 + }, + { + "epoch": 0.06797385620915032, + "grad_norm": 2.5836058464362823, + "learning_rate": 1.9924557122102843e-05, + "loss": 0.6712, + "step": 858 + }, + { + "epoch": 0.0680530798177857, + "grad_norm": 2.373915203317525, + "learning_rate": 1.9924242190462276e-05, + "loss": 0.6829, + "step": 859 + }, + { + "epoch": 0.06813230342642107, + "grad_norm": 2.9855954138436975, + "learning_rate": 1.992392660535812e-05, + "loss": 0.7835, + "step": 860 + }, + { + "epoch": 0.06821152703505645, + "grad_norm": 2.922510848840876, + "learning_rate": 1.9923610366811142e-05, + "loss": 0.7843, + "step": 861 + }, + { + "epoch": 0.06829075064369182, + "grad_norm": 2.279622275703524, + "learning_rate": 1.9923293474842175e-05, + "loss": 0.6269, + "step": 862 + }, + { + "epoch": 0.06836997425232719, + "grad_norm": 2.5275665929230935, + "learning_rate": 1.9922975929472076e-05, + "loss": 0.669, + "step": 863 + }, + { + "epoch": 0.06844919786096257, + "grad_norm": 2.6381708069523375, + "learning_rate": 1.9922657730721758e-05, + "loss": 0.6102, + "step": 864 + }, + { + "epoch": 0.06852842146959794, + "grad_norm": 2.4006202588321903, + "learning_rate": 1.9922338878612177e-05, + "loss": 0.5943, + "step": 865 + }, + { + "epoch": 0.06860764507823332, + "grad_norm": 2.2801290873509177, + "learning_rate": 1.9922019373164324e-05, + "loss": 0.473, + "step": 866 + }, + { + "epoch": 0.06868686868686869, + "grad_norm": 2.2776237544692366, + "learning_rate": 1.9921699214399238e-05, + "loss": 0.5833, + "step": 867 + }, + { + "epoch": 0.06876609229550407, + "grad_norm": 2.67064385104672, + "learning_rate": 1.9921378402337996e-05, + "loss": 0.5563, + "step": 868 + }, + { + "epoch": 0.06884531590413943, + "grad_norm": 2.3761074502160997, + "learning_rate": 1.9921056937001725e-05, + "loss": 0.5406, + "step": 869 + }, + { + "epoch": 0.0689245395127748, + "grad_norm": 2.6282969248165413, + "learning_rate": 1.9920734818411592e-05, + "loss": 0.5174, + "step": 870 + }, + { + "epoch": 0.06900376312141018, + "grad_norm": 2.768547190282355, + "learning_rate": 1.9920412046588807e-05, + "loss": 0.5631, + "step": 871 + }, + { + "epoch": 0.06908298673004555, + "grad_norm": 2.738677217126626, + "learning_rate": 1.992008862155462e-05, + "loss": 0.5762, + "step": 872 + }, + { + "epoch": 0.06916221033868093, + "grad_norm": 2.5047554390696862, + "learning_rate": 1.9919764543330334e-05, + "loss": 0.4896, + "step": 873 + }, + { + "epoch": 0.0692414339473163, + "grad_norm": 2.7292928450358644, + "learning_rate": 1.9919439811937283e-05, + "loss": 0.651, + "step": 874 + }, + { + "epoch": 0.06932065755595167, + "grad_norm": 2.4716401373212395, + "learning_rate": 1.991911442739685e-05, + "loss": 0.6114, + "step": 875 + }, + { + "epoch": 0.06939988116458705, + "grad_norm": 2.427938301520501, + "learning_rate": 1.9918788389730457e-05, + "loss": 0.5364, + "step": 876 + }, + { + "epoch": 0.06947910477322242, + "grad_norm": 2.3805652833263915, + "learning_rate": 1.9918461698959576e-05, + "loss": 0.6165, + "step": 877 + }, + { + "epoch": 0.0695583283818578, + "grad_norm": 2.147475091562914, + "learning_rate": 1.9918134355105717e-05, + "loss": 0.4989, + "step": 878 + }, + { + "epoch": 0.06963755199049317, + "grad_norm": 2.2841456029641796, + "learning_rate": 1.9917806358190434e-05, + "loss": 0.4942, + "step": 879 + }, + { + "epoch": 0.06971677559912855, + "grad_norm": 2.1208825659876975, + "learning_rate": 1.9917477708235324e-05, + "loss": 0.5941, + "step": 880 + }, + { + "epoch": 0.06979599920776391, + "grad_norm": 2.4247452574228547, + "learning_rate": 1.9917148405262027e-05, + "loss": 0.6446, + "step": 881 + }, + { + "epoch": 0.06987522281639928, + "grad_norm": 2.5975848014190666, + "learning_rate": 1.9916818449292223e-05, + "loss": 0.5433, + "step": 882 + }, + { + "epoch": 0.06995444642503466, + "grad_norm": 2.238138002042094, + "learning_rate": 1.9916487840347644e-05, + "loss": 0.6243, + "step": 883 + }, + { + "epoch": 0.07003367003367003, + "grad_norm": 2.6730194021003677, + "learning_rate": 1.9916156578450052e-05, + "loss": 0.7004, + "step": 884 + }, + { + "epoch": 0.07011289364230541, + "grad_norm": 2.22251494252007, + "learning_rate": 1.9915824663621267e-05, + "loss": 0.5966, + "step": 885 + }, + { + "epoch": 0.07019211725094078, + "grad_norm": 2.102441178207004, + "learning_rate": 1.991549209588314e-05, + "loss": 0.4584, + "step": 886 + }, + { + "epoch": 0.07027134085957615, + "grad_norm": 2.480618541833406, + "learning_rate": 1.9915158875257566e-05, + "loss": 0.5416, + "step": 887 + }, + { + "epoch": 0.07035056446821153, + "grad_norm": 2.2487807086735523, + "learning_rate": 1.991482500176649e-05, + "loss": 0.4912, + "step": 888 + }, + { + "epoch": 0.0704297880768469, + "grad_norm": 2.3755036554761855, + "learning_rate": 1.9914490475431892e-05, + "loss": 0.5761, + "step": 889 + }, + { + "epoch": 0.07050901168548228, + "grad_norm": 2.0612764095223457, + "learning_rate": 1.9914155296275804e-05, + "loss": 0.5633, + "step": 890 + }, + { + "epoch": 0.07058823529411765, + "grad_norm": 2.450533835665693, + "learning_rate": 1.9913819464320295e-05, + "loss": 0.446, + "step": 891 + }, + { + "epoch": 0.07066745890275301, + "grad_norm": 2.085665128858314, + "learning_rate": 1.9913482979587473e-05, + "loss": 0.543, + "step": 892 + }, + { + "epoch": 0.0707466825113884, + "grad_norm": 2.311629385161046, + "learning_rate": 1.9913145842099503e-05, + "loss": 0.5764, + "step": 893 + }, + { + "epoch": 0.07082590612002376, + "grad_norm": 2.384799683655331, + "learning_rate": 1.9912808051878575e-05, + "loss": 0.5429, + "step": 894 + }, + { + "epoch": 0.07090512972865914, + "grad_norm": 2.5044835997295856, + "learning_rate": 1.9912469608946932e-05, + "loss": 0.5312, + "step": 895 + }, + { + "epoch": 0.07098435333729451, + "grad_norm": 2.1919015800368196, + "learning_rate": 1.9912130513326863e-05, + "loss": 0.6025, + "step": 896 + }, + { + "epoch": 0.0710635769459299, + "grad_norm": 2.9519507325462317, + "learning_rate": 1.9911790765040697e-05, + "loss": 0.6424, + "step": 897 + }, + { + "epoch": 0.07114280055456526, + "grad_norm": 2.50623357058056, + "learning_rate": 1.9911450364110798e-05, + "loss": 0.6322, + "step": 898 + }, + { + "epoch": 0.07122202416320063, + "grad_norm": 2.2688446065718675, + "learning_rate": 1.9911109310559583e-05, + "loss": 0.5411, + "step": 899 + }, + { + "epoch": 0.07130124777183601, + "grad_norm": 2.1621910834786737, + "learning_rate": 1.991076760440951e-05, + "loss": 0.5956, + "step": 900 + }, + { + "epoch": 0.07138047138047138, + "grad_norm": 2.323059316434927, + "learning_rate": 1.991042524568308e-05, + "loss": 0.5644, + "step": 901 + }, + { + "epoch": 0.07145969498910676, + "grad_norm": 2.139321254896373, + "learning_rate": 1.991008223440283e-05, + "loss": 0.5794, + "step": 902 + }, + { + "epoch": 0.07153891859774213, + "grad_norm": 2.255598119425402, + "learning_rate": 1.9909738570591352e-05, + "loss": 0.5271, + "step": 903 + }, + { + "epoch": 0.0716181422063775, + "grad_norm": 2.4163889519624657, + "learning_rate": 1.990939425427127e-05, + "loss": 0.5916, + "step": 904 + }, + { + "epoch": 0.07169736581501288, + "grad_norm": 2.457627836614485, + "learning_rate": 1.9909049285465258e-05, + "loss": 0.6217, + "step": 905 + }, + { + "epoch": 0.07177658942364824, + "grad_norm": 2.284404778088907, + "learning_rate": 1.990870366419603e-05, + "loss": 0.5575, + "step": 906 + }, + { + "epoch": 0.07185581303228362, + "grad_norm": 2.2658243473432287, + "learning_rate": 1.9908357390486342e-05, + "loss": 0.5829, + "step": 907 + }, + { + "epoch": 0.07193503664091899, + "grad_norm": 2.527317543491671, + "learning_rate": 1.9908010464358997e-05, + "loss": 0.5473, + "step": 908 + }, + { + "epoch": 0.07201426024955437, + "grad_norm": 2.578772380055916, + "learning_rate": 1.9907662885836836e-05, + "loss": 0.6369, + "step": 909 + }, + { + "epoch": 0.07209348385818974, + "grad_norm": 2.6512834477426526, + "learning_rate": 1.9907314654942748e-05, + "loss": 0.5202, + "step": 910 + }, + { + "epoch": 0.07217270746682511, + "grad_norm": 2.7526226665382842, + "learning_rate": 1.990696577169966e-05, + "loss": 0.6889, + "step": 911 + }, + { + "epoch": 0.07225193107546049, + "grad_norm": 2.25739059904445, + "learning_rate": 1.9906616236130543e-05, + "loss": 0.4722, + "step": 912 + }, + { + "epoch": 0.07233115468409586, + "grad_norm": 2.1917311953311533, + "learning_rate": 1.990626604825842e-05, + "loss": 0.5757, + "step": 913 + }, + { + "epoch": 0.07241037829273124, + "grad_norm": 2.5090957024568743, + "learning_rate": 1.9905915208106342e-05, + "loss": 0.4883, + "step": 914 + }, + { + "epoch": 0.07248960190136661, + "grad_norm": 2.4103195657044925, + "learning_rate": 1.990556371569741e-05, + "loss": 0.6352, + "step": 915 + }, + { + "epoch": 0.07256882551000197, + "grad_norm": 2.741830933995138, + "learning_rate": 1.990521157105477e-05, + "loss": 0.5133, + "step": 916 + }, + { + "epoch": 0.07264804911863736, + "grad_norm": 2.4749257639171796, + "learning_rate": 1.990485877420161e-05, + "loss": 0.5508, + "step": 917 + }, + { + "epoch": 0.07272727272727272, + "grad_norm": 2.7743935671534246, + "learning_rate": 1.990450532516116e-05, + "loss": 0.5859, + "step": 918 + }, + { + "epoch": 0.0728064963359081, + "grad_norm": 2.6901306746022775, + "learning_rate": 1.9904151223956688e-05, + "loss": 0.5363, + "step": 919 + }, + { + "epoch": 0.07288571994454347, + "grad_norm": 2.270580183753606, + "learning_rate": 1.9903796470611515e-05, + "loss": 0.4755, + "step": 920 + }, + { + "epoch": 0.07296494355317884, + "grad_norm": 2.4964563143971383, + "learning_rate": 1.9903441065149e-05, + "loss": 0.5628, + "step": 921 + }, + { + "epoch": 0.07304416716181422, + "grad_norm": 2.3148142675786, + "learning_rate": 1.990308500759254e-05, + "loss": 0.5291, + "step": 922 + }, + { + "epoch": 0.07312339077044959, + "grad_norm": 2.759305898022117, + "learning_rate": 1.9902728297965586e-05, + "loss": 0.7155, + "step": 923 + }, + { + "epoch": 0.07320261437908497, + "grad_norm": 2.391039701229346, + "learning_rate": 1.990237093629162e-05, + "loss": 0.6283, + "step": 924 + }, + { + "epoch": 0.07328183798772034, + "grad_norm": 2.4341409272036567, + "learning_rate": 1.9902012922594178e-05, + "loss": 0.6436, + "step": 925 + }, + { + "epoch": 0.07336106159635572, + "grad_norm": 2.629153944855865, + "learning_rate": 1.990165425689683e-05, + "loss": 0.5817, + "step": 926 + }, + { + "epoch": 0.07344028520499109, + "grad_norm": 2.999800542377723, + "learning_rate": 1.9901294939223192e-05, + "loss": 0.7025, + "step": 927 + }, + { + "epoch": 0.07351950881362646, + "grad_norm": 2.3315986739064147, + "learning_rate": 1.9900934969596925e-05, + "loss": 0.5782, + "step": 928 + }, + { + "epoch": 0.07359873242226184, + "grad_norm": 2.19586525154139, + "learning_rate": 1.9900574348041728e-05, + "loss": 0.4544, + "step": 929 + }, + { + "epoch": 0.0736779560308972, + "grad_norm": 2.333981826493017, + "learning_rate": 1.990021307458135e-05, + "loss": 0.5572, + "step": 930 + }, + { + "epoch": 0.07375717963953259, + "grad_norm": 2.762353900418872, + "learning_rate": 1.989985114923958e-05, + "loss": 0.622, + "step": 931 + }, + { + "epoch": 0.07383640324816795, + "grad_norm": 2.2765380969186335, + "learning_rate": 1.9899488572040244e-05, + "loss": 0.4791, + "step": 932 + }, + { + "epoch": 0.07391562685680332, + "grad_norm": 2.3601711611150793, + "learning_rate": 1.989912534300722e-05, + "loss": 0.5191, + "step": 933 + }, + { + "epoch": 0.0739948504654387, + "grad_norm": 2.588364259136482, + "learning_rate": 1.9898761462164425e-05, + "loss": 0.6849, + "step": 934 + }, + { + "epoch": 0.07407407407407407, + "grad_norm": 2.64012916369795, + "learning_rate": 1.989839692953581e-05, + "loss": 0.59, + "step": 935 + }, + { + "epoch": 0.07415329768270945, + "grad_norm": 2.384470253660101, + "learning_rate": 1.9898031745145397e-05, + "loss": 0.5794, + "step": 936 + }, + { + "epoch": 0.07423252129134482, + "grad_norm": 2.7702633138713377, + "learning_rate": 1.989766590901721e-05, + "loss": 0.6083, + "step": 937 + }, + { + "epoch": 0.0743117448999802, + "grad_norm": 3.1284189033255627, + "learning_rate": 1.9897299421175353e-05, + "loss": 0.7033, + "step": 938 + }, + { + "epoch": 0.07439096850861557, + "grad_norm": 3.0776085969214386, + "learning_rate": 1.989693228164395e-05, + "loss": 0.6533, + "step": 939 + }, + { + "epoch": 0.07447019211725094, + "grad_norm": 2.4372575665676286, + "learning_rate": 1.989656449044718e-05, + "loss": 0.5605, + "step": 940 + }, + { + "epoch": 0.07454941572588632, + "grad_norm": 2.3548609576621025, + "learning_rate": 1.9896196047609255e-05, + "loss": 0.6597, + "step": 941 + }, + { + "epoch": 0.07462863933452168, + "grad_norm": 2.5150272857529754, + "learning_rate": 1.9895826953154437e-05, + "loss": 0.5906, + "step": 942 + }, + { + "epoch": 0.07470786294315707, + "grad_norm": 2.34275629040273, + "learning_rate": 1.9895457207107032e-05, + "loss": 0.593, + "step": 943 + }, + { + "epoch": 0.07478708655179243, + "grad_norm": 2.321333121037487, + "learning_rate": 1.9895086809491384e-05, + "loss": 0.5768, + "step": 944 + }, + { + "epoch": 0.0748663101604278, + "grad_norm": 2.6139333663849125, + "learning_rate": 1.989471576033188e-05, + "loss": 0.5813, + "step": 945 + }, + { + "epoch": 0.07494553376906318, + "grad_norm": 2.6229210683811566, + "learning_rate": 1.9894344059652953e-05, + "loss": 0.5276, + "step": 946 + }, + { + "epoch": 0.07502475737769855, + "grad_norm": 2.691579310859323, + "learning_rate": 1.989397170747908e-05, + "loss": 0.6681, + "step": 947 + }, + { + "epoch": 0.07510398098633393, + "grad_norm": 2.2628838224488996, + "learning_rate": 1.9893598703834773e-05, + "loss": 0.5727, + "step": 948 + }, + { + "epoch": 0.0751832045949693, + "grad_norm": 2.5417131403763076, + "learning_rate": 1.98932250487446e-05, + "loss": 0.5811, + "step": 949 + }, + { + "epoch": 0.07526242820360468, + "grad_norm": 2.6594808728801533, + "learning_rate": 1.989285074223316e-05, + "loss": 0.6061, + "step": 950 + }, + { + "epoch": 0.07534165181224005, + "grad_norm": 2.1338151286853706, + "learning_rate": 1.98924757843251e-05, + "loss": 0.6154, + "step": 951 + }, + { + "epoch": 0.07542087542087542, + "grad_norm": 2.2963129103002236, + "learning_rate": 1.989210017504511e-05, + "loss": 0.5281, + "step": 952 + }, + { + "epoch": 0.0755000990295108, + "grad_norm": 2.457216823104847, + "learning_rate": 1.989172391441792e-05, + "loss": 0.6455, + "step": 953 + }, + { + "epoch": 0.07557932263814617, + "grad_norm": 2.893209777574209, + "learning_rate": 1.9891347002468307e-05, + "loss": 0.6824, + "step": 954 + }, + { + "epoch": 0.07565854624678155, + "grad_norm": 2.0640640545519116, + "learning_rate": 1.9890969439221086e-05, + "loss": 0.4339, + "step": 955 + }, + { + "epoch": 0.07573776985541691, + "grad_norm": 2.543695427342178, + "learning_rate": 1.989059122470112e-05, + "loss": 0.6042, + "step": 956 + }, + { + "epoch": 0.07581699346405228, + "grad_norm": 2.2370983000132925, + "learning_rate": 1.9890212358933316e-05, + "loss": 0.5077, + "step": 957 + }, + { + "epoch": 0.07589621707268766, + "grad_norm": 2.4802801842018987, + "learning_rate": 1.9889832841942613e-05, + "loss": 0.629, + "step": 958 + }, + { + "epoch": 0.07597544068132303, + "grad_norm": 2.8108090868001896, + "learning_rate": 1.988945267375401e-05, + "loss": 0.6172, + "step": 959 + }, + { + "epoch": 0.07605466428995841, + "grad_norm": 2.268798044596798, + "learning_rate": 1.9889071854392528e-05, + "loss": 0.5143, + "step": 960 + }, + { + "epoch": 0.07613388789859378, + "grad_norm": 2.4041933150755406, + "learning_rate": 1.9888690383883247e-05, + "loss": 0.4789, + "step": 961 + }, + { + "epoch": 0.07621311150722915, + "grad_norm": 2.2787345334316584, + "learning_rate": 1.9888308262251286e-05, + "loss": 0.5178, + "step": 962 + }, + { + "epoch": 0.07629233511586453, + "grad_norm": 2.3698462560557805, + "learning_rate": 1.988792548952181e-05, + "loss": 0.5473, + "step": 963 + }, + { + "epoch": 0.0763715587244999, + "grad_norm": 2.444233132166127, + "learning_rate": 1.9887542065720013e-05, + "loss": 0.6192, + "step": 964 + }, + { + "epoch": 0.07645078233313528, + "grad_norm": 2.219272390994219, + "learning_rate": 1.988715799087115e-05, + "loss": 0.575, + "step": 965 + }, + { + "epoch": 0.07653000594177065, + "grad_norm": 2.298572045122237, + "learning_rate": 1.9886773265000502e-05, + "loss": 0.527, + "step": 966 + }, + { + "epoch": 0.07660922955040603, + "grad_norm": 2.1482108494168135, + "learning_rate": 1.9886387888133413e-05, + "loss": 0.4366, + "step": 967 + }, + { + "epoch": 0.0766884531590414, + "grad_norm": 2.1101315267819203, + "learning_rate": 1.988600186029525e-05, + "loss": 0.5012, + "step": 968 + }, + { + "epoch": 0.07676767676767676, + "grad_norm": 2.7321741556839667, + "learning_rate": 1.988561518151143e-05, + "loss": 0.477, + "step": 969 + }, + { + "epoch": 0.07684690037631214, + "grad_norm": 2.480594396625295, + "learning_rate": 1.988522785180742e-05, + "loss": 0.6292, + "step": 970 + }, + { + "epoch": 0.07692612398494751, + "grad_norm": 2.1796398221460196, + "learning_rate": 1.9884839871208717e-05, + "loss": 0.5449, + "step": 971 + }, + { + "epoch": 0.07700534759358289, + "grad_norm": 2.072570670527955, + "learning_rate": 1.9884451239740877e-05, + "loss": 0.4721, + "step": 972 + }, + { + "epoch": 0.07708457120221826, + "grad_norm": 2.299809558046713, + "learning_rate": 1.988406195742948e-05, + "loss": 0.5759, + "step": 973 + }, + { + "epoch": 0.07716379481085363, + "grad_norm": 2.13576691936463, + "learning_rate": 1.9883672024300163e-05, + "loss": 0.4433, + "step": 974 + }, + { + "epoch": 0.07724301841948901, + "grad_norm": 2.4520707098449064, + "learning_rate": 1.98832814403786e-05, + "loss": 0.5766, + "step": 975 + }, + { + "epoch": 0.07732224202812438, + "grad_norm": 2.766327843132799, + "learning_rate": 1.988289020569051e-05, + "loss": 0.6727, + "step": 976 + }, + { + "epoch": 0.07740146563675976, + "grad_norm": 2.5064951346112, + "learning_rate": 1.9882498320261652e-05, + "loss": 0.5935, + "step": 977 + }, + { + "epoch": 0.07748068924539513, + "grad_norm": 2.425900782785911, + "learning_rate": 1.9882105784117835e-05, + "loss": 0.5033, + "step": 978 + }, + { + "epoch": 0.07755991285403051, + "grad_norm": 2.5647047758604216, + "learning_rate": 1.98817125972849e-05, + "loss": 0.6163, + "step": 979 + }, + { + "epoch": 0.07763913646266588, + "grad_norm": 2.3390776043312136, + "learning_rate": 1.9881318759788738e-05, + "loss": 0.6479, + "step": 980 + }, + { + "epoch": 0.07771836007130124, + "grad_norm": 2.29016226420427, + "learning_rate": 1.988092427165528e-05, + "loss": 0.4766, + "step": 981 + }, + { + "epoch": 0.07779758367993662, + "grad_norm": 2.93436897178446, + "learning_rate": 1.98805291329105e-05, + "loss": 0.6694, + "step": 982 + }, + { + "epoch": 0.07787680728857199, + "grad_norm": 2.2474360030425125, + "learning_rate": 1.9880133343580423e-05, + "loss": 0.6023, + "step": 983 + }, + { + "epoch": 0.07795603089720737, + "grad_norm": 2.459388822542379, + "learning_rate": 1.9879736903691107e-05, + "loss": 0.6372, + "step": 984 + }, + { + "epoch": 0.07803525450584274, + "grad_norm": 2.619687500158315, + "learning_rate": 1.9879339813268653e-05, + "loss": 0.6342, + "step": 985 + }, + { + "epoch": 0.07811447811447811, + "grad_norm": 2.145655543525274, + "learning_rate": 1.9878942072339208e-05, + "loss": 0.4446, + "step": 986 + }, + { + "epoch": 0.07819370172311349, + "grad_norm": 2.647277753965703, + "learning_rate": 1.987854368092896e-05, + "loss": 0.704, + "step": 987 + }, + { + "epoch": 0.07827292533174886, + "grad_norm": 2.2017066413096207, + "learning_rate": 1.9878144639064145e-05, + "loss": 0.5217, + "step": 988 + }, + { + "epoch": 0.07835214894038424, + "grad_norm": 2.095758426892819, + "learning_rate": 1.9877744946771034e-05, + "loss": 0.4057, + "step": 989 + }, + { + "epoch": 0.0784313725490196, + "grad_norm": 2.2117560897301702, + "learning_rate": 1.987734460407595e-05, + "loss": 0.4695, + "step": 990 + }, + { + "epoch": 0.07851059615765497, + "grad_norm": 2.3987320414172473, + "learning_rate": 1.9876943611005252e-05, + "loss": 0.5071, + "step": 991 + }, + { + "epoch": 0.07858981976629036, + "grad_norm": 2.939845595210767, + "learning_rate": 1.9876541967585337e-05, + "loss": 0.6177, + "step": 992 + }, + { + "epoch": 0.07866904337492572, + "grad_norm": 2.7068922017867645, + "learning_rate": 1.987613967384266e-05, + "loss": 0.5082, + "step": 993 + }, + { + "epoch": 0.0787482669835611, + "grad_norm": 2.503160603491663, + "learning_rate": 1.9875736729803705e-05, + "loss": 0.6302, + "step": 994 + }, + { + "epoch": 0.07882749059219647, + "grad_norm": 2.56265855673719, + "learning_rate": 1.9875333135495e-05, + "loss": 0.6615, + "step": 995 + }, + { + "epoch": 0.07890671420083185, + "grad_norm": 2.2596065545697566, + "learning_rate": 1.9874928890943134e-05, + "loss": 0.6682, + "step": 996 + }, + { + "epoch": 0.07898593780946722, + "grad_norm": 2.448360339483264, + "learning_rate": 1.9874523996174714e-05, + "loss": 0.57, + "step": 997 + }, + { + "epoch": 0.07906516141810259, + "grad_norm": 2.3321655062409046, + "learning_rate": 1.98741184512164e-05, + "loss": 0.667, + "step": 998 + }, + { + "epoch": 0.07914438502673797, + "grad_norm": 2.3746430097998656, + "learning_rate": 1.9873712256094898e-05, + "loss": 0.6208, + "step": 999 + }, + { + "epoch": 0.07922360863537334, + "grad_norm": 2.4382360386725024, + "learning_rate": 1.987330541083695e-05, + "loss": 0.6245, + "step": 1000 + }, + { + "epoch": 0.07930283224400872, + "grad_norm": 2.502665713196127, + "learning_rate": 1.9872897915469353e-05, + "loss": 0.4758, + "step": 1001 + }, + { + "epoch": 0.07938205585264409, + "grad_norm": 2.203990802523697, + "learning_rate": 1.987248977001893e-05, + "loss": 0.5672, + "step": 1002 + }, + { + "epoch": 0.07946127946127945, + "grad_norm": 2.162036536603663, + "learning_rate": 1.987208097451256e-05, + "loss": 0.555, + "step": 1003 + }, + { + "epoch": 0.07954050306991484, + "grad_norm": 2.473045243499081, + "learning_rate": 1.987167152897716e-05, + "loss": 0.5935, + "step": 1004 + }, + { + "epoch": 0.0796197266785502, + "grad_norm": 2.119374803626877, + "learning_rate": 1.987126143343969e-05, + "loss": 0.5198, + "step": 1005 + }, + { + "epoch": 0.07969895028718559, + "grad_norm": 2.659645048479735, + "learning_rate": 1.987085068792715e-05, + "loss": 0.5732, + "step": 1006 + }, + { + "epoch": 0.07977817389582095, + "grad_norm": 2.3463721828983473, + "learning_rate": 1.9870439292466587e-05, + "loss": 0.5767, + "step": 1007 + }, + { + "epoch": 0.07985739750445633, + "grad_norm": 2.901642126328267, + "learning_rate": 1.9870027247085093e-05, + "loss": 0.7066, + "step": 1008 + }, + { + "epoch": 0.0799366211130917, + "grad_norm": 2.180806212088373, + "learning_rate": 1.9869614551809793e-05, + "loss": 0.4841, + "step": 1009 + }, + { + "epoch": 0.08001584472172707, + "grad_norm": 2.660905041614718, + "learning_rate": 1.986920120666787e-05, + "loss": 0.4858, + "step": 1010 + }, + { + "epoch": 0.08009506833036245, + "grad_norm": 2.368586242448567, + "learning_rate": 1.986878721168653e-05, + "loss": 0.5848, + "step": 1011 + }, + { + "epoch": 0.08017429193899782, + "grad_norm": 2.4271483938049956, + "learning_rate": 1.986837256689304e-05, + "loss": 0.5724, + "step": 1012 + }, + { + "epoch": 0.0802535155476332, + "grad_norm": 2.4509693602785463, + "learning_rate": 1.98679572723147e-05, + "loss": 0.5219, + "step": 1013 + }, + { + "epoch": 0.08033273915626857, + "grad_norm": 2.443496597980648, + "learning_rate": 1.9867541327978853e-05, + "loss": 0.6714, + "step": 1014 + }, + { + "epoch": 0.08041196276490394, + "grad_norm": 2.367671588770985, + "learning_rate": 1.986712473391289e-05, + "loss": 0.5335, + "step": 1015 + }, + { + "epoch": 0.08049118637353932, + "grad_norm": 2.565088295429447, + "learning_rate": 1.986670749014424e-05, + "loss": 0.59, + "step": 1016 + }, + { + "epoch": 0.08057040998217468, + "grad_norm": 2.0604190565475773, + "learning_rate": 1.9866289596700383e-05, + "loss": 0.4956, + "step": 1017 + }, + { + "epoch": 0.08064963359081007, + "grad_norm": 2.1988560970066233, + "learning_rate": 1.9865871053608823e-05, + "loss": 0.5635, + "step": 1018 + }, + { + "epoch": 0.08072885719944543, + "grad_norm": 2.558336155054883, + "learning_rate": 1.9865451860897126e-05, + "loss": 0.5646, + "step": 1019 + }, + { + "epoch": 0.08080808080808081, + "grad_norm": 2.454308626909449, + "learning_rate": 1.98650320185929e-05, + "loss": 0.547, + "step": 1020 + }, + { + "epoch": 0.08088730441671618, + "grad_norm": 2.268676352822214, + "learning_rate": 1.986461152672378e-05, + "loss": 0.6198, + "step": 1021 + }, + { + "epoch": 0.08096652802535155, + "grad_norm": 2.3917768390278527, + "learning_rate": 1.986419038531745e-05, + "loss": 0.5774, + "step": 1022 + }, + { + "epoch": 0.08104575163398693, + "grad_norm": 2.0974142950247554, + "learning_rate": 1.9863768594401654e-05, + "loss": 0.5313, + "step": 1023 + }, + { + "epoch": 0.0811249752426223, + "grad_norm": 2.3658289548688693, + "learning_rate": 1.9863346154004155e-05, + "loss": 0.5298, + "step": 1024 + }, + { + "epoch": 0.08120419885125768, + "grad_norm": 2.2853381004335653, + "learning_rate": 1.986292306415277e-05, + "loss": 0.5569, + "step": 1025 + }, + { + "epoch": 0.08128342245989305, + "grad_norm": 2.2461119361208315, + "learning_rate": 1.9862499324875362e-05, + "loss": 0.4962, + "step": 1026 + }, + { + "epoch": 0.08136264606852842, + "grad_norm": 2.064477503023414, + "learning_rate": 1.9862074936199827e-05, + "loss": 0.4665, + "step": 1027 + }, + { + "epoch": 0.0814418696771638, + "grad_norm": 2.6328229695413503, + "learning_rate": 1.9861649898154107e-05, + "loss": 0.6401, + "step": 1028 + }, + { + "epoch": 0.08152109328579916, + "grad_norm": 2.1821967129970434, + "learning_rate": 1.98612242107662e-05, + "loss": 0.5117, + "step": 1029 + }, + { + "epoch": 0.08160031689443455, + "grad_norm": 1.9693769603604045, + "learning_rate": 1.9860797874064123e-05, + "loss": 0.406, + "step": 1030 + }, + { + "epoch": 0.08167954050306991, + "grad_norm": 2.411832476912573, + "learning_rate": 1.9860370888075954e-05, + "loss": 0.5463, + "step": 1031 + }, + { + "epoch": 0.08175876411170528, + "grad_norm": 2.291319632797604, + "learning_rate": 1.9859943252829804e-05, + "loss": 0.5611, + "step": 1032 + }, + { + "epoch": 0.08183798772034066, + "grad_norm": 2.7509008792279346, + "learning_rate": 1.9859514968353836e-05, + "loss": 0.6889, + "step": 1033 + }, + { + "epoch": 0.08191721132897603, + "grad_norm": 2.117279359438013, + "learning_rate": 1.985908603467625e-05, + "loss": 0.5329, + "step": 1034 + }, + { + "epoch": 0.08199643493761141, + "grad_norm": 2.3927478088787963, + "learning_rate": 1.985865645182529e-05, + "loss": 0.4715, + "step": 1035 + }, + { + "epoch": 0.08207565854624678, + "grad_norm": 2.4279236573667364, + "learning_rate": 1.9858226219829234e-05, + "loss": 0.5258, + "step": 1036 + }, + { + "epoch": 0.08215488215488216, + "grad_norm": 2.2393890880065763, + "learning_rate": 1.985779533871642e-05, + "loss": 0.547, + "step": 1037 + }, + { + "epoch": 0.08223410576351753, + "grad_norm": 2.0590795623077476, + "learning_rate": 1.985736380851521e-05, + "loss": 0.5285, + "step": 1038 + }, + { + "epoch": 0.0823133293721529, + "grad_norm": 2.235554088389397, + "learning_rate": 1.9856931629254032e-05, + "loss": 0.4274, + "step": 1039 + }, + { + "epoch": 0.08239255298078828, + "grad_norm": 2.206928814171015, + "learning_rate": 1.9856498800961328e-05, + "loss": 0.5413, + "step": 1040 + }, + { + "epoch": 0.08247177658942365, + "grad_norm": 2.7100518442397945, + "learning_rate": 1.9856065323665606e-05, + "loss": 0.5916, + "step": 1041 + }, + { + "epoch": 0.08255100019805903, + "grad_norm": 2.291630426847014, + "learning_rate": 1.9855631197395406e-05, + "loss": 0.6018, + "step": 1042 + }, + { + "epoch": 0.0826302238066944, + "grad_norm": 2.3776700075246553, + "learning_rate": 1.985519642217932e-05, + "loss": 0.5523, + "step": 1043 + }, + { + "epoch": 0.08270944741532976, + "grad_norm": 2.565118367367117, + "learning_rate": 1.9854760998045964e-05, + "loss": 0.5729, + "step": 1044 + }, + { + "epoch": 0.08278867102396514, + "grad_norm": 2.7186306604147688, + "learning_rate": 1.9854324925024017e-05, + "loss": 0.5959, + "step": 1045 + }, + { + "epoch": 0.08286789463260051, + "grad_norm": 2.280808570677283, + "learning_rate": 1.9853888203142184e-05, + "loss": 0.5527, + "step": 1046 + }, + { + "epoch": 0.08294711824123589, + "grad_norm": 2.19729709498443, + "learning_rate": 1.9853450832429234e-05, + "loss": 0.4914, + "step": 1047 + }, + { + "epoch": 0.08302634184987126, + "grad_norm": 2.37706804529759, + "learning_rate": 1.9853012812913956e-05, + "loss": 0.4247, + "step": 1048 + }, + { + "epoch": 0.08310556545850664, + "grad_norm": 2.4265574672280357, + "learning_rate": 1.9852574144625193e-05, + "loss": 0.5721, + "step": 1049 + }, + { + "epoch": 0.08318478906714201, + "grad_norm": 2.626927345585469, + "learning_rate": 1.985213482759183e-05, + "loss": 0.5253, + "step": 1050 + }, + { + "epoch": 0.08326401267577738, + "grad_norm": 2.866786707581821, + "learning_rate": 1.9851694861842795e-05, + "loss": 0.6334, + "step": 1051 + }, + { + "epoch": 0.08334323628441276, + "grad_norm": 2.371213699585761, + "learning_rate": 1.9851254247407053e-05, + "loss": 0.4334, + "step": 1052 + }, + { + "epoch": 0.08342245989304813, + "grad_norm": 2.2498020645676218, + "learning_rate": 1.9850812984313626e-05, + "loss": 0.585, + "step": 1053 + }, + { + "epoch": 0.08350168350168351, + "grad_norm": 2.1204360173482395, + "learning_rate": 1.985037107259156e-05, + "loss": 0.5654, + "step": 1054 + }, + { + "epoch": 0.08358090711031887, + "grad_norm": 2.08748411123756, + "learning_rate": 1.984992851226996e-05, + "loss": 0.5612, + "step": 1055 + }, + { + "epoch": 0.08366013071895424, + "grad_norm": 2.455402448279221, + "learning_rate": 1.9849485303377955e-05, + "loss": 0.5161, + "step": 1056 + }, + { + "epoch": 0.08373935432758962, + "grad_norm": 2.5213630334970847, + "learning_rate": 1.984904144594474e-05, + "loss": 0.6956, + "step": 1057 + }, + { + "epoch": 0.08381857793622499, + "grad_norm": 2.4166977001787777, + "learning_rate": 1.9848596939999534e-05, + "loss": 0.658, + "step": 1058 + }, + { + "epoch": 0.08389780154486037, + "grad_norm": 2.1928000964600307, + "learning_rate": 1.984815178557161e-05, + "loss": 0.4511, + "step": 1059 + }, + { + "epoch": 0.08397702515349574, + "grad_norm": 2.4245482282394377, + "learning_rate": 1.9847705982690275e-05, + "loss": 0.6044, + "step": 1060 + }, + { + "epoch": 0.08405624876213111, + "grad_norm": 2.2615363598612728, + "learning_rate": 1.984725953138489e-05, + "loss": 0.5518, + "step": 1061 + }, + { + "epoch": 0.08413547237076649, + "grad_norm": 2.115417015206091, + "learning_rate": 1.9846812431684843e-05, + "loss": 0.4682, + "step": 1062 + }, + { + "epoch": 0.08421469597940186, + "grad_norm": 2.3264873794250747, + "learning_rate": 1.9846364683619575e-05, + "loss": 0.6484, + "step": 1063 + }, + { + "epoch": 0.08429391958803724, + "grad_norm": 2.25754548439187, + "learning_rate": 1.9845916287218575e-05, + "loss": 0.5645, + "step": 1064 + }, + { + "epoch": 0.0843731431966726, + "grad_norm": 2.6747068352563312, + "learning_rate": 1.9845467242511362e-05, + "loss": 0.5747, + "step": 1065 + }, + { + "epoch": 0.08445236680530799, + "grad_norm": 2.837796690514525, + "learning_rate": 1.9845017549527502e-05, + "loss": 0.55, + "step": 1066 + }, + { + "epoch": 0.08453159041394336, + "grad_norm": 2.3349890027042304, + "learning_rate": 1.984456720829661e-05, + "loss": 0.5326, + "step": 1067 + }, + { + "epoch": 0.08461081402257872, + "grad_norm": 2.6760624736859575, + "learning_rate": 1.9844116218848335e-05, + "loss": 0.5845, + "step": 1068 + }, + { + "epoch": 0.0846900376312141, + "grad_norm": 2.3164545198774347, + "learning_rate": 1.9843664581212374e-05, + "loss": 0.4944, + "step": 1069 + }, + { + "epoch": 0.08476926123984947, + "grad_norm": 2.4392096315760377, + "learning_rate": 1.9843212295418464e-05, + "loss": 0.549, + "step": 1070 + }, + { + "epoch": 0.08484848484848485, + "grad_norm": 2.807065304480291, + "learning_rate": 1.984275936149639e-05, + "loss": 0.5743, + "step": 1071 + }, + { + "epoch": 0.08492770845712022, + "grad_norm": 2.2653560895895457, + "learning_rate": 1.984230577947597e-05, + "loss": 0.5345, + "step": 1072 + }, + { + "epoch": 0.08500693206575559, + "grad_norm": 2.392687671543606, + "learning_rate": 1.9841851549387074e-05, + "loss": 0.5886, + "step": 1073 + }, + { + "epoch": 0.08508615567439097, + "grad_norm": 2.0049570218640143, + "learning_rate": 1.9841396671259606e-05, + "loss": 0.5386, + "step": 1074 + }, + { + "epoch": 0.08516537928302634, + "grad_norm": 2.58416099512533, + "learning_rate": 1.9840941145123524e-05, + "loss": 0.5644, + "step": 1075 + }, + { + "epoch": 0.08524460289166172, + "grad_norm": 2.2363938724823105, + "learning_rate": 1.984048497100882e-05, + "loss": 0.5658, + "step": 1076 + }, + { + "epoch": 0.08532382650029709, + "grad_norm": 2.1951930411728773, + "learning_rate": 1.9840028148945526e-05, + "loss": 0.6041, + "step": 1077 + }, + { + "epoch": 0.08540305010893247, + "grad_norm": 2.196124730052582, + "learning_rate": 1.983957067896373e-05, + "loss": 0.3754, + "step": 1078 + }, + { + "epoch": 0.08548227371756784, + "grad_norm": 2.1384632951667246, + "learning_rate": 1.9839112561093548e-05, + "loss": 0.4146, + "step": 1079 + }, + { + "epoch": 0.0855614973262032, + "grad_norm": 2.5943666634004696, + "learning_rate": 1.983865379536515e-05, + "loss": 0.5801, + "step": 1080 + }, + { + "epoch": 0.08564072093483859, + "grad_norm": 2.4808275383811194, + "learning_rate": 1.9838194381808737e-05, + "loss": 0.5941, + "step": 1081 + }, + { + "epoch": 0.08571994454347395, + "grad_norm": 2.1275604317129106, + "learning_rate": 1.983773432045456e-05, + "loss": 0.4698, + "step": 1082 + }, + { + "epoch": 0.08579916815210933, + "grad_norm": 2.360931024696737, + "learning_rate": 1.9837273611332918e-05, + "loss": 0.6583, + "step": 1083 + }, + { + "epoch": 0.0858783917607447, + "grad_norm": 2.1226193362488455, + "learning_rate": 1.983681225447414e-05, + "loss": 0.5295, + "step": 1084 + }, + { + "epoch": 0.08595761536938007, + "grad_norm": 2.484145678782945, + "learning_rate": 1.9836350249908606e-05, + "loss": 0.7594, + "step": 1085 + }, + { + "epoch": 0.08603683897801545, + "grad_norm": 2.035756418623099, + "learning_rate": 1.983588759766674e-05, + "loss": 0.3336, + "step": 1086 + }, + { + "epoch": 0.08611606258665082, + "grad_norm": 2.2760914643266874, + "learning_rate": 1.9835424297779002e-05, + "loss": 0.503, + "step": 1087 + }, + { + "epoch": 0.0861952861952862, + "grad_norm": 2.1878326301137827, + "learning_rate": 1.98349603502759e-05, + "loss": 0.4444, + "step": 1088 + }, + { + "epoch": 0.08627450980392157, + "grad_norm": 2.0592784746412325, + "learning_rate": 1.983449575518798e-05, + "loss": 0.4367, + "step": 1089 + }, + { + "epoch": 0.08635373341255695, + "grad_norm": 2.2643735590126486, + "learning_rate": 1.983403051254584e-05, + "loss": 0.4014, + "step": 1090 + }, + { + "epoch": 0.08643295702119232, + "grad_norm": 2.701903165426508, + "learning_rate": 1.9833564622380105e-05, + "loss": 0.5097, + "step": 1091 + }, + { + "epoch": 0.08651218062982768, + "grad_norm": 2.5365783980407643, + "learning_rate": 1.9833098084721455e-05, + "loss": 0.565, + "step": 1092 + }, + { + "epoch": 0.08659140423846307, + "grad_norm": 2.312263058319201, + "learning_rate": 1.9832630899600607e-05, + "loss": 0.5472, + "step": 1093 + }, + { + "epoch": 0.08667062784709843, + "grad_norm": 2.6998444078873174, + "learning_rate": 1.9832163067048335e-05, + "loss": 0.5052, + "step": 1094 + }, + { + "epoch": 0.08674985145573381, + "grad_norm": 2.718849308212614, + "learning_rate": 1.9831694587095428e-05, + "loss": 0.495, + "step": 1095 + }, + { + "epoch": 0.08682907506436918, + "grad_norm": 2.2805710437838465, + "learning_rate": 1.983122545977274e-05, + "loss": 0.559, + "step": 1096 + }, + { + "epoch": 0.08690829867300455, + "grad_norm": 2.194525402699448, + "learning_rate": 1.983075568511116e-05, + "loss": 0.5577, + "step": 1097 + }, + { + "epoch": 0.08698752228163993, + "grad_norm": 2.3875645035218125, + "learning_rate": 1.983028526314162e-05, + "loss": 0.6205, + "step": 1098 + }, + { + "epoch": 0.0870667458902753, + "grad_norm": 2.0795549229475863, + "learning_rate": 1.98298141938951e-05, + "loss": 0.5904, + "step": 1099 + }, + { + "epoch": 0.08714596949891068, + "grad_norm": 1.9693597637996387, + "learning_rate": 1.982934247740261e-05, + "loss": 0.4593, + "step": 1100 + }, + { + "epoch": 0.08722519310754605, + "grad_norm": 2.3509050828577576, + "learning_rate": 1.9828870113695217e-05, + "loss": 0.6522, + "step": 1101 + }, + { + "epoch": 0.08730441671618142, + "grad_norm": 2.503366566979004, + "learning_rate": 1.9828397102804016e-05, + "loss": 0.4066, + "step": 1102 + }, + { + "epoch": 0.0873836403248168, + "grad_norm": 2.402161114785244, + "learning_rate": 1.982792344476016e-05, + "loss": 0.519, + "step": 1103 + }, + { + "epoch": 0.08746286393345216, + "grad_norm": 2.5738632917015827, + "learning_rate": 1.982744913959483e-05, + "loss": 0.6475, + "step": 1104 + }, + { + "epoch": 0.08754208754208755, + "grad_norm": 2.0720674857952384, + "learning_rate": 1.9826974187339267e-05, + "loss": 0.5736, + "step": 1105 + }, + { + "epoch": 0.08762131115072291, + "grad_norm": 2.0509287976489996, + "learning_rate": 1.9826498588024738e-05, + "loss": 0.4438, + "step": 1106 + }, + { + "epoch": 0.0877005347593583, + "grad_norm": 2.358130101245386, + "learning_rate": 1.982602234168255e-05, + "loss": 0.621, + "step": 1107 + }, + { + "epoch": 0.08777975836799366, + "grad_norm": 2.2116427993237946, + "learning_rate": 1.9825545448344078e-05, + "loss": 0.5359, + "step": 1108 + }, + { + "epoch": 0.08785898197662903, + "grad_norm": 2.7682717064872837, + "learning_rate": 1.9825067908040716e-05, + "loss": 0.4988, + "step": 1109 + }, + { + "epoch": 0.08793820558526441, + "grad_norm": 2.4150945446945795, + "learning_rate": 1.9824589720803906e-05, + "loss": 0.4937, + "step": 1110 + }, + { + "epoch": 0.08801742919389978, + "grad_norm": 2.588873075138313, + "learning_rate": 1.9824110886665137e-05, + "loss": 0.5405, + "step": 1111 + }, + { + "epoch": 0.08809665280253516, + "grad_norm": 2.5414871777571184, + "learning_rate": 1.9823631405655933e-05, + "loss": 0.5157, + "step": 1112 + }, + { + "epoch": 0.08817587641117053, + "grad_norm": 2.4131642122876213, + "learning_rate": 1.9823151277807873e-05, + "loss": 0.5428, + "step": 1113 + }, + { + "epoch": 0.0882551000198059, + "grad_norm": 2.1832959392334, + "learning_rate": 1.9822670503152567e-05, + "loss": 0.4526, + "step": 1114 + }, + { + "epoch": 0.08833432362844128, + "grad_norm": 2.1678480146487313, + "learning_rate": 1.982218908172167e-05, + "loss": 0.3613, + "step": 1115 + }, + { + "epoch": 0.08841354723707665, + "grad_norm": 2.411533307772899, + "learning_rate": 1.9821707013546885e-05, + "loss": 0.677, + "step": 1116 + }, + { + "epoch": 0.08849277084571203, + "grad_norm": 2.0337302013065544, + "learning_rate": 1.9821224298659953e-05, + "loss": 0.4903, + "step": 1117 + }, + { + "epoch": 0.0885719944543474, + "grad_norm": 2.9432963459147716, + "learning_rate": 1.9820740937092656e-05, + "loss": 0.6193, + "step": 1118 + }, + { + "epoch": 0.08865121806298278, + "grad_norm": 2.6375121992616, + "learning_rate": 1.982025692887682e-05, + "loss": 0.6142, + "step": 1119 + }, + { + "epoch": 0.08873044167161814, + "grad_norm": 2.2098968055286083, + "learning_rate": 1.9819772274044323e-05, + "loss": 0.4294, + "step": 1120 + }, + { + "epoch": 0.08880966528025351, + "grad_norm": 1.9195425193672744, + "learning_rate": 1.9819286972627066e-05, + "loss": 0.4044, + "step": 1121 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 2.4016411822158292, + "learning_rate": 1.9818801024657014e-05, + "loss": 0.5483, + "step": 1122 + }, + { + "epoch": 0.08896811249752426, + "grad_norm": 2.4155108762995323, + "learning_rate": 1.9818314430166158e-05, + "loss": 0.4949, + "step": 1123 + }, + { + "epoch": 0.08904733610615964, + "grad_norm": 2.78135300689024, + "learning_rate": 1.981782718918654e-05, + "loss": 0.5327, + "step": 1124 + }, + { + "epoch": 0.08912655971479501, + "grad_norm": 2.346909948332823, + "learning_rate": 1.981733930175024e-05, + "loss": 0.4374, + "step": 1125 + }, + { + "epoch": 0.08920578332343038, + "grad_norm": 2.1665616960343685, + "learning_rate": 1.9816850767889387e-05, + "loss": 0.4909, + "step": 1126 + }, + { + "epoch": 0.08928500693206576, + "grad_norm": 2.496609749684578, + "learning_rate": 1.9816361587636143e-05, + "loss": 0.5709, + "step": 1127 + }, + { + "epoch": 0.08936423054070113, + "grad_norm": 2.349931343807538, + "learning_rate": 1.9815871761022727e-05, + "loss": 0.6173, + "step": 1128 + }, + { + "epoch": 0.08944345414933651, + "grad_norm": 2.269799196215516, + "learning_rate": 1.9815381288081382e-05, + "loss": 0.4906, + "step": 1129 + }, + { + "epoch": 0.08952267775797187, + "grad_norm": 2.673440163310611, + "learning_rate": 1.9814890168844412e-05, + "loss": 0.5888, + "step": 1130 + }, + { + "epoch": 0.08960190136660724, + "grad_norm": 2.118942300398741, + "learning_rate": 1.981439840334415e-05, + "loss": 0.528, + "step": 1131 + }, + { + "epoch": 0.08968112497524262, + "grad_norm": 2.271213627250479, + "learning_rate": 1.9813905991612974e-05, + "loss": 0.5625, + "step": 1132 + }, + { + "epoch": 0.08976034858387799, + "grad_norm": 2.580820532147506, + "learning_rate": 1.9813412933683312e-05, + "loss": 0.4875, + "step": 1133 + }, + { + "epoch": 0.08983957219251337, + "grad_norm": 2.4625476284905776, + "learning_rate": 1.9812919229587626e-05, + "loss": 0.4058, + "step": 1134 + }, + { + "epoch": 0.08991879580114874, + "grad_norm": 2.191370044534374, + "learning_rate": 1.9812424879358424e-05, + "loss": 0.5066, + "step": 1135 + }, + { + "epoch": 0.08999801940978412, + "grad_norm": 2.351674586942393, + "learning_rate": 1.981192988302826e-05, + "loss": 0.6397, + "step": 1136 + }, + { + "epoch": 0.09007724301841949, + "grad_norm": 2.2191575410966253, + "learning_rate": 1.981143424062973e-05, + "loss": 0.4388, + "step": 1137 + }, + { + "epoch": 0.09015646662705486, + "grad_norm": 2.377638942591902, + "learning_rate": 1.981093795219546e-05, + "loss": 0.5633, + "step": 1138 + }, + { + "epoch": 0.09023569023569024, + "grad_norm": 2.456143566746411, + "learning_rate": 1.9810441017758132e-05, + "loss": 0.5233, + "step": 1139 + }, + { + "epoch": 0.0903149138443256, + "grad_norm": 2.1400603190019813, + "learning_rate": 1.980994343735047e-05, + "loss": 0.4868, + "step": 1140 + }, + { + "epoch": 0.09039413745296099, + "grad_norm": 2.4365862201405344, + "learning_rate": 1.9809445211005235e-05, + "loss": 0.527, + "step": 1141 + }, + { + "epoch": 0.09047336106159636, + "grad_norm": 2.3171574143156324, + "learning_rate": 1.980894633875523e-05, + "loss": 0.5416, + "step": 1142 + }, + { + "epoch": 0.09055258467023172, + "grad_norm": 2.2672458176524035, + "learning_rate": 1.980844682063331e-05, + "loss": 0.5176, + "step": 1143 + }, + { + "epoch": 0.0906318082788671, + "grad_norm": 2.336202433378965, + "learning_rate": 1.980794665667236e-05, + "loss": 0.57, + "step": 1144 + }, + { + "epoch": 0.09071103188750247, + "grad_norm": 2.213737381332928, + "learning_rate": 1.9807445846905316e-05, + "loss": 0.5883, + "step": 1145 + }, + { + "epoch": 0.09079025549613785, + "grad_norm": 2.4832808174617442, + "learning_rate": 1.980694439136515e-05, + "loss": 0.6151, + "step": 1146 + }, + { + "epoch": 0.09086947910477322, + "grad_norm": 2.035309695248436, + "learning_rate": 1.980644229008489e-05, + "loss": 0.5427, + "step": 1147 + }, + { + "epoch": 0.0909487027134086, + "grad_norm": 2.3437276485275036, + "learning_rate": 1.9805939543097586e-05, + "loss": 0.5379, + "step": 1148 + }, + { + "epoch": 0.09102792632204397, + "grad_norm": 2.1635536174164813, + "learning_rate": 1.9805436150436352e-05, + "loss": 0.4341, + "step": 1149 + }, + { + "epoch": 0.09110714993067934, + "grad_norm": 2.1598382643032106, + "learning_rate": 1.9804932112134323e-05, + "loss": 0.4478, + "step": 1150 + }, + { + "epoch": 0.09118637353931472, + "grad_norm": 2.621712715581236, + "learning_rate": 1.9804427428224696e-05, + "loss": 0.5489, + "step": 1151 + }, + { + "epoch": 0.09126559714795009, + "grad_norm": 2.741732777540899, + "learning_rate": 1.9803922098740696e-05, + "loss": 0.5285, + "step": 1152 + }, + { + "epoch": 0.09134482075658547, + "grad_norm": 2.2823604314766195, + "learning_rate": 1.98034161237156e-05, + "loss": 0.5482, + "step": 1153 + }, + { + "epoch": 0.09142404436522084, + "grad_norm": 2.2583037186943073, + "learning_rate": 1.9802909503182722e-05, + "loss": 0.5335, + "step": 1154 + }, + { + "epoch": 0.0915032679738562, + "grad_norm": 2.336794350316473, + "learning_rate": 1.9802402237175426e-05, + "loss": 0.4927, + "step": 1155 + }, + { + "epoch": 0.09158249158249158, + "grad_norm": 2.109334400919138, + "learning_rate": 1.9801894325727104e-05, + "loss": 0.426, + "step": 1156 + }, + { + "epoch": 0.09166171519112695, + "grad_norm": 2.6767397455004978, + "learning_rate": 1.980138576887121e-05, + "loss": 0.7275, + "step": 1157 + }, + { + "epoch": 0.09174093879976233, + "grad_norm": 2.1200292119942468, + "learning_rate": 1.980087656664122e-05, + "loss": 0.4499, + "step": 1158 + }, + { + "epoch": 0.0918201624083977, + "grad_norm": 2.4409068807120162, + "learning_rate": 1.9800366719070668e-05, + "loss": 0.603, + "step": 1159 + }, + { + "epoch": 0.09189938601703307, + "grad_norm": 2.502240881484407, + "learning_rate": 1.9799856226193125e-05, + "loss": 0.5064, + "step": 1160 + }, + { + "epoch": 0.09197860962566845, + "grad_norm": 2.5107586371318655, + "learning_rate": 1.97993450880422e-05, + "loss": 0.4761, + "step": 1161 + }, + { + "epoch": 0.09205783323430382, + "grad_norm": 2.52045368777118, + "learning_rate": 1.9798833304651555e-05, + "loss": 0.5551, + "step": 1162 + }, + { + "epoch": 0.0921370568429392, + "grad_norm": 2.100859266252998, + "learning_rate": 1.9798320876054882e-05, + "loss": 0.491, + "step": 1163 + }, + { + "epoch": 0.09221628045157457, + "grad_norm": 2.629915640264029, + "learning_rate": 1.9797807802285933e-05, + "loss": 0.5826, + "step": 1164 + }, + { + "epoch": 0.09229550406020995, + "grad_norm": 2.410017640664231, + "learning_rate": 1.979729408337848e-05, + "loss": 0.5008, + "step": 1165 + }, + { + "epoch": 0.09237472766884532, + "grad_norm": 2.2935743009461493, + "learning_rate": 1.9796779719366355e-05, + "loss": 0.5593, + "step": 1166 + }, + { + "epoch": 0.09245395127748068, + "grad_norm": 2.2391645715067505, + "learning_rate": 1.9796264710283425e-05, + "loss": 0.6607, + "step": 1167 + }, + { + "epoch": 0.09253317488611607, + "grad_norm": 2.2076146526130525, + "learning_rate": 1.9795749056163595e-05, + "loss": 0.5248, + "step": 1168 + }, + { + "epoch": 0.09261239849475143, + "grad_norm": 1.8103433460053742, + "learning_rate": 1.9795232757040827e-05, + "loss": 0.3633, + "step": 1169 + }, + { + "epoch": 0.09269162210338681, + "grad_norm": 2.2385250682269557, + "learning_rate": 1.9794715812949117e-05, + "loss": 0.4679, + "step": 1170 + }, + { + "epoch": 0.09277084571202218, + "grad_norm": 2.230733998375238, + "learning_rate": 1.9794198223922496e-05, + "loss": 0.5891, + "step": 1171 + }, + { + "epoch": 0.09285006932065755, + "grad_norm": 2.309282906613327, + "learning_rate": 1.979367998999505e-05, + "loss": 0.4029, + "step": 1172 + }, + { + "epoch": 0.09292929292929293, + "grad_norm": 2.4423149386148384, + "learning_rate": 1.97931611112009e-05, + "loss": 0.4837, + "step": 1173 + }, + { + "epoch": 0.0930085165379283, + "grad_norm": 2.1330231983301196, + "learning_rate": 1.9792641587574212e-05, + "loss": 0.4331, + "step": 1174 + }, + { + "epoch": 0.09308774014656368, + "grad_norm": 2.2764042540338076, + "learning_rate": 1.9792121419149196e-05, + "loss": 0.4995, + "step": 1175 + }, + { + "epoch": 0.09316696375519905, + "grad_norm": 2.304755667573725, + "learning_rate": 1.97916006059601e-05, + "loss": 0.4912, + "step": 1176 + }, + { + "epoch": 0.09324618736383443, + "grad_norm": 2.481570323484625, + "learning_rate": 1.979107914804122e-05, + "loss": 0.549, + "step": 1177 + }, + { + "epoch": 0.0933254109724698, + "grad_norm": 2.8406646903648682, + "learning_rate": 1.979055704542689e-05, + "loss": 0.5797, + "step": 1178 + }, + { + "epoch": 0.09340463458110516, + "grad_norm": 2.3209900863875386, + "learning_rate": 1.9790034298151486e-05, + "loss": 0.4511, + "step": 1179 + }, + { + "epoch": 0.09348385818974055, + "grad_norm": 2.386819623697311, + "learning_rate": 1.9789510906249432e-05, + "loss": 0.4525, + "step": 1180 + }, + { + "epoch": 0.09356308179837591, + "grad_norm": 2.284857855201307, + "learning_rate": 1.9788986869755187e-05, + "loss": 0.529, + "step": 1181 + }, + { + "epoch": 0.0936423054070113, + "grad_norm": 2.574328508526162, + "learning_rate": 1.978846218870326e-05, + "loss": 0.8237, + "step": 1182 + }, + { + "epoch": 0.09372152901564666, + "grad_norm": 2.0393555958534635, + "learning_rate": 1.9787936863128195e-05, + "loss": 0.4356, + "step": 1183 + }, + { + "epoch": 0.09380075262428203, + "grad_norm": 2.444795081415653, + "learning_rate": 1.9787410893064584e-05, + "loss": 0.5858, + "step": 1184 + }, + { + "epoch": 0.09387997623291741, + "grad_norm": 2.6478975907834936, + "learning_rate": 1.978688427854706e-05, + "loss": 0.4975, + "step": 1185 + }, + { + "epoch": 0.09395919984155278, + "grad_norm": 2.2927301216239138, + "learning_rate": 1.97863570196103e-05, + "loss": 0.5294, + "step": 1186 + }, + { + "epoch": 0.09403842345018816, + "grad_norm": 2.2859707838860808, + "learning_rate": 1.9785829116289017e-05, + "loss": 0.5564, + "step": 1187 + }, + { + "epoch": 0.09411764705882353, + "grad_norm": 2.242425012732724, + "learning_rate": 1.9785300568617973e-05, + "loss": 0.5383, + "step": 1188 + }, + { + "epoch": 0.09419687066745891, + "grad_norm": 2.2826212487488258, + "learning_rate": 1.978477137663197e-05, + "loss": 0.5753, + "step": 1189 + }, + { + "epoch": 0.09427609427609428, + "grad_norm": 2.0926510648331527, + "learning_rate": 1.9784241540365856e-05, + "loss": 0.609, + "step": 1190 + }, + { + "epoch": 0.09435531788472964, + "grad_norm": 1.8044647273101764, + "learning_rate": 1.9783711059854514e-05, + "loss": 0.5156, + "step": 1191 + }, + { + "epoch": 0.09443454149336503, + "grad_norm": 2.3067515982080926, + "learning_rate": 1.9783179935132874e-05, + "loss": 0.5784, + "step": 1192 + }, + { + "epoch": 0.0945137651020004, + "grad_norm": 2.3706657264655155, + "learning_rate": 1.978264816623591e-05, + "loss": 0.4633, + "step": 1193 + }, + { + "epoch": 0.09459298871063578, + "grad_norm": 2.175872253953192, + "learning_rate": 1.9782115753198633e-05, + "loss": 0.5168, + "step": 1194 + }, + { + "epoch": 0.09467221231927114, + "grad_norm": 2.109943903248783, + "learning_rate": 1.9781582696056105e-05, + "loss": 0.5042, + "step": 1195 + }, + { + "epoch": 0.09475143592790651, + "grad_norm": 2.267825030136225, + "learning_rate": 1.9781048994843423e-05, + "loss": 0.531, + "step": 1196 + }, + { + "epoch": 0.09483065953654189, + "grad_norm": 1.9213490245286327, + "learning_rate": 1.9780514649595727e-05, + "loss": 0.3961, + "step": 1197 + }, + { + "epoch": 0.09490988314517726, + "grad_norm": 2.729647736955396, + "learning_rate": 1.97799796603482e-05, + "loss": 0.5893, + "step": 1198 + }, + { + "epoch": 0.09498910675381264, + "grad_norm": 2.0323492123321825, + "learning_rate": 1.9779444027136075e-05, + "loss": 0.5612, + "step": 1199 + }, + { + "epoch": 0.09506833036244801, + "grad_norm": 2.23912741901332, + "learning_rate": 1.977890774999461e-05, + "loss": 0.4758, + "step": 1200 + }, + { + "epoch": 0.09514755397108338, + "grad_norm": 2.3558358687105567, + "learning_rate": 1.977837082895913e-05, + "loss": 0.5562, + "step": 1201 + }, + { + "epoch": 0.09522677757971876, + "grad_norm": 2.2231082077832207, + "learning_rate": 1.9777833264064977e-05, + "loss": 0.449, + "step": 1202 + }, + { + "epoch": 0.09530600118835413, + "grad_norm": 2.3733770072652747, + "learning_rate": 1.9777295055347553e-05, + "loss": 0.4318, + "step": 1203 + }, + { + "epoch": 0.0953852247969895, + "grad_norm": 2.302576114795998, + "learning_rate": 1.9776756202842297e-05, + "loss": 0.4529, + "step": 1204 + }, + { + "epoch": 0.09546444840562487, + "grad_norm": 2.786686457430988, + "learning_rate": 1.9776216706584682e-05, + "loss": 0.6389, + "step": 1205 + }, + { + "epoch": 0.09554367201426026, + "grad_norm": 2.1640912723222296, + "learning_rate": 1.977567656661024e-05, + "loss": 0.492, + "step": 1206 + }, + { + "epoch": 0.09562289562289562, + "grad_norm": 2.0051288858369505, + "learning_rate": 1.9775135782954534e-05, + "loss": 0.4634, + "step": 1207 + }, + { + "epoch": 0.09570211923153099, + "grad_norm": 2.52693859178143, + "learning_rate": 1.9774594355653175e-05, + "loss": 0.5035, + "step": 1208 + }, + { + "epoch": 0.09578134284016637, + "grad_norm": 2.0072780933553402, + "learning_rate": 1.9774052284741804e-05, + "loss": 0.4775, + "step": 1209 + }, + { + "epoch": 0.09586056644880174, + "grad_norm": 2.5248290352741196, + "learning_rate": 1.9773509570256124e-05, + "loss": 0.6186, + "step": 1210 + }, + { + "epoch": 0.09593979005743712, + "grad_norm": 2.7138568006642743, + "learning_rate": 1.9772966212231863e-05, + "loss": 0.5514, + "step": 1211 + }, + { + "epoch": 0.09601901366607249, + "grad_norm": 2.4705366215900186, + "learning_rate": 1.9772422210704803e-05, + "loss": 0.5646, + "step": 1212 + }, + { + "epoch": 0.09609823727470786, + "grad_norm": 1.8637671901095176, + "learning_rate": 1.977187756571076e-05, + "loss": 0.4846, + "step": 1213 + }, + { + "epoch": 0.09617746088334324, + "grad_norm": 2.393257399096758, + "learning_rate": 1.9771332277285603e-05, + "loss": 0.5056, + "step": 1214 + }, + { + "epoch": 0.0962566844919786, + "grad_norm": 2.4048595101962946, + "learning_rate": 1.977078634546523e-05, + "loss": 0.5664, + "step": 1215 + }, + { + "epoch": 0.09633590810061399, + "grad_norm": 2.3086846618645778, + "learning_rate": 1.977023977028559e-05, + "loss": 0.6001, + "step": 1216 + }, + { + "epoch": 0.09641513170924935, + "grad_norm": 2.375784985667264, + "learning_rate": 1.9769692551782672e-05, + "loss": 0.6104, + "step": 1217 + }, + { + "epoch": 0.09649435531788474, + "grad_norm": 1.9729956772425852, + "learning_rate": 1.976914468999251e-05, + "loss": 0.5019, + "step": 1218 + }, + { + "epoch": 0.0965735789265201, + "grad_norm": 2.777477555441641, + "learning_rate": 1.9768596184951174e-05, + "loss": 0.6158, + "step": 1219 + }, + { + "epoch": 0.09665280253515547, + "grad_norm": 2.4295227812181426, + "learning_rate": 1.9768047036694785e-05, + "loss": 0.6095, + "step": 1220 + }, + { + "epoch": 0.09673202614379085, + "grad_norm": 2.431015576210552, + "learning_rate": 1.9767497245259496e-05, + "loss": 0.5322, + "step": 1221 + }, + { + "epoch": 0.09681124975242622, + "grad_norm": 1.9937253550706193, + "learning_rate": 1.9766946810681517e-05, + "loss": 0.5104, + "step": 1222 + }, + { + "epoch": 0.0968904733610616, + "grad_norm": 2.3508815834530217, + "learning_rate": 1.9766395732997082e-05, + "loss": 0.4864, + "step": 1223 + }, + { + "epoch": 0.09696969696969697, + "grad_norm": 2.1732492944130612, + "learning_rate": 1.9765844012242482e-05, + "loss": 0.4722, + "step": 1224 + }, + { + "epoch": 0.09704892057833234, + "grad_norm": 2.1792013845006792, + "learning_rate": 1.9765291648454042e-05, + "loss": 0.4567, + "step": 1225 + }, + { + "epoch": 0.09712814418696772, + "grad_norm": 2.216061633850962, + "learning_rate": 1.9764738641668137e-05, + "loss": 0.5426, + "step": 1226 + }, + { + "epoch": 0.09720736779560309, + "grad_norm": 2.3564778208703823, + "learning_rate": 1.9764184991921178e-05, + "loss": 0.5025, + "step": 1227 + }, + { + "epoch": 0.09728659140423847, + "grad_norm": 2.20319294384829, + "learning_rate": 1.9763630699249615e-05, + "loss": 0.4232, + "step": 1228 + }, + { + "epoch": 0.09736581501287384, + "grad_norm": 2.071192665133225, + "learning_rate": 1.9763075763689956e-05, + "loss": 0.4236, + "step": 1229 + }, + { + "epoch": 0.0974450386215092, + "grad_norm": 1.9634205377783411, + "learning_rate": 1.9762520185278734e-05, + "loss": 0.416, + "step": 1230 + }, + { + "epoch": 0.09752426223014458, + "grad_norm": 2.741419263392541, + "learning_rate": 1.9761963964052528e-05, + "loss": 0.5124, + "step": 1231 + }, + { + "epoch": 0.09760348583877995, + "grad_norm": 2.146037377471575, + "learning_rate": 1.976140710004797e-05, + "loss": 0.5936, + "step": 1232 + }, + { + "epoch": 0.09768270944741533, + "grad_norm": 2.216905805926447, + "learning_rate": 1.976084959330172e-05, + "loss": 0.5554, + "step": 1233 + }, + { + "epoch": 0.0977619330560507, + "grad_norm": 2.0828080968632268, + "learning_rate": 1.9760291443850496e-05, + "loss": 0.5043, + "step": 1234 + }, + { + "epoch": 0.09784115666468608, + "grad_norm": 2.353320748286769, + "learning_rate": 1.9759732651731037e-05, + "loss": 0.4785, + "step": 1235 + }, + { + "epoch": 0.09792038027332145, + "grad_norm": 2.645633383650802, + "learning_rate": 1.975917321698015e-05, + "loss": 0.5462, + "step": 1236 + }, + { + "epoch": 0.09799960388195682, + "grad_norm": 1.977526715455474, + "learning_rate": 1.9758613139634662e-05, + "loss": 0.5362, + "step": 1237 + }, + { + "epoch": 0.0980788274905922, + "grad_norm": 2.3591579859369203, + "learning_rate": 1.975805241973145e-05, + "loss": 0.599, + "step": 1238 + }, + { + "epoch": 0.09815805109922757, + "grad_norm": 2.1680867235864634, + "learning_rate": 1.9757491057307448e-05, + "loss": 0.6011, + "step": 1239 + }, + { + "epoch": 0.09823727470786295, + "grad_norm": 1.9512126136847754, + "learning_rate": 1.9756929052399606e-05, + "loss": 0.4796, + "step": 1240 + }, + { + "epoch": 0.09831649831649832, + "grad_norm": 2.173023721130156, + "learning_rate": 1.9756366405044928e-05, + "loss": 0.4921, + "step": 1241 + }, + { + "epoch": 0.09839572192513368, + "grad_norm": 2.1913021786140483, + "learning_rate": 1.9755803115280476e-05, + "loss": 0.5157, + "step": 1242 + }, + { + "epoch": 0.09847494553376906, + "grad_norm": 2.766369888233421, + "learning_rate": 1.9755239183143323e-05, + "loss": 0.5181, + "step": 1243 + }, + { + "epoch": 0.09855416914240443, + "grad_norm": 2.2744231441113434, + "learning_rate": 1.9754674608670613e-05, + "loss": 0.6038, + "step": 1244 + }, + { + "epoch": 0.09863339275103981, + "grad_norm": 2.267371434111587, + "learning_rate": 1.9754109391899514e-05, + "loss": 0.555, + "step": 1245 + }, + { + "epoch": 0.09871261635967518, + "grad_norm": 2.25288566372562, + "learning_rate": 1.975354353286725e-05, + "loss": 0.4967, + "step": 1246 + }, + { + "epoch": 0.09879183996831056, + "grad_norm": 2.1701598042655204, + "learning_rate": 1.9752977031611072e-05, + "loss": 0.4921, + "step": 1247 + }, + { + "epoch": 0.09887106357694593, + "grad_norm": 2.6282075195812613, + "learning_rate": 1.9752409888168285e-05, + "loss": 0.5678, + "step": 1248 + }, + { + "epoch": 0.0989502871855813, + "grad_norm": 2.543478358588145, + "learning_rate": 1.975184210257623e-05, + "loss": 0.4775, + "step": 1249 + }, + { + "epoch": 0.09902951079421668, + "grad_norm": 1.9627462180607333, + "learning_rate": 1.97512736748723e-05, + "loss": 0.4183, + "step": 1250 + }, + { + "epoch": 0.09910873440285205, + "grad_norm": 2.4357277056177944, + "learning_rate": 1.975070460509392e-05, + "loss": 0.5256, + "step": 1251 + }, + { + "epoch": 0.09918795801148743, + "grad_norm": 2.5276796768433734, + "learning_rate": 1.9750134893278553e-05, + "loss": 0.5206, + "step": 1252 + }, + { + "epoch": 0.0992671816201228, + "grad_norm": 2.0876619760814834, + "learning_rate": 1.974956453946372e-05, + "loss": 0.4243, + "step": 1253 + }, + { + "epoch": 0.09934640522875816, + "grad_norm": 2.187654364862005, + "learning_rate": 1.9748993543686973e-05, + "loss": 0.4743, + "step": 1254 + }, + { + "epoch": 0.09942562883739355, + "grad_norm": 2.424028485743396, + "learning_rate": 1.9748421905985915e-05, + "loss": 0.5166, + "step": 1255 + }, + { + "epoch": 0.09950485244602891, + "grad_norm": 2.1951241985135312, + "learning_rate": 1.9747849626398176e-05, + "loss": 0.5178, + "step": 1256 + }, + { + "epoch": 0.0995840760546643, + "grad_norm": 2.0811995259320386, + "learning_rate": 1.9747276704961447e-05, + "loss": 0.4329, + "step": 1257 + }, + { + "epoch": 0.09966329966329966, + "grad_norm": 2.279498263179114, + "learning_rate": 1.9746703141713444e-05, + "loss": 0.5354, + "step": 1258 + }, + { + "epoch": 0.09974252327193504, + "grad_norm": 1.9895821829242717, + "learning_rate": 1.974612893669194e-05, + "loss": 0.4478, + "step": 1259 + }, + { + "epoch": 0.09982174688057041, + "grad_norm": 2.9207547936648877, + "learning_rate": 1.974555408993474e-05, + "loss": 0.5537, + "step": 1260 + }, + { + "epoch": 0.09990097048920578, + "grad_norm": 2.3649710729599565, + "learning_rate": 1.9744978601479693e-05, + "loss": 0.5399, + "step": 1261 + }, + { + "epoch": 0.09998019409784116, + "grad_norm": 2.3819317618700215, + "learning_rate": 1.97444024713647e-05, + "loss": 0.5064, + "step": 1262 + }, + { + "epoch": 0.10005941770647653, + "grad_norm": 2.1908432352275855, + "learning_rate": 1.9743825699627687e-05, + "loss": 0.5147, + "step": 1263 + }, + { + "epoch": 0.10013864131511191, + "grad_norm": 2.287527706598705, + "learning_rate": 1.974324828630664e-05, + "loss": 0.5913, + "step": 1264 + }, + { + "epoch": 0.10021786492374728, + "grad_norm": 2.406175882809149, + "learning_rate": 1.974267023143957e-05, + "loss": 0.4876, + "step": 1265 + }, + { + "epoch": 0.10029708853238264, + "grad_norm": 2.6522953771586018, + "learning_rate": 1.974209153506455e-05, + "loss": 0.6547, + "step": 1266 + }, + { + "epoch": 0.10037631214101803, + "grad_norm": 2.6774454544898947, + "learning_rate": 1.9741512197219675e-05, + "loss": 0.5214, + "step": 1267 + }, + { + "epoch": 0.1004555357496534, + "grad_norm": 2.284614930368995, + "learning_rate": 1.9740932217943095e-05, + "loss": 0.4807, + "step": 1268 + }, + { + "epoch": 0.10053475935828877, + "grad_norm": 2.0462928836102816, + "learning_rate": 1.9740351597272998e-05, + "loss": 0.5032, + "step": 1269 + }, + { + "epoch": 0.10061398296692414, + "grad_norm": 2.3892670018590443, + "learning_rate": 1.9739770335247616e-05, + "loss": 0.6089, + "step": 1270 + }, + { + "epoch": 0.10069320657555951, + "grad_norm": 2.1011891572579957, + "learning_rate": 1.9739188431905223e-05, + "loss": 0.6247, + "step": 1271 + }, + { + "epoch": 0.10077243018419489, + "grad_norm": 2.069333043250612, + "learning_rate": 1.9738605887284134e-05, + "loss": 0.5171, + "step": 1272 + }, + { + "epoch": 0.10085165379283026, + "grad_norm": 1.9830086566428888, + "learning_rate": 1.9738022701422705e-05, + "loss": 0.604, + "step": 1273 + }, + { + "epoch": 0.10093087740146564, + "grad_norm": 1.9189881387192915, + "learning_rate": 1.973743887435934e-05, + "loss": 0.4395, + "step": 1274 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 2.158610283217366, + "learning_rate": 1.9736854406132476e-05, + "loss": 0.4273, + "step": 1275 + }, + { + "epoch": 0.10108932461873639, + "grad_norm": 2.479550264026852, + "learning_rate": 1.9736269296780603e-05, + "loss": 0.475, + "step": 1276 + }, + { + "epoch": 0.10116854822737176, + "grad_norm": 2.2418316267104403, + "learning_rate": 1.9735683546342243e-05, + "loss": 0.5047, + "step": 1277 + }, + { + "epoch": 0.10124777183600712, + "grad_norm": 2.2333766677349445, + "learning_rate": 1.9735097154855968e-05, + "loss": 0.5292, + "step": 1278 + }, + { + "epoch": 0.1013269954446425, + "grad_norm": 2.3155805068531956, + "learning_rate": 1.9734510122360383e-05, + "loss": 0.6675, + "step": 1279 + }, + { + "epoch": 0.10140621905327787, + "grad_norm": 2.233679218583782, + "learning_rate": 1.973392244889415e-05, + "loss": 0.5609, + "step": 1280 + }, + { + "epoch": 0.10148544266191326, + "grad_norm": 2.2578654982470776, + "learning_rate": 1.9733334134495963e-05, + "loss": 0.5459, + "step": 1281 + }, + { + "epoch": 0.10156466627054862, + "grad_norm": 2.235927539895228, + "learning_rate": 1.9732745179204553e-05, + "loss": 0.5324, + "step": 1282 + }, + { + "epoch": 0.10164388987918399, + "grad_norm": 2.2734544531055936, + "learning_rate": 1.9732155583058705e-05, + "loss": 0.5646, + "step": 1283 + }, + { + "epoch": 0.10172311348781937, + "grad_norm": 2.3225330071712644, + "learning_rate": 1.973156534609724e-05, + "loss": 0.5223, + "step": 1284 + }, + { + "epoch": 0.10180233709645474, + "grad_norm": 2.1874778254951797, + "learning_rate": 1.973097446835902e-05, + "loss": 0.5121, + "step": 1285 + }, + { + "epoch": 0.10188156070509012, + "grad_norm": 1.7930253360562414, + "learning_rate": 1.9730382949882955e-05, + "loss": 0.3641, + "step": 1286 + }, + { + "epoch": 0.10196078431372549, + "grad_norm": 2.1533712556327, + "learning_rate": 1.9729790790707995e-05, + "loss": 0.478, + "step": 1287 + }, + { + "epoch": 0.10204000792236087, + "grad_norm": 2.188433134663067, + "learning_rate": 1.9729197990873127e-05, + "loss": 0.4731, + "step": 1288 + }, + { + "epoch": 0.10211923153099624, + "grad_norm": 3.7277942529749044, + "learning_rate": 1.9728604550417385e-05, + "loss": 0.5092, + "step": 1289 + }, + { + "epoch": 0.1021984551396316, + "grad_norm": 2.917114218579833, + "learning_rate": 1.9728010469379844e-05, + "loss": 0.4841, + "step": 1290 + }, + { + "epoch": 0.10227767874826699, + "grad_norm": 2.467788717668694, + "learning_rate": 1.972741574779962e-05, + "loss": 0.5663, + "step": 1291 + }, + { + "epoch": 0.10235690235690235, + "grad_norm": 2.166985276940756, + "learning_rate": 1.9726820385715877e-05, + "loss": 0.428, + "step": 1292 + }, + { + "epoch": 0.10243612596553774, + "grad_norm": 2.734128340973256, + "learning_rate": 1.9726224383167815e-05, + "loss": 0.5253, + "step": 1293 + }, + { + "epoch": 0.1025153495741731, + "grad_norm": 2.0422415700032412, + "learning_rate": 1.9725627740194673e-05, + "loss": 0.4421, + "step": 1294 + }, + { + "epoch": 0.10259457318280847, + "grad_norm": 2.089549173570466, + "learning_rate": 1.9725030456835745e-05, + "loss": 0.4337, + "step": 1295 + }, + { + "epoch": 0.10267379679144385, + "grad_norm": 2.459132497291346, + "learning_rate": 1.9724432533130355e-05, + "loss": 0.629, + "step": 1296 + }, + { + "epoch": 0.10275302040007922, + "grad_norm": 2.2745960847474733, + "learning_rate": 1.972383396911787e-05, + "loss": 0.5844, + "step": 1297 + }, + { + "epoch": 0.1028322440087146, + "grad_norm": 2.4412656469129512, + "learning_rate": 1.9723234764837708e-05, + "loss": 0.4796, + "step": 1298 + }, + { + "epoch": 0.10291146761734997, + "grad_norm": 2.092936022873505, + "learning_rate": 1.9722634920329323e-05, + "loss": 0.4889, + "step": 1299 + }, + { + "epoch": 0.10299069122598534, + "grad_norm": 2.376181841165693, + "learning_rate": 1.9722034435632207e-05, + "loss": 0.6405, + "step": 1300 + }, + { + "epoch": 0.10306991483462072, + "grad_norm": 2.6309636850104483, + "learning_rate": 1.972143331078591e-05, + "loss": 0.6636, + "step": 1301 + }, + { + "epoch": 0.10314913844325609, + "grad_norm": 2.125338740252849, + "learning_rate": 1.972083154583e-05, + "loss": 0.4635, + "step": 1302 + }, + { + "epoch": 0.10322836205189147, + "grad_norm": 2.015033388272449, + "learning_rate": 1.972022914080411e-05, + "loss": 0.4261, + "step": 1303 + }, + { + "epoch": 0.10330758566052684, + "grad_norm": 2.2598935735723997, + "learning_rate": 1.9719626095747897e-05, + "loss": 0.5767, + "step": 1304 + }, + { + "epoch": 0.10338680926916222, + "grad_norm": 1.8116809672507894, + "learning_rate": 1.971902241070108e-05, + "loss": 0.4311, + "step": 1305 + }, + { + "epoch": 0.10346603287779758, + "grad_norm": 2.628031805876651, + "learning_rate": 1.9718418085703397e-05, + "loss": 0.6494, + "step": 1306 + }, + { + "epoch": 0.10354525648643295, + "grad_norm": 2.04617520653546, + "learning_rate": 1.971781312079465e-05, + "loss": 0.4754, + "step": 1307 + }, + { + "epoch": 0.10362448009506833, + "grad_norm": 1.95774659537593, + "learning_rate": 1.9717207516014664e-05, + "loss": 0.4569, + "step": 1308 + }, + { + "epoch": 0.1037037037037037, + "grad_norm": 2.117897026087532, + "learning_rate": 1.9716601271403322e-05, + "loss": 0.4425, + "step": 1309 + }, + { + "epoch": 0.10378292731233908, + "grad_norm": 2.815940052246426, + "learning_rate": 1.9715994387000537e-05, + "loss": 0.5858, + "step": 1310 + }, + { + "epoch": 0.10386215092097445, + "grad_norm": 2.4192115644317846, + "learning_rate": 1.9715386862846272e-05, + "loss": 0.5589, + "step": 1311 + }, + { + "epoch": 0.10394137452960982, + "grad_norm": 2.779990687395962, + "learning_rate": 1.971477869898053e-05, + "loss": 0.4776, + "step": 1312 + }, + { + "epoch": 0.1040205981382452, + "grad_norm": 2.3997539806034967, + "learning_rate": 1.9714169895443357e-05, + "loss": 0.4748, + "step": 1313 + }, + { + "epoch": 0.10409982174688057, + "grad_norm": 2.4012592625019913, + "learning_rate": 1.971356045227484e-05, + "loss": 0.5633, + "step": 1314 + }, + { + "epoch": 0.10417904535551595, + "grad_norm": 2.2427710239188583, + "learning_rate": 1.97129503695151e-05, + "loss": 0.5419, + "step": 1315 + }, + { + "epoch": 0.10425826896415132, + "grad_norm": 2.28467564395694, + "learning_rate": 1.9712339647204313e-05, + "loss": 0.4789, + "step": 1316 + }, + { + "epoch": 0.1043374925727867, + "grad_norm": 3.0056755188812057, + "learning_rate": 1.97117282853827e-05, + "loss": 0.3512, + "step": 1317 + }, + { + "epoch": 0.10441671618142206, + "grad_norm": 2.7055295071619554, + "learning_rate": 1.9711116284090506e-05, + "loss": 0.6086, + "step": 1318 + }, + { + "epoch": 0.10449593979005743, + "grad_norm": 3.3132267338184374, + "learning_rate": 1.971050364336803e-05, + "loss": 0.5974, + "step": 1319 + }, + { + "epoch": 0.10457516339869281, + "grad_norm": 3.565549385918516, + "learning_rate": 1.9709890363255617e-05, + "loss": 0.5051, + "step": 1320 + }, + { + "epoch": 0.10465438700732818, + "grad_norm": 3.882957308066483, + "learning_rate": 1.9709276443793638e-05, + "loss": 0.6568, + "step": 1321 + }, + { + "epoch": 0.10473361061596356, + "grad_norm": 2.776105672136285, + "learning_rate": 1.970866188502253e-05, + "loss": 0.6399, + "step": 1322 + }, + { + "epoch": 0.10481283422459893, + "grad_norm": 4.536532459565255, + "learning_rate": 1.970804668698275e-05, + "loss": 0.4474, + "step": 1323 + }, + { + "epoch": 0.1048920578332343, + "grad_norm": 2.7320798978149883, + "learning_rate": 1.970743084971481e-05, + "loss": 0.5845, + "step": 1324 + }, + { + "epoch": 0.10497128144186968, + "grad_norm": 1.8859528357289852, + "learning_rate": 1.970681437325925e-05, + "loss": 0.4127, + "step": 1325 + }, + { + "epoch": 0.10505050505050505, + "grad_norm": 2.351724306388326, + "learning_rate": 1.9706197257656675e-05, + "loss": 0.6094, + "step": 1326 + }, + { + "epoch": 0.10512972865914043, + "grad_norm": 2.1602675824098365, + "learning_rate": 1.9705579502947712e-05, + "loss": 0.4579, + "step": 1327 + }, + { + "epoch": 0.1052089522677758, + "grad_norm": 2.1566597592068426, + "learning_rate": 1.9704961109173042e-05, + "loss": 0.6245, + "step": 1328 + }, + { + "epoch": 0.10528817587641116, + "grad_norm": 2.456475444549144, + "learning_rate": 1.9704342076373378e-05, + "loss": 0.5567, + "step": 1329 + }, + { + "epoch": 0.10536739948504655, + "grad_norm": 2.361525570523304, + "learning_rate": 1.9703722404589484e-05, + "loss": 0.4448, + "step": 1330 + }, + { + "epoch": 0.10544662309368191, + "grad_norm": 2.089131308783069, + "learning_rate": 1.970310209386216e-05, + "loss": 0.4669, + "step": 1331 + }, + { + "epoch": 0.1055258467023173, + "grad_norm": 2.215335156635847, + "learning_rate": 1.9702481144232253e-05, + "loss": 0.462, + "step": 1332 + }, + { + "epoch": 0.10560507031095266, + "grad_norm": 2.1499694316036804, + "learning_rate": 1.9701859555740647e-05, + "loss": 0.4532, + "step": 1333 + }, + { + "epoch": 0.10568429391958804, + "grad_norm": 2.0674533804386206, + "learning_rate": 1.9701237328428272e-05, + "loss": 0.5602, + "step": 1334 + }, + { + "epoch": 0.10576351752822341, + "grad_norm": 2.2777257235135764, + "learning_rate": 1.9700614462336096e-05, + "loss": 0.5804, + "step": 1335 + }, + { + "epoch": 0.10584274113685878, + "grad_norm": 2.450934116180437, + "learning_rate": 1.9699990957505136e-05, + "loss": 0.552, + "step": 1336 + }, + { + "epoch": 0.10592196474549416, + "grad_norm": 2.074400066835221, + "learning_rate": 1.9699366813976443e-05, + "loss": 0.4621, + "step": 1337 + }, + { + "epoch": 0.10600118835412953, + "grad_norm": 2.0631351800189734, + "learning_rate": 1.9698742031791118e-05, + "loss": 0.582, + "step": 1338 + }, + { + "epoch": 0.10608041196276491, + "grad_norm": 2.252480410369213, + "learning_rate": 1.96981166109903e-05, + "loss": 0.4432, + "step": 1339 + }, + { + "epoch": 0.10615963557140028, + "grad_norm": 2.383505553669203, + "learning_rate": 1.9697490551615162e-05, + "loss": 0.5494, + "step": 1340 + }, + { + "epoch": 0.10623885918003564, + "grad_norm": 2.4455835402480877, + "learning_rate": 1.9696863853706937e-05, + "loss": 0.4431, + "step": 1341 + }, + { + "epoch": 0.10631808278867103, + "grad_norm": 1.7569167065533893, + "learning_rate": 1.969623651730688e-05, + "loss": 0.3387, + "step": 1342 + }, + { + "epoch": 0.1063973063973064, + "grad_norm": 2.409163867936381, + "learning_rate": 1.969560854245631e-05, + "loss": 0.5591, + "step": 1343 + }, + { + "epoch": 0.10647653000594177, + "grad_norm": 1.9511644788047329, + "learning_rate": 1.9694979929196566e-05, + "loss": 0.4673, + "step": 1344 + }, + { + "epoch": 0.10655575361457714, + "grad_norm": 2.057363452047913, + "learning_rate": 1.9694350677569043e-05, + "loss": 0.4632, + "step": 1345 + }, + { + "epoch": 0.10663497722321252, + "grad_norm": 2.214010071480097, + "learning_rate": 1.9693720787615174e-05, + "loss": 0.5368, + "step": 1346 + }, + { + "epoch": 0.10671420083184789, + "grad_norm": 1.8988530420406244, + "learning_rate": 1.9693090259376436e-05, + "loss": 0.3494, + "step": 1347 + }, + { + "epoch": 0.10679342444048326, + "grad_norm": 2.074989357202546, + "learning_rate": 1.9692459092894343e-05, + "loss": 0.5136, + "step": 1348 + }, + { + "epoch": 0.10687264804911864, + "grad_norm": 2.207738979422821, + "learning_rate": 1.969182728821046e-05, + "loss": 0.5687, + "step": 1349 + }, + { + "epoch": 0.10695187165775401, + "grad_norm": 2.4109550297326314, + "learning_rate": 1.969119484536638e-05, + "loss": 0.4685, + "step": 1350 + }, + { + "epoch": 0.10703109526638939, + "grad_norm": 2.1401189992372496, + "learning_rate": 1.969056176440375e-05, + "loss": 0.4236, + "step": 1351 + }, + { + "epoch": 0.10711031887502476, + "grad_norm": 2.0901917705160407, + "learning_rate": 1.9689928045364258e-05, + "loss": 0.4008, + "step": 1352 + }, + { + "epoch": 0.10718954248366012, + "grad_norm": 2.191145863247345, + "learning_rate": 1.9689293688289627e-05, + "loss": 0.5192, + "step": 1353 + }, + { + "epoch": 0.1072687660922955, + "grad_norm": 1.9571435588422554, + "learning_rate": 1.968865869322163e-05, + "loss": 0.4484, + "step": 1354 + }, + { + "epoch": 0.10734798970093087, + "grad_norm": 2.5446998878971017, + "learning_rate": 1.968802306020208e-05, + "loss": 0.587, + "step": 1355 + }, + { + "epoch": 0.10742721330956626, + "grad_norm": 2.396013303266199, + "learning_rate": 1.968738678927282e-05, + "loss": 0.6018, + "step": 1356 + }, + { + "epoch": 0.10750643691820162, + "grad_norm": 2.3787680427272186, + "learning_rate": 1.9686749880475756e-05, + "loss": 0.4426, + "step": 1357 + }, + { + "epoch": 0.107585660526837, + "grad_norm": 2.2608244331470178, + "learning_rate": 1.9686112333852826e-05, + "loss": 0.5174, + "step": 1358 + }, + { + "epoch": 0.10766488413547237, + "grad_norm": 2.1900368568921755, + "learning_rate": 1.9685474149446e-05, + "loss": 0.5377, + "step": 1359 + }, + { + "epoch": 0.10774410774410774, + "grad_norm": 2.2683388133593625, + "learning_rate": 1.9684835327297306e-05, + "loss": 0.4892, + "step": 1360 + }, + { + "epoch": 0.10782333135274312, + "grad_norm": 2.377612781702832, + "learning_rate": 1.9684195867448806e-05, + "loss": 0.4858, + "step": 1361 + }, + { + "epoch": 0.10790255496137849, + "grad_norm": 2.721941567580665, + "learning_rate": 1.9683555769942608e-05, + "loss": 0.5264, + "step": 1362 + }, + { + "epoch": 0.10798177857001387, + "grad_norm": 2.1630529412613586, + "learning_rate": 1.968291503482086e-05, + "loss": 0.4109, + "step": 1363 + }, + { + "epoch": 0.10806100217864924, + "grad_norm": 2.532963871025193, + "learning_rate": 1.968227366212574e-05, + "loss": 0.5461, + "step": 1364 + }, + { + "epoch": 0.1081402257872846, + "grad_norm": 2.0279043105152805, + "learning_rate": 1.968163165189949e-05, + "loss": 0.5266, + "step": 1365 + }, + { + "epoch": 0.10821944939591999, + "grad_norm": 2.181519166040357, + "learning_rate": 1.9680989004184383e-05, + "loss": 0.4409, + "step": 1366 + }, + { + "epoch": 0.10829867300455535, + "grad_norm": 2.0071130131159545, + "learning_rate": 1.968034571902273e-05, + "loss": 0.5636, + "step": 1367 + }, + { + "epoch": 0.10837789661319074, + "grad_norm": 2.0732762064516606, + "learning_rate": 1.967970179645689e-05, + "loss": 0.3248, + "step": 1368 + }, + { + "epoch": 0.1084571202218261, + "grad_norm": 2.323664796075749, + "learning_rate": 1.9679057236529266e-05, + "loss": 0.5848, + "step": 1369 + }, + { + "epoch": 0.10853634383046147, + "grad_norm": 2.770374447369148, + "learning_rate": 1.9678412039282292e-05, + "loss": 0.6797, + "step": 1370 + }, + { + "epoch": 0.10861556743909685, + "grad_norm": 2.418803718526639, + "learning_rate": 1.967776620475846e-05, + "loss": 0.443, + "step": 1371 + }, + { + "epoch": 0.10869479104773222, + "grad_norm": 2.2391812851513375, + "learning_rate": 1.9677119733000283e-05, + "loss": 0.5881, + "step": 1372 + }, + { + "epoch": 0.1087740146563676, + "grad_norm": 2.5260582610737243, + "learning_rate": 1.967647262405034e-05, + "loss": 0.4752, + "step": 1373 + }, + { + "epoch": 0.10885323826500297, + "grad_norm": 2.0597832877880284, + "learning_rate": 1.967582487795123e-05, + "loss": 0.3699, + "step": 1374 + }, + { + "epoch": 0.10893246187363835, + "grad_norm": 2.0817716920621945, + "learning_rate": 1.967517649474561e-05, + "loss": 0.4187, + "step": 1375 + }, + { + "epoch": 0.10901168548227372, + "grad_norm": 1.9458466176770466, + "learning_rate": 1.9674527474476175e-05, + "loss": 0.4809, + "step": 1376 + }, + { + "epoch": 0.10909090909090909, + "grad_norm": 2.10176318772313, + "learning_rate": 1.9673877817185656e-05, + "loss": 0.4342, + "step": 1377 + }, + { + "epoch": 0.10917013269954447, + "grad_norm": 2.2836851871431145, + "learning_rate": 1.9673227522916827e-05, + "loss": 0.5271, + "step": 1378 + }, + { + "epoch": 0.10924935630817983, + "grad_norm": 1.90201178977515, + "learning_rate": 1.9672576591712517e-05, + "loss": 0.4403, + "step": 1379 + }, + { + "epoch": 0.10932857991681522, + "grad_norm": 1.985590042962168, + "learning_rate": 1.9671925023615572e-05, + "loss": 0.4756, + "step": 1380 + }, + { + "epoch": 0.10940780352545058, + "grad_norm": 2.0719820061206757, + "learning_rate": 1.9671272818668906e-05, + "loss": 0.4786, + "step": 1381 + }, + { + "epoch": 0.10948702713408595, + "grad_norm": 2.1166472216413292, + "learning_rate": 1.967061997691546e-05, + "loss": 0.394, + "step": 1382 + }, + { + "epoch": 0.10956625074272133, + "grad_norm": 2.3947940276839836, + "learning_rate": 1.966996649839822e-05, + "loss": 0.5613, + "step": 1383 + }, + { + "epoch": 0.1096454743513567, + "grad_norm": 1.9176786321029364, + "learning_rate": 1.9669312383160217e-05, + "loss": 0.4751, + "step": 1384 + }, + { + "epoch": 0.10972469795999208, + "grad_norm": 2.4938684252566565, + "learning_rate": 1.966865763124452e-05, + "loss": 0.5545, + "step": 1385 + }, + { + "epoch": 0.10980392156862745, + "grad_norm": 2.1487134280280507, + "learning_rate": 1.966800224269424e-05, + "loss": 0.4548, + "step": 1386 + }, + { + "epoch": 0.10988314517726283, + "grad_norm": 2.1239835539558767, + "learning_rate": 1.9667346217552528e-05, + "loss": 0.5169, + "step": 1387 + }, + { + "epoch": 0.1099623687858982, + "grad_norm": 2.0449107987637816, + "learning_rate": 1.9666689555862586e-05, + "loss": 0.4512, + "step": 1388 + }, + { + "epoch": 0.11004159239453357, + "grad_norm": 1.8557761825547496, + "learning_rate": 1.966603225766765e-05, + "loss": 0.4105, + "step": 1389 + }, + { + "epoch": 0.11012081600316895, + "grad_norm": 1.9996639683551252, + "learning_rate": 1.9665374323011002e-05, + "loss": 0.4897, + "step": 1390 + }, + { + "epoch": 0.11020003961180432, + "grad_norm": 1.9564383286329299, + "learning_rate": 1.9664715751935958e-05, + "loss": 0.3754, + "step": 1391 + }, + { + "epoch": 0.1102792632204397, + "grad_norm": 1.9868567196309925, + "learning_rate": 1.9664056544485887e-05, + "loss": 0.5438, + "step": 1392 + }, + { + "epoch": 0.11035848682907506, + "grad_norm": 2.327529420723129, + "learning_rate": 1.9663396700704195e-05, + "loss": 0.4414, + "step": 1393 + }, + { + "epoch": 0.11043771043771043, + "grad_norm": 1.8880730434393933, + "learning_rate": 1.9662736220634325e-05, + "loss": 0.4322, + "step": 1394 + }, + { + "epoch": 0.11051693404634581, + "grad_norm": 2.1486232593957486, + "learning_rate": 1.966207510431977e-05, + "loss": 0.4426, + "step": 1395 + }, + { + "epoch": 0.11059615765498118, + "grad_norm": 1.9612609312177982, + "learning_rate": 1.966141335180406e-05, + "loss": 0.4507, + "step": 1396 + }, + { + "epoch": 0.11067538126361656, + "grad_norm": 1.9754263857552985, + "learning_rate": 1.966075096313077e-05, + "loss": 0.4621, + "step": 1397 + }, + { + "epoch": 0.11075460487225193, + "grad_norm": 2.0349815536675075, + "learning_rate": 1.966008793834351e-05, + "loss": 0.5256, + "step": 1398 + }, + { + "epoch": 0.1108338284808873, + "grad_norm": 1.9520324765110302, + "learning_rate": 1.9659424277485943e-05, + "loss": 0.4565, + "step": 1399 + }, + { + "epoch": 0.11091305208952268, + "grad_norm": 2.2345829054299577, + "learning_rate": 1.9658759980601766e-05, + "loss": 0.3995, + "step": 1400 + }, + { + "epoch": 0.11099227569815805, + "grad_norm": 2.1041284053669203, + "learning_rate": 1.9658095047734718e-05, + "loss": 0.507, + "step": 1401 + }, + { + "epoch": 0.11107149930679343, + "grad_norm": 1.851611024030357, + "learning_rate": 1.965742947892858e-05, + "loss": 0.3452, + "step": 1402 + }, + { + "epoch": 0.1111507229154288, + "grad_norm": 2.1897687510782746, + "learning_rate": 1.9656763274227188e-05, + "loss": 0.4197, + "step": 1403 + }, + { + "epoch": 0.11122994652406418, + "grad_norm": 2.0738946031359617, + "learning_rate": 1.9656096433674393e-05, + "loss": 0.4331, + "step": 1404 + }, + { + "epoch": 0.11130917013269954, + "grad_norm": 2.077409424094191, + "learning_rate": 1.965542895731411e-05, + "loss": 0.4854, + "step": 1405 + }, + { + "epoch": 0.11138839374133491, + "grad_norm": 2.1631282545969426, + "learning_rate": 1.965476084519029e-05, + "loss": 0.4984, + "step": 1406 + }, + { + "epoch": 0.1114676173499703, + "grad_norm": 2.1110815671663454, + "learning_rate": 1.9654092097346925e-05, + "loss": 0.548, + "step": 1407 + }, + { + "epoch": 0.11154684095860566, + "grad_norm": 2.350641338008749, + "learning_rate": 1.965342271382805e-05, + "loss": 0.4873, + "step": 1408 + }, + { + "epoch": 0.11162606456724104, + "grad_norm": 2.3327262249107763, + "learning_rate": 1.9652752694677735e-05, + "loss": 0.467, + "step": 1409 + }, + { + "epoch": 0.11170528817587641, + "grad_norm": 2.1904935250138218, + "learning_rate": 1.9652082039940102e-05, + "loss": 0.4448, + "step": 1410 + }, + { + "epoch": 0.11178451178451178, + "grad_norm": 2.1684146924547227, + "learning_rate": 1.965141074965931e-05, + "loss": 0.5793, + "step": 1411 + }, + { + "epoch": 0.11186373539314716, + "grad_norm": 2.394931541780458, + "learning_rate": 1.965073882387956e-05, + "loss": 0.5232, + "step": 1412 + }, + { + "epoch": 0.11194295900178253, + "grad_norm": 1.8008395756504112, + "learning_rate": 1.9650066262645097e-05, + "loss": 0.3579, + "step": 1413 + }, + { + "epoch": 0.11202218261041791, + "grad_norm": 1.7880135468872924, + "learning_rate": 1.96493930660002e-05, + "loss": 0.4693, + "step": 1414 + }, + { + "epoch": 0.11210140621905328, + "grad_norm": 2.2555014374847113, + "learning_rate": 1.9648719233989202e-05, + "loss": 0.4871, + "step": 1415 + }, + { + "epoch": 0.11218062982768866, + "grad_norm": 2.123410435394117, + "learning_rate": 1.9648044766656466e-05, + "loss": 0.4627, + "step": 1416 + }, + { + "epoch": 0.11225985343632403, + "grad_norm": 1.939601690782903, + "learning_rate": 1.9647369664046407e-05, + "loss": 0.4424, + "step": 1417 + }, + { + "epoch": 0.11233907704495939, + "grad_norm": 2.281327273341833, + "learning_rate": 1.9646693926203477e-05, + "loss": 0.6591, + "step": 1418 + }, + { + "epoch": 0.11241830065359477, + "grad_norm": 2.1106466856930286, + "learning_rate": 1.964601755317217e-05, + "loss": 0.4574, + "step": 1419 + }, + { + "epoch": 0.11249752426223014, + "grad_norm": 1.9544101324947374, + "learning_rate": 1.9645340544997017e-05, + "loss": 0.4516, + "step": 1420 + }, + { + "epoch": 0.11257674787086552, + "grad_norm": 1.9559669198281695, + "learning_rate": 1.9644662901722603e-05, + "loss": 0.469, + "step": 1421 + }, + { + "epoch": 0.11265597147950089, + "grad_norm": 2.117404733421457, + "learning_rate": 1.9643984623393542e-05, + "loss": 0.3933, + "step": 1422 + }, + { + "epoch": 0.11273519508813626, + "grad_norm": 1.9197144873396272, + "learning_rate": 1.96433057100545e-05, + "loss": 0.5246, + "step": 1423 + }, + { + "epoch": 0.11281441869677164, + "grad_norm": 2.154385418286073, + "learning_rate": 1.9642626161750176e-05, + "loss": 0.5501, + "step": 1424 + }, + { + "epoch": 0.11289364230540701, + "grad_norm": 2.1084457698898036, + "learning_rate": 1.9641945978525318e-05, + "loss": 0.4261, + "step": 1425 + }, + { + "epoch": 0.11297286591404239, + "grad_norm": 2.2243457030478746, + "learning_rate": 1.9641265160424705e-05, + "loss": 0.568, + "step": 1426 + }, + { + "epoch": 0.11305208952267776, + "grad_norm": 2.5108768972557707, + "learning_rate": 1.9640583707493176e-05, + "loss": 0.4744, + "step": 1427 + }, + { + "epoch": 0.11313131313131314, + "grad_norm": 2.0892895286912894, + "learning_rate": 1.96399016197756e-05, + "loss": 0.4505, + "step": 1428 + }, + { + "epoch": 0.1132105367399485, + "grad_norm": 2.3158352460670066, + "learning_rate": 1.9639218897316885e-05, + "loss": 0.5378, + "step": 1429 + }, + { + "epoch": 0.11328976034858387, + "grad_norm": 2.340186869923995, + "learning_rate": 1.9638535540161988e-05, + "loss": 0.4724, + "step": 1430 + }, + { + "epoch": 0.11336898395721925, + "grad_norm": 2.6492115525694704, + "learning_rate": 1.96378515483559e-05, + "loss": 0.496, + "step": 1431 + }, + { + "epoch": 0.11344820756585462, + "grad_norm": 2.4004721933968187, + "learning_rate": 1.9637166921943663e-05, + "loss": 0.5341, + "step": 1432 + }, + { + "epoch": 0.11352743117449, + "grad_norm": 1.9682433988575243, + "learning_rate": 1.963648166097036e-05, + "loss": 0.4253, + "step": 1433 + }, + { + "epoch": 0.11360665478312537, + "grad_norm": 2.1500121868081763, + "learning_rate": 1.9635795765481102e-05, + "loss": 0.4655, + "step": 1434 + }, + { + "epoch": 0.11368587839176074, + "grad_norm": 2.1591082401339805, + "learning_rate": 1.9635109235521057e-05, + "loss": 0.5482, + "step": 1435 + }, + { + "epoch": 0.11376510200039612, + "grad_norm": 2.1846671812269802, + "learning_rate": 1.963442207113543e-05, + "loss": 0.5818, + "step": 1436 + }, + { + "epoch": 0.11384432560903149, + "grad_norm": 1.9687121902879365, + "learning_rate": 1.9633734272369473e-05, + "loss": 0.5662, + "step": 1437 + }, + { + "epoch": 0.11392354921766687, + "grad_norm": 2.3071778648916594, + "learning_rate": 1.9633045839268464e-05, + "loss": 0.4821, + "step": 1438 + }, + { + "epoch": 0.11400277282630224, + "grad_norm": 2.140676102643196, + "learning_rate": 1.9632356771877735e-05, + "loss": 0.4727, + "step": 1439 + }, + { + "epoch": 0.1140819964349376, + "grad_norm": 1.9071046885629488, + "learning_rate": 1.9631667070242667e-05, + "loss": 0.444, + "step": 1440 + }, + { + "epoch": 0.11416122004357299, + "grad_norm": 2.291727933785507, + "learning_rate": 1.963097673440866e-05, + "loss": 0.5207, + "step": 1441 + }, + { + "epoch": 0.11424044365220835, + "grad_norm": 2.34082057747678, + "learning_rate": 1.9630285764421183e-05, + "loss": 0.5034, + "step": 1442 + }, + { + "epoch": 0.11431966726084374, + "grad_norm": 1.8877917306566079, + "learning_rate": 1.9629594160325725e-05, + "loss": 0.425, + "step": 1443 + }, + { + "epoch": 0.1143988908694791, + "grad_norm": 2.3941728626559975, + "learning_rate": 1.9628901922167823e-05, + "loss": 0.5708, + "step": 1444 + }, + { + "epoch": 0.11447811447811448, + "grad_norm": 2.434665449999772, + "learning_rate": 1.9628209049993064e-05, + "loss": 0.5163, + "step": 1445 + }, + { + "epoch": 0.11455733808674985, + "grad_norm": 1.956106966260622, + "learning_rate": 1.9627515543847068e-05, + "loss": 0.5267, + "step": 1446 + }, + { + "epoch": 0.11463656169538522, + "grad_norm": 2.2697471101411324, + "learning_rate": 1.9626821403775494e-05, + "loss": 0.4266, + "step": 1447 + }, + { + "epoch": 0.1147157853040206, + "grad_norm": 2.248906086278467, + "learning_rate": 1.9626126629824056e-05, + "loss": 0.5469, + "step": 1448 + }, + { + "epoch": 0.11479500891265597, + "grad_norm": 2.111508000638065, + "learning_rate": 1.9625431222038494e-05, + "loss": 0.5685, + "step": 1449 + }, + { + "epoch": 0.11487423252129135, + "grad_norm": 2.236947028253924, + "learning_rate": 1.9624735180464602e-05, + "loss": 0.5866, + "step": 1450 + }, + { + "epoch": 0.11495345612992672, + "grad_norm": 2.7930780903054147, + "learning_rate": 1.962403850514821e-05, + "loss": 0.6704, + "step": 1451 + }, + { + "epoch": 0.11503267973856209, + "grad_norm": 1.918162529432279, + "learning_rate": 1.962334119613519e-05, + "loss": 0.4194, + "step": 1452 + }, + { + "epoch": 0.11511190334719747, + "grad_norm": 1.8806060879712632, + "learning_rate": 1.9622643253471457e-05, + "loss": 0.432, + "step": 1453 + }, + { + "epoch": 0.11519112695583283, + "grad_norm": 2.4150856120857673, + "learning_rate": 1.9621944677202966e-05, + "loss": 0.5439, + "step": 1454 + }, + { + "epoch": 0.11527035056446822, + "grad_norm": 1.8366272493038598, + "learning_rate": 1.9621245467375715e-05, + "loss": 0.4434, + "step": 1455 + }, + { + "epoch": 0.11534957417310358, + "grad_norm": 2.2143508571733865, + "learning_rate": 1.9620545624035748e-05, + "loss": 0.6679, + "step": 1456 + }, + { + "epoch": 0.11542879778173896, + "grad_norm": 2.0768577960184995, + "learning_rate": 1.961984514722914e-05, + "loss": 0.419, + "step": 1457 + }, + { + "epoch": 0.11550802139037433, + "grad_norm": 2.19651941167457, + "learning_rate": 1.9619144037002015e-05, + "loss": 0.3883, + "step": 1458 + }, + { + "epoch": 0.1155872449990097, + "grad_norm": 2.0519791683137574, + "learning_rate": 1.9618442293400544e-05, + "loss": 0.4742, + "step": 1459 + }, + { + "epoch": 0.11566646860764508, + "grad_norm": 2.041938262311429, + "learning_rate": 1.9617739916470926e-05, + "loss": 0.5295, + "step": 1460 + }, + { + "epoch": 0.11574569221628045, + "grad_norm": 2.291458185449157, + "learning_rate": 1.9617036906259416e-05, + "loss": 0.563, + "step": 1461 + }, + { + "epoch": 0.11582491582491583, + "grad_norm": 2.2623712408457335, + "learning_rate": 1.9616333262812298e-05, + "loss": 0.4523, + "step": 1462 + }, + { + "epoch": 0.1159041394335512, + "grad_norm": 2.121016687689861, + "learning_rate": 1.9615628986175902e-05, + "loss": 0.4096, + "step": 1463 + }, + { + "epoch": 0.11598336304218657, + "grad_norm": 1.8709177349527826, + "learning_rate": 1.9614924076396605e-05, + "loss": 0.461, + "step": 1464 + }, + { + "epoch": 0.11606258665082195, + "grad_norm": 1.9476571336567652, + "learning_rate": 1.9614218533520827e-05, + "loss": 0.4157, + "step": 1465 + }, + { + "epoch": 0.11614181025945731, + "grad_norm": 1.7256799191617038, + "learning_rate": 1.9613512357595014e-05, + "loss": 0.374, + "step": 1466 + }, + { + "epoch": 0.1162210338680927, + "grad_norm": 2.8457128537080214, + "learning_rate": 1.9612805548665673e-05, + "loss": 0.503, + "step": 1467 + }, + { + "epoch": 0.11630025747672806, + "grad_norm": 2.518083460143727, + "learning_rate": 1.961209810677934e-05, + "loss": 0.5476, + "step": 1468 + }, + { + "epoch": 0.11637948108536343, + "grad_norm": 2.23278768736079, + "learning_rate": 1.9611390031982595e-05, + "loss": 0.525, + "step": 1469 + }, + { + "epoch": 0.11645870469399881, + "grad_norm": 1.9076000265786033, + "learning_rate": 1.9610681324322068e-05, + "loss": 0.3774, + "step": 1470 + }, + { + "epoch": 0.11653792830263418, + "grad_norm": 2.2776735570874536, + "learning_rate": 1.9609971983844412e-05, + "loss": 0.5078, + "step": 1471 + }, + { + "epoch": 0.11661715191126956, + "grad_norm": 1.8411926260518676, + "learning_rate": 1.9609262010596346e-05, + "loss": 0.3922, + "step": 1472 + }, + { + "epoch": 0.11669637551990493, + "grad_norm": 1.9701215709698423, + "learning_rate": 1.9608551404624613e-05, + "loss": 0.5038, + "step": 1473 + }, + { + "epoch": 0.11677559912854031, + "grad_norm": 1.9612736617063422, + "learning_rate": 1.9607840165976003e-05, + "loss": 0.4469, + "step": 1474 + }, + { + "epoch": 0.11685482273717568, + "grad_norm": 2.2336520853868955, + "learning_rate": 1.960712829469735e-05, + "loss": 0.5173, + "step": 1475 + }, + { + "epoch": 0.11693404634581105, + "grad_norm": 1.9833072142438084, + "learning_rate": 1.9606415790835523e-05, + "loss": 0.4824, + "step": 1476 + }, + { + "epoch": 0.11701326995444643, + "grad_norm": 1.9177108554461322, + "learning_rate": 1.9605702654437438e-05, + "loss": 0.3869, + "step": 1477 + }, + { + "epoch": 0.1170924935630818, + "grad_norm": 1.981158209605318, + "learning_rate": 1.9604988885550056e-05, + "loss": 0.4094, + "step": 1478 + }, + { + "epoch": 0.11717171717171718, + "grad_norm": 2.0259390748646298, + "learning_rate": 1.960427448422037e-05, + "loss": 0.5093, + "step": 1479 + }, + { + "epoch": 0.11725094078035254, + "grad_norm": 2.0448675329673716, + "learning_rate": 1.9603559450495423e-05, + "loss": 0.4534, + "step": 1480 + }, + { + "epoch": 0.11733016438898791, + "grad_norm": 1.8984610544193488, + "learning_rate": 1.9602843784422297e-05, + "loss": 0.3554, + "step": 1481 + }, + { + "epoch": 0.1174093879976233, + "grad_norm": 1.9395398700099775, + "learning_rate": 1.9602127486048112e-05, + "loss": 0.4022, + "step": 1482 + }, + { + "epoch": 0.11748861160625866, + "grad_norm": 2.0163720396854505, + "learning_rate": 1.9601410555420035e-05, + "loss": 0.3974, + "step": 1483 + }, + { + "epoch": 0.11756783521489404, + "grad_norm": 2.05054203776624, + "learning_rate": 1.9600692992585275e-05, + "loss": 0.4397, + "step": 1484 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 2.214079054262502, + "learning_rate": 1.959997479759107e-05, + "loss": 0.4751, + "step": 1485 + }, + { + "epoch": 0.11772628243216479, + "grad_norm": 2.2950353059232116, + "learning_rate": 1.959925597048472e-05, + "loss": 0.5284, + "step": 1486 + }, + { + "epoch": 0.11780550604080016, + "grad_norm": 1.9920905764637256, + "learning_rate": 1.9598536511313553e-05, + "loss": 0.4613, + "step": 1487 + }, + { + "epoch": 0.11788472964943553, + "grad_norm": 2.1118488569468488, + "learning_rate": 1.9597816420124945e-05, + "loss": 0.5998, + "step": 1488 + }, + { + "epoch": 0.11796395325807091, + "grad_norm": 2.2433354406277664, + "learning_rate": 1.95970956969663e-05, + "loss": 0.5527, + "step": 1489 + }, + { + "epoch": 0.11804317686670628, + "grad_norm": 1.8367627795571106, + "learning_rate": 1.9596374341885093e-05, + "loss": 0.5335, + "step": 1490 + }, + { + "epoch": 0.11812240047534166, + "grad_norm": 1.7407102159496246, + "learning_rate": 1.95956523549288e-05, + "loss": 0.4402, + "step": 1491 + }, + { + "epoch": 0.11820162408397702, + "grad_norm": 2.112722245176321, + "learning_rate": 1.9594929736144978e-05, + "loss": 0.4479, + "step": 1492 + }, + { + "epoch": 0.11828084769261239, + "grad_norm": 2.462689716995226, + "learning_rate": 1.9594206485581196e-05, + "loss": 0.557, + "step": 1493 + }, + { + "epoch": 0.11836007130124777, + "grad_norm": 2.056292590324996, + "learning_rate": 1.959348260328508e-05, + "loss": 0.4044, + "step": 1494 + }, + { + "epoch": 0.11843929490988314, + "grad_norm": 2.172687419632236, + "learning_rate": 1.95927580893043e-05, + "loss": 0.6337, + "step": 1495 + }, + { + "epoch": 0.11851851851851852, + "grad_norm": 2.114823681771656, + "learning_rate": 1.9592032943686554e-05, + "loss": 0.5385, + "step": 1496 + }, + { + "epoch": 0.11859774212715389, + "grad_norm": 2.3100892535716655, + "learning_rate": 1.9591307166479595e-05, + "loss": 0.4535, + "step": 1497 + }, + { + "epoch": 0.11867696573578927, + "grad_norm": 2.349652632487777, + "learning_rate": 1.959058075773121e-05, + "loss": 0.5193, + "step": 1498 + }, + { + "epoch": 0.11875618934442464, + "grad_norm": 1.8137312542239925, + "learning_rate": 1.9589853717489228e-05, + "loss": 0.4829, + "step": 1499 + }, + { + "epoch": 0.11883541295306001, + "grad_norm": 1.9286817746349163, + "learning_rate": 1.958912604580152e-05, + "loss": 0.4112, + "step": 1500 + }, + { + "epoch": 0.11891463656169539, + "grad_norm": 1.8759861059939869, + "learning_rate": 1.9588397742716004e-05, + "loss": 0.5042, + "step": 1501 + }, + { + "epoch": 0.11899386017033076, + "grad_norm": 2.023392474184696, + "learning_rate": 1.9587668808280632e-05, + "loss": 0.5244, + "step": 1502 + }, + { + "epoch": 0.11907308377896614, + "grad_norm": 2.338711609491881, + "learning_rate": 1.9586939242543402e-05, + "loss": 0.4811, + "step": 1503 + }, + { + "epoch": 0.1191523073876015, + "grad_norm": 2.2632293860196713, + "learning_rate": 1.9586209045552355e-05, + "loss": 0.48, + "step": 1504 + }, + { + "epoch": 0.11923153099623687, + "grad_norm": 2.0604617678476598, + "learning_rate": 1.9585478217355563e-05, + "loss": 0.4963, + "step": 1505 + }, + { + "epoch": 0.11931075460487225, + "grad_norm": 2.2328978445179337, + "learning_rate": 1.9584746758001156e-05, + "loss": 0.5735, + "step": 1506 + }, + { + "epoch": 0.11938997821350762, + "grad_norm": 2.1402047021513386, + "learning_rate": 1.9584014667537293e-05, + "loss": 0.5495, + "step": 1507 + }, + { + "epoch": 0.119469201822143, + "grad_norm": 2.275821810381484, + "learning_rate": 1.9583281946012183e-05, + "loss": 0.5686, + "step": 1508 + }, + { + "epoch": 0.11954842543077837, + "grad_norm": 2.205633271036908, + "learning_rate": 1.9582548593474064e-05, + "loss": 0.597, + "step": 1509 + }, + { + "epoch": 0.11962764903941374, + "grad_norm": 2.032498172198259, + "learning_rate": 1.9581814609971232e-05, + "loss": 0.3864, + "step": 1510 + }, + { + "epoch": 0.11970687264804912, + "grad_norm": 2.1809595038271077, + "learning_rate": 1.958107999555201e-05, + "loss": 0.4366, + "step": 1511 + }, + { + "epoch": 0.11978609625668449, + "grad_norm": 2.0966040877975693, + "learning_rate": 1.958034475026477e-05, + "loss": 0.4563, + "step": 1512 + }, + { + "epoch": 0.11986531986531987, + "grad_norm": 2.0053545068303653, + "learning_rate": 1.957960887415793e-05, + "loss": 0.4182, + "step": 1513 + }, + { + "epoch": 0.11994454347395524, + "grad_norm": 1.967509465069574, + "learning_rate": 1.9578872367279937e-05, + "loss": 0.4695, + "step": 1514 + }, + { + "epoch": 0.12002376708259062, + "grad_norm": 2.31215641518515, + "learning_rate": 1.957813522967929e-05, + "loss": 0.4742, + "step": 1515 + }, + { + "epoch": 0.12010299069122599, + "grad_norm": 1.6709110781226622, + "learning_rate": 1.9577397461404527e-05, + "loss": 0.3521, + "step": 1516 + }, + { + "epoch": 0.12018221429986135, + "grad_norm": 2.45691631192335, + "learning_rate": 1.957665906250422e-05, + "loss": 0.6144, + "step": 1517 + }, + { + "epoch": 0.12026143790849673, + "grad_norm": 1.7185842208518693, + "learning_rate": 1.9575920033027002e-05, + "loss": 0.4392, + "step": 1518 + }, + { + "epoch": 0.1203406615171321, + "grad_norm": 2.1658119142884322, + "learning_rate": 1.9575180373021516e-05, + "loss": 0.4573, + "step": 1519 + }, + { + "epoch": 0.12041988512576748, + "grad_norm": 2.01196096927822, + "learning_rate": 1.9574440082536482e-05, + "loss": 0.419, + "step": 1520 + }, + { + "epoch": 0.12049910873440285, + "grad_norm": 2.3063711390334407, + "learning_rate": 1.9573699161620635e-05, + "loss": 0.575, + "step": 1521 + }, + { + "epoch": 0.12057833234303822, + "grad_norm": 1.8541806380421149, + "learning_rate": 1.9572957610322766e-05, + "loss": 0.3356, + "step": 1522 + }, + { + "epoch": 0.1206575559516736, + "grad_norm": 2.224184920772965, + "learning_rate": 1.95722154286917e-05, + "loss": 0.4105, + "step": 1523 + }, + { + "epoch": 0.12073677956030897, + "grad_norm": 2.1852759463791425, + "learning_rate": 1.9571472616776304e-05, + "loss": 0.4712, + "step": 1524 + }, + { + "epoch": 0.12081600316894435, + "grad_norm": 2.2581373778888603, + "learning_rate": 1.9570729174625493e-05, + "loss": 0.4556, + "step": 1525 + }, + { + "epoch": 0.12089522677757972, + "grad_norm": 1.9576966899336323, + "learning_rate": 1.956998510228822e-05, + "loss": 0.4633, + "step": 1526 + }, + { + "epoch": 0.1209744503862151, + "grad_norm": 1.8843490040594182, + "learning_rate": 1.956924039981347e-05, + "loss": 0.5199, + "step": 1527 + }, + { + "epoch": 0.12105367399485047, + "grad_norm": 1.9876327531780724, + "learning_rate": 1.956849506725029e-05, + "loss": 0.4823, + "step": 1528 + }, + { + "epoch": 0.12113289760348583, + "grad_norm": 2.1313006638182825, + "learning_rate": 1.9567749104647746e-05, + "loss": 0.543, + "step": 1529 + }, + { + "epoch": 0.12121212121212122, + "grad_norm": 1.9858746753460998, + "learning_rate": 1.9567002512054964e-05, + "loss": 0.4229, + "step": 1530 + }, + { + "epoch": 0.12129134482075658, + "grad_norm": 2.048676644917086, + "learning_rate": 1.9566255289521096e-05, + "loss": 0.5403, + "step": 1531 + }, + { + "epoch": 0.12137056842939196, + "grad_norm": 1.862883832172104, + "learning_rate": 1.956550743709535e-05, + "loss": 0.3393, + "step": 1532 + }, + { + "epoch": 0.12144979203802733, + "grad_norm": 2.284319341639327, + "learning_rate": 1.9564758954826964e-05, + "loss": 0.4484, + "step": 1533 + }, + { + "epoch": 0.1215290156466627, + "grad_norm": 2.1742344707512498, + "learning_rate": 1.9564009842765225e-05, + "loss": 0.4469, + "step": 1534 + }, + { + "epoch": 0.12160823925529808, + "grad_norm": 2.417652281543239, + "learning_rate": 1.956326010095946e-05, + "loss": 0.552, + "step": 1535 + }, + { + "epoch": 0.12168746286393345, + "grad_norm": 2.0660780166131603, + "learning_rate": 1.9562509729459024e-05, + "loss": 0.4635, + "step": 1536 + }, + { + "epoch": 0.12176668647256883, + "grad_norm": 2.256056768108262, + "learning_rate": 1.956175872831334e-05, + "loss": 0.6252, + "step": 1537 + }, + { + "epoch": 0.1218459100812042, + "grad_norm": 2.1447276940724276, + "learning_rate": 1.9561007097571853e-05, + "loss": 0.4759, + "step": 1538 + }, + { + "epoch": 0.12192513368983957, + "grad_norm": 2.0856712082665125, + "learning_rate": 1.9560254837284053e-05, + "loss": 0.3735, + "step": 1539 + }, + { + "epoch": 0.12200435729847495, + "grad_norm": 2.0140312749191587, + "learning_rate": 1.955950194749947e-05, + "loss": 0.5297, + "step": 1540 + }, + { + "epoch": 0.12208358090711031, + "grad_norm": 2.1950721780582634, + "learning_rate": 1.9558748428267682e-05, + "loss": 0.4727, + "step": 1541 + }, + { + "epoch": 0.1221628045157457, + "grad_norm": 2.107878681326026, + "learning_rate": 1.9557994279638307e-05, + "loss": 0.4633, + "step": 1542 + }, + { + "epoch": 0.12224202812438106, + "grad_norm": 2.129355868492411, + "learning_rate": 1.9557239501660995e-05, + "loss": 0.4734, + "step": 1543 + }, + { + "epoch": 0.12232125173301645, + "grad_norm": 2.183122441993006, + "learning_rate": 1.955648409438545e-05, + "loss": 0.4621, + "step": 1544 + }, + { + "epoch": 0.12240047534165181, + "grad_norm": 2.469968994570861, + "learning_rate": 1.955572805786141e-05, + "loss": 0.5069, + "step": 1545 + }, + { + "epoch": 0.12247969895028718, + "grad_norm": 2.0663506941577903, + "learning_rate": 1.9554971392138655e-05, + "loss": 0.4811, + "step": 1546 + }, + { + "epoch": 0.12255892255892256, + "grad_norm": 2.040254687777327, + "learning_rate": 1.955421409726701e-05, + "loss": 0.5074, + "step": 1547 + }, + { + "epoch": 0.12263814616755793, + "grad_norm": 2.0151917059884146, + "learning_rate": 1.9553456173296342e-05, + "loss": 0.5792, + "step": 1548 + }, + { + "epoch": 0.12271736977619331, + "grad_norm": 2.342460576824375, + "learning_rate": 1.9552697620276547e-05, + "loss": 0.55, + "step": 1549 + }, + { + "epoch": 0.12279659338482868, + "grad_norm": 1.9577816400953072, + "learning_rate": 1.9551938438257583e-05, + "loss": 0.33, + "step": 1550 + }, + { + "epoch": 0.12287581699346405, + "grad_norm": 2.2596378182288372, + "learning_rate": 1.9551178627289436e-05, + "loss": 0.4823, + "step": 1551 + }, + { + "epoch": 0.12295504060209943, + "grad_norm": 1.9260888550406368, + "learning_rate": 1.9550418187422127e-05, + "loss": 0.3066, + "step": 1552 + }, + { + "epoch": 0.1230342642107348, + "grad_norm": 1.9401354769111385, + "learning_rate": 1.954965711870574e-05, + "loss": 0.4799, + "step": 1553 + }, + { + "epoch": 0.12311348781937018, + "grad_norm": 2.435901679159972, + "learning_rate": 1.954889542119038e-05, + "loss": 0.5224, + "step": 1554 + }, + { + "epoch": 0.12319271142800554, + "grad_norm": 2.109284057685913, + "learning_rate": 1.9548133094926203e-05, + "loss": 0.4227, + "step": 1555 + }, + { + "epoch": 0.12327193503664093, + "grad_norm": 2.264513968471351, + "learning_rate": 1.9547370139963406e-05, + "loss": 0.5184, + "step": 1556 + }, + { + "epoch": 0.1233511586452763, + "grad_norm": 1.9963833903416612, + "learning_rate": 1.954660655635222e-05, + "loss": 0.493, + "step": 1557 + }, + { + "epoch": 0.12343038225391166, + "grad_norm": 2.3394609325354683, + "learning_rate": 1.954584234414293e-05, + "loss": 0.5902, + "step": 1558 + }, + { + "epoch": 0.12350960586254704, + "grad_norm": 2.1173850208042855, + "learning_rate": 1.954507750338585e-05, + "loss": 0.4574, + "step": 1559 + }, + { + "epoch": 0.12358882947118241, + "grad_norm": 2.784984556302951, + "learning_rate": 1.954431203413135e-05, + "loss": 0.5032, + "step": 1560 + }, + { + "epoch": 0.12366805307981779, + "grad_norm": 2.4832464418214664, + "learning_rate": 1.9543545936429824e-05, + "loss": 0.5774, + "step": 1561 + }, + { + "epoch": 0.12374727668845316, + "grad_norm": 2.249905926437713, + "learning_rate": 1.954277921033172e-05, + "loss": 0.6038, + "step": 1562 + }, + { + "epoch": 0.12382650029708853, + "grad_norm": 2.270271926697608, + "learning_rate": 1.954201185588752e-05, + "loss": 0.5766, + "step": 1563 + }, + { + "epoch": 0.12390572390572391, + "grad_norm": 2.1328028405772845, + "learning_rate": 1.9541243873147752e-05, + "loss": 0.426, + "step": 1564 + }, + { + "epoch": 0.12398494751435928, + "grad_norm": 2.5046424985676885, + "learning_rate": 1.9540475262162988e-05, + "loss": 0.6653, + "step": 1565 + }, + { + "epoch": 0.12406417112299466, + "grad_norm": 2.1343082635875197, + "learning_rate": 1.9539706022983827e-05, + "loss": 0.4462, + "step": 1566 + }, + { + "epoch": 0.12414339473163002, + "grad_norm": 2.2156024292847767, + "learning_rate": 1.9538936155660934e-05, + "loss": 0.5407, + "step": 1567 + }, + { + "epoch": 0.12422261834026539, + "grad_norm": 2.579028877772703, + "learning_rate": 1.953816566024499e-05, + "loss": 0.6548, + "step": 1568 + }, + { + "epoch": 0.12430184194890077, + "grad_norm": 2.0779856118130016, + "learning_rate": 1.9537394536786734e-05, + "loss": 0.4593, + "step": 1569 + }, + { + "epoch": 0.12438106555753614, + "grad_norm": 2.31625464332722, + "learning_rate": 1.9536622785336936e-05, + "loss": 0.5234, + "step": 1570 + }, + { + "epoch": 0.12446028916617152, + "grad_norm": 2.0622730352018035, + "learning_rate": 1.953585040594642e-05, + "loss": 0.3372, + "step": 1571 + }, + { + "epoch": 0.12453951277480689, + "grad_norm": 2.4841316876648607, + "learning_rate": 1.9535077398666034e-05, + "loss": 0.4423, + "step": 1572 + }, + { + "epoch": 0.12461873638344227, + "grad_norm": 1.9787853395709356, + "learning_rate": 1.953430376354668e-05, + "loss": 0.3854, + "step": 1573 + }, + { + "epoch": 0.12469795999207764, + "grad_norm": 2.525922564771538, + "learning_rate": 1.9533529500639302e-05, + "loss": 0.5425, + "step": 1574 + }, + { + "epoch": 0.12477718360071301, + "grad_norm": 1.93523597768252, + "learning_rate": 1.9532754609994878e-05, + "loss": 0.3317, + "step": 1575 + }, + { + "epoch": 0.12485640720934839, + "grad_norm": 2.0219349869300443, + "learning_rate": 1.953197909166443e-05, + "loss": 0.4876, + "step": 1576 + }, + { + "epoch": 0.12493563081798376, + "grad_norm": 1.78265919950567, + "learning_rate": 1.9531202945699027e-05, + "loss": 0.3151, + "step": 1577 + }, + { + "epoch": 0.12501485442661914, + "grad_norm": 2.5949093041009017, + "learning_rate": 1.953042617214977e-05, + "loss": 0.4207, + "step": 1578 + }, + { + "epoch": 0.12509407803525452, + "grad_norm": 2.3174313296879925, + "learning_rate": 1.9529648771067805e-05, + "loss": 0.4594, + "step": 1579 + }, + { + "epoch": 0.12517330164388987, + "grad_norm": 2.1455311092567535, + "learning_rate": 1.9528870742504328e-05, + "loss": 0.447, + "step": 1580 + }, + { + "epoch": 0.12525252525252525, + "grad_norm": 2.1268622439915683, + "learning_rate": 1.9528092086510556e-05, + "loss": 0.5086, + "step": 1581 + }, + { + "epoch": 0.12533174886116064, + "grad_norm": 2.2348117985011973, + "learning_rate": 1.9527312803137767e-05, + "loss": 0.4691, + "step": 1582 + }, + { + "epoch": 0.125410972469796, + "grad_norm": 2.6368223200522363, + "learning_rate": 1.9526532892437275e-05, + "loss": 0.6259, + "step": 1583 + }, + { + "epoch": 0.12549019607843137, + "grad_norm": 2.284528147692858, + "learning_rate": 1.9525752354460433e-05, + "loss": 0.5743, + "step": 1584 + }, + { + "epoch": 0.12556941968706675, + "grad_norm": 2.2208798173703976, + "learning_rate": 1.9524971189258627e-05, + "loss": 0.5342, + "step": 1585 + }, + { + "epoch": 0.1256486432957021, + "grad_norm": 2.2671382435804603, + "learning_rate": 1.9524189396883307e-05, + "loss": 0.4803, + "step": 1586 + }, + { + "epoch": 0.1257278669043375, + "grad_norm": 1.9844528087045685, + "learning_rate": 1.9523406977385937e-05, + "loss": 0.5166, + "step": 1587 + }, + { + "epoch": 0.12580709051297287, + "grad_norm": 1.7632181098371553, + "learning_rate": 1.9522623930818043e-05, + "loss": 0.4377, + "step": 1588 + }, + { + "epoch": 0.12588631412160825, + "grad_norm": 2.283692140176902, + "learning_rate": 1.9521840257231183e-05, + "loss": 0.4947, + "step": 1589 + }, + { + "epoch": 0.1259655377302436, + "grad_norm": 2.3164307111854368, + "learning_rate": 1.9521055956676956e-05, + "loss": 0.4708, + "step": 1590 + }, + { + "epoch": 0.12604476133887899, + "grad_norm": 1.7116145701711083, + "learning_rate": 1.9520271029207008e-05, + "loss": 0.4086, + "step": 1591 + }, + { + "epoch": 0.12612398494751437, + "grad_norm": 1.9182914083948066, + "learning_rate": 1.9519485474873027e-05, + "loss": 0.4887, + "step": 1592 + }, + { + "epoch": 0.12620320855614972, + "grad_norm": 2.023898036529861, + "learning_rate": 1.9518699293726727e-05, + "loss": 0.4496, + "step": 1593 + }, + { + "epoch": 0.1262824321647851, + "grad_norm": 1.7625947344225925, + "learning_rate": 1.9517912485819878e-05, + "loss": 0.3721, + "step": 1594 + }, + { + "epoch": 0.12636165577342048, + "grad_norm": 2.208037197461247, + "learning_rate": 1.9517125051204292e-05, + "loss": 0.4905, + "step": 1595 + }, + { + "epoch": 0.12644087938205587, + "grad_norm": 2.2507473829483047, + "learning_rate": 1.9516336989931813e-05, + "loss": 0.5883, + "step": 1596 + }, + { + "epoch": 0.12652010299069122, + "grad_norm": 2.0932088228907055, + "learning_rate": 1.9515548302054335e-05, + "loss": 0.5015, + "step": 1597 + }, + { + "epoch": 0.1265993265993266, + "grad_norm": 1.787315191367274, + "learning_rate": 1.9514758987623784e-05, + "loss": 0.3419, + "step": 1598 + }, + { + "epoch": 0.12667855020796198, + "grad_norm": 1.9059191388049546, + "learning_rate": 1.9513969046692137e-05, + "loss": 0.4962, + "step": 1599 + }, + { + "epoch": 0.12675777381659734, + "grad_norm": 2.0120350854369553, + "learning_rate": 1.951317847931141e-05, + "loss": 0.4746, + "step": 1600 + }, + { + "epoch": 0.12683699742523272, + "grad_norm": 2.002509790584961, + "learning_rate": 1.9512387285533655e-05, + "loss": 0.439, + "step": 1601 + }, + { + "epoch": 0.1269162210338681, + "grad_norm": 2.334436469329716, + "learning_rate": 1.951159546541096e-05, + "loss": 0.6096, + "step": 1602 + }, + { + "epoch": 0.12699544464250345, + "grad_norm": 1.859265258937115, + "learning_rate": 1.9510803018995477e-05, + "loss": 0.377, + "step": 1603 + }, + { + "epoch": 0.12707466825113883, + "grad_norm": 1.900511346149793, + "learning_rate": 1.9510009946339377e-05, + "loss": 0.5022, + "step": 1604 + }, + { + "epoch": 0.12715389185977422, + "grad_norm": 2.050205491252545, + "learning_rate": 1.9509216247494882e-05, + "loss": 0.4027, + "step": 1605 + }, + { + "epoch": 0.1272331154684096, + "grad_norm": 2.197658630941255, + "learning_rate": 1.950842192251425e-05, + "loss": 0.5213, + "step": 1606 + }, + { + "epoch": 0.12731233907704495, + "grad_norm": 1.832814046622772, + "learning_rate": 1.950762697144979e-05, + "loss": 0.381, + "step": 1607 + }, + { + "epoch": 0.12739156268568033, + "grad_norm": 1.897873626707906, + "learning_rate": 1.950683139435384e-05, + "loss": 0.3199, + "step": 1608 + }, + { + "epoch": 0.1274707862943157, + "grad_norm": 2.3630339918533414, + "learning_rate": 1.9506035191278784e-05, + "loss": 0.561, + "step": 1609 + }, + { + "epoch": 0.12755000990295107, + "grad_norm": 2.056307252757091, + "learning_rate": 1.9505238362277054e-05, + "loss": 0.4708, + "step": 1610 + }, + { + "epoch": 0.12762923351158645, + "grad_norm": 1.8252485722983463, + "learning_rate": 1.9504440907401113e-05, + "loss": 0.3927, + "step": 1611 + }, + { + "epoch": 0.12770845712022183, + "grad_norm": 1.9059585629214528, + "learning_rate": 1.9503642826703468e-05, + "loss": 0.395, + "step": 1612 + }, + { + "epoch": 0.1277876807288572, + "grad_norm": 1.9259285645711433, + "learning_rate": 1.950284412023668e-05, + "loss": 0.4115, + "step": 1613 + }, + { + "epoch": 0.12786690433749257, + "grad_norm": 2.3218748090406005, + "learning_rate": 1.9502044788053322e-05, + "loss": 0.5312, + "step": 1614 + }, + { + "epoch": 0.12794612794612795, + "grad_norm": 1.8636654565152415, + "learning_rate": 1.9501244830206037e-05, + "loss": 0.4334, + "step": 1615 + }, + { + "epoch": 0.12802535155476333, + "grad_norm": 2.2423125778198454, + "learning_rate": 1.9500444246747502e-05, + "loss": 0.4985, + "step": 1616 + }, + { + "epoch": 0.12810457516339868, + "grad_norm": 2.2631406096595867, + "learning_rate": 1.9499643037730422e-05, + "loss": 0.4807, + "step": 1617 + }, + { + "epoch": 0.12818379877203406, + "grad_norm": 2.009836839593306, + "learning_rate": 1.949884120320756e-05, + "loss": 0.4357, + "step": 1618 + }, + { + "epoch": 0.12826302238066944, + "grad_norm": 2.0614390156081823, + "learning_rate": 1.949803874323171e-05, + "loss": 0.4016, + "step": 1619 + }, + { + "epoch": 0.12834224598930483, + "grad_norm": 2.4047935362273134, + "learning_rate": 1.949723565785571e-05, + "loss": 0.5006, + "step": 1620 + }, + { + "epoch": 0.12842146959794018, + "grad_norm": 2.340608266658904, + "learning_rate": 1.9496431947132438e-05, + "loss": 0.5462, + "step": 1621 + }, + { + "epoch": 0.12850069320657556, + "grad_norm": 1.902774466757955, + "learning_rate": 1.9495627611114817e-05, + "loss": 0.3877, + "step": 1622 + }, + { + "epoch": 0.12857991681521094, + "grad_norm": 2.6526359364203653, + "learning_rate": 1.949482264985581e-05, + "loss": 0.5816, + "step": 1623 + }, + { + "epoch": 0.1286591404238463, + "grad_norm": 2.224691379426264, + "learning_rate": 1.9494017063408415e-05, + "loss": 0.5562, + "step": 1624 + }, + { + "epoch": 0.12873836403248168, + "grad_norm": 2.252047688919355, + "learning_rate": 1.9493210851825682e-05, + "loss": 0.4172, + "step": 1625 + }, + { + "epoch": 0.12881758764111706, + "grad_norm": 2.020222266558436, + "learning_rate": 1.949240401516069e-05, + "loss": 0.4992, + "step": 1626 + }, + { + "epoch": 0.1288968112497524, + "grad_norm": 2.087228576009431, + "learning_rate": 1.9491596553466568e-05, + "loss": 0.383, + "step": 1627 + }, + { + "epoch": 0.1289760348583878, + "grad_norm": 1.9397511634762696, + "learning_rate": 1.9490788466796483e-05, + "loss": 0.4283, + "step": 1628 + }, + { + "epoch": 0.12905525846702318, + "grad_norm": 2.134617262902257, + "learning_rate": 1.9489979755203646e-05, + "loss": 0.3847, + "step": 1629 + }, + { + "epoch": 0.12913448207565856, + "grad_norm": 2.2783258272898244, + "learning_rate": 1.9489170418741306e-05, + "loss": 0.5288, + "step": 1630 + }, + { + "epoch": 0.1292137056842939, + "grad_norm": 2.627729679675159, + "learning_rate": 1.948836045746275e-05, + "loss": 0.4422, + "step": 1631 + }, + { + "epoch": 0.1292929292929293, + "grad_norm": 2.0808924413119847, + "learning_rate": 1.9487549871421316e-05, + "loss": 0.5224, + "step": 1632 + }, + { + "epoch": 0.12937215290156467, + "grad_norm": 2.734266585355052, + "learning_rate": 1.9486738660670373e-05, + "loss": 0.5707, + "step": 1633 + }, + { + "epoch": 0.12945137651020003, + "grad_norm": 1.9749476541489894, + "learning_rate": 1.9485926825263334e-05, + "loss": 0.4319, + "step": 1634 + }, + { + "epoch": 0.1295306001188354, + "grad_norm": 2.1730236224927646, + "learning_rate": 1.948511436525366e-05, + "loss": 0.5097, + "step": 1635 + }, + { + "epoch": 0.1296098237274708, + "grad_norm": 1.9952733290539528, + "learning_rate": 1.9484301280694845e-05, + "loss": 0.3531, + "step": 1636 + }, + { + "epoch": 0.12968904733610617, + "grad_norm": 2.142154481154024, + "learning_rate": 1.9483487571640424e-05, + "loss": 0.5847, + "step": 1637 + }, + { + "epoch": 0.12976827094474153, + "grad_norm": 2.098423162700416, + "learning_rate": 1.948267323814398e-05, + "loss": 0.3985, + "step": 1638 + }, + { + "epoch": 0.1298474945533769, + "grad_norm": 1.8687273876495234, + "learning_rate": 1.948185828025913e-05, + "loss": 0.3977, + "step": 1639 + }, + { + "epoch": 0.1299267181620123, + "grad_norm": 1.966885840639226, + "learning_rate": 1.9481042698039534e-05, + "loss": 0.4246, + "step": 1640 + }, + { + "epoch": 0.13000594177064764, + "grad_norm": 1.772690548251816, + "learning_rate": 1.94802264915389e-05, + "loss": 0.3666, + "step": 1641 + }, + { + "epoch": 0.13008516537928302, + "grad_norm": 2.284571102529604, + "learning_rate": 1.9479409660810965e-05, + "loss": 0.4862, + "step": 1642 + }, + { + "epoch": 0.1301643889879184, + "grad_norm": 1.79577134966779, + "learning_rate": 1.9478592205909517e-05, + "loss": 0.4195, + "step": 1643 + }, + { + "epoch": 0.13024361259655376, + "grad_norm": 1.9873410139883068, + "learning_rate": 1.947777412688838e-05, + "loss": 0.4572, + "step": 1644 + }, + { + "epoch": 0.13032283620518914, + "grad_norm": 2.0112424630111896, + "learning_rate": 1.947695542380142e-05, + "loss": 0.4602, + "step": 1645 + }, + { + "epoch": 0.13040205981382452, + "grad_norm": 2.2439224519208976, + "learning_rate": 1.9476136096702546e-05, + "loss": 0.4055, + "step": 1646 + }, + { + "epoch": 0.1304812834224599, + "grad_norm": 2.08499264584321, + "learning_rate": 1.9475316145645706e-05, + "loss": 0.4869, + "step": 1647 + }, + { + "epoch": 0.13056050703109526, + "grad_norm": 1.866420939233387, + "learning_rate": 1.947449557068489e-05, + "loss": 0.398, + "step": 1648 + }, + { + "epoch": 0.13063973063973064, + "grad_norm": 1.9962876757412493, + "learning_rate": 1.947367437187413e-05, + "loss": 0.5062, + "step": 1649 + }, + { + "epoch": 0.13071895424836602, + "grad_norm": 2.4789946542886434, + "learning_rate": 1.9472852549267496e-05, + "loss": 0.5333, + "step": 1650 + }, + { + "epoch": 0.13079817785700137, + "grad_norm": 2.3722255312534912, + "learning_rate": 1.9472030102919102e-05, + "loss": 0.5509, + "step": 1651 + }, + { + "epoch": 0.13087740146563676, + "grad_norm": 1.785553283135544, + "learning_rate": 1.9471207032883103e-05, + "loss": 0.4859, + "step": 1652 + }, + { + "epoch": 0.13095662507427214, + "grad_norm": 1.7755155125517021, + "learning_rate": 1.9470383339213693e-05, + "loss": 0.5293, + "step": 1653 + }, + { + "epoch": 0.13103584868290752, + "grad_norm": 1.9955128080505007, + "learning_rate": 1.946955902196511e-05, + "loss": 0.4461, + "step": 1654 + }, + { + "epoch": 0.13111507229154287, + "grad_norm": 1.8357218407037454, + "learning_rate": 1.9468734081191627e-05, + "loss": 0.4582, + "step": 1655 + }, + { + "epoch": 0.13119429590017825, + "grad_norm": 2.1395018904148255, + "learning_rate": 1.9467908516947568e-05, + "loss": 0.5928, + "step": 1656 + }, + { + "epoch": 0.13127351950881364, + "grad_norm": 2.3972933487425863, + "learning_rate": 1.946708232928729e-05, + "loss": 0.529, + "step": 1657 + }, + { + "epoch": 0.131352743117449, + "grad_norm": 1.9151294624797697, + "learning_rate": 1.9466255518265193e-05, + "loss": 0.4837, + "step": 1658 + }, + { + "epoch": 0.13143196672608437, + "grad_norm": 2.449250233769008, + "learning_rate": 1.946542808393572e-05, + "loss": 0.5345, + "step": 1659 + }, + { + "epoch": 0.13151119033471975, + "grad_norm": 1.8208164851237159, + "learning_rate": 1.946460002635335e-05, + "loss": 0.4359, + "step": 1660 + }, + { + "epoch": 0.13159041394335513, + "grad_norm": 1.8595550911542422, + "learning_rate": 1.946377134557261e-05, + "loss": 0.5135, + "step": 1661 + }, + { + "epoch": 0.1316696375519905, + "grad_norm": 1.8927095736270318, + "learning_rate": 1.9462942041648062e-05, + "loss": 0.3981, + "step": 1662 + }, + { + "epoch": 0.13174886116062587, + "grad_norm": 1.9692179918122763, + "learning_rate": 1.9462112114634316e-05, + "loss": 0.4599, + "step": 1663 + }, + { + "epoch": 0.13182808476926125, + "grad_norm": 1.8405268038449931, + "learning_rate": 1.9461281564586014e-05, + "loss": 0.3978, + "step": 1664 + }, + { + "epoch": 0.1319073083778966, + "grad_norm": 1.8747009013460822, + "learning_rate": 1.9460450391557847e-05, + "loss": 0.4791, + "step": 1665 + }, + { + "epoch": 0.13198653198653199, + "grad_norm": 1.8437471781988803, + "learning_rate": 1.945961859560454e-05, + "loss": 0.4044, + "step": 1666 + }, + { + "epoch": 0.13206575559516737, + "grad_norm": 2.343887922901139, + "learning_rate": 1.9458786176780868e-05, + "loss": 0.479, + "step": 1667 + }, + { + "epoch": 0.13214497920380272, + "grad_norm": 2.1212067978883047, + "learning_rate": 1.945795313514164e-05, + "loss": 0.4229, + "step": 1668 + }, + { + "epoch": 0.1322242028124381, + "grad_norm": 2.1224087212384464, + "learning_rate": 1.9457119470741707e-05, + "loss": 0.5046, + "step": 1669 + }, + { + "epoch": 0.13230342642107348, + "grad_norm": 2.2257522080975014, + "learning_rate": 1.9456285183635958e-05, + "loss": 0.5205, + "step": 1670 + }, + { + "epoch": 0.13238265002970886, + "grad_norm": 2.1916722752343043, + "learning_rate": 1.9455450273879332e-05, + "loss": 0.378, + "step": 1671 + }, + { + "epoch": 0.13246187363834422, + "grad_norm": 2.253134935717543, + "learning_rate": 1.94546147415268e-05, + "loss": 0.4761, + "step": 1672 + }, + { + "epoch": 0.1325410972469796, + "grad_norm": 1.9484350906708843, + "learning_rate": 1.9453778586633386e-05, + "loss": 0.5103, + "step": 1673 + }, + { + "epoch": 0.13262032085561498, + "grad_norm": 2.034277497863812, + "learning_rate": 1.9452941809254136e-05, + "loss": 0.5718, + "step": 1674 + }, + { + "epoch": 0.13269954446425034, + "grad_norm": 2.1139733755876446, + "learning_rate": 1.9452104409444153e-05, + "loss": 0.4832, + "step": 1675 + }, + { + "epoch": 0.13277876807288572, + "grad_norm": 2.345538241691021, + "learning_rate": 1.9451266387258576e-05, + "loss": 0.4459, + "step": 1676 + }, + { + "epoch": 0.1328579916815211, + "grad_norm": 2.374230006452594, + "learning_rate": 1.9450427742752583e-05, + "loss": 0.4718, + "step": 1677 + }, + { + "epoch": 0.13293721529015648, + "grad_norm": 1.8458156884277443, + "learning_rate": 1.9449588475981394e-05, + "loss": 0.4513, + "step": 1678 + }, + { + "epoch": 0.13301643889879183, + "grad_norm": 1.8872002622842183, + "learning_rate": 1.9448748587000277e-05, + "loss": 0.4412, + "step": 1679 + }, + { + "epoch": 0.13309566250742721, + "grad_norm": 2.225454113668978, + "learning_rate": 1.944790807586453e-05, + "loss": 0.4744, + "step": 1680 + }, + { + "epoch": 0.1331748861160626, + "grad_norm": 1.4787705331846077, + "learning_rate": 1.9447066942629495e-05, + "loss": 0.3266, + "step": 1681 + }, + { + "epoch": 0.13325410972469795, + "grad_norm": 2.104026365802812, + "learning_rate": 1.9446225187350558e-05, + "loss": 0.5449, + "step": 1682 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 2.00456098540276, + "learning_rate": 1.9445382810083143e-05, + "loss": 0.4949, + "step": 1683 + }, + { + "epoch": 0.1334125569419687, + "grad_norm": 1.7312061856591376, + "learning_rate": 1.944453981088272e-05, + "loss": 0.3797, + "step": 1684 + }, + { + "epoch": 0.13349178055060407, + "grad_norm": 1.7597842704681006, + "learning_rate": 1.9443696189804793e-05, + "loss": 0.4461, + "step": 1685 + }, + { + "epoch": 0.13357100415923945, + "grad_norm": 1.7799676338091903, + "learning_rate": 1.9442851946904914e-05, + "loss": 0.4498, + "step": 1686 + }, + { + "epoch": 0.13365022776787483, + "grad_norm": 2.1520571804664677, + "learning_rate": 1.9442007082238673e-05, + "loss": 0.3885, + "step": 1687 + }, + { + "epoch": 0.1337294513765102, + "grad_norm": 1.882594705224177, + "learning_rate": 1.944116159586169e-05, + "loss": 0.3983, + "step": 1688 + }, + { + "epoch": 0.13380867498514556, + "grad_norm": 1.8917192419641224, + "learning_rate": 1.944031548782965e-05, + "loss": 0.4186, + "step": 1689 + }, + { + "epoch": 0.13388789859378095, + "grad_norm": 1.960842937343515, + "learning_rate": 1.9439468758198258e-05, + "loss": 0.3934, + "step": 1690 + }, + { + "epoch": 0.13396712220241633, + "grad_norm": 2.0689676580915037, + "learning_rate": 1.943862140702327e-05, + "loss": 0.4452, + "step": 1691 + }, + { + "epoch": 0.13404634581105168, + "grad_norm": 2.0448959404311684, + "learning_rate": 1.9437773434360476e-05, + "loss": 0.4711, + "step": 1692 + }, + { + "epoch": 0.13412556941968706, + "grad_norm": 2.2481434596155467, + "learning_rate": 1.943692484026571e-05, + "loss": 0.4853, + "step": 1693 + }, + { + "epoch": 0.13420479302832244, + "grad_norm": 2.023476681537764, + "learning_rate": 1.9436075624794853e-05, + "loss": 0.4078, + "step": 1694 + }, + { + "epoch": 0.13428401663695783, + "grad_norm": 2.479704968592698, + "learning_rate": 1.9435225788003822e-05, + "loss": 0.4518, + "step": 1695 + }, + { + "epoch": 0.13436324024559318, + "grad_norm": 2.5161634152042636, + "learning_rate": 1.943437532994857e-05, + "loss": 0.5347, + "step": 1696 + }, + { + "epoch": 0.13444246385422856, + "grad_norm": 2.4083826215696504, + "learning_rate": 1.9433524250685098e-05, + "loss": 0.5727, + "step": 1697 + }, + { + "epoch": 0.13452168746286394, + "grad_norm": 1.7881820050007933, + "learning_rate": 1.9432672550269446e-05, + "loss": 0.3996, + "step": 1698 + }, + { + "epoch": 0.1346009110714993, + "grad_norm": 1.7457137282820732, + "learning_rate": 1.943182022875769e-05, + "loss": 0.4058, + "step": 1699 + }, + { + "epoch": 0.13468013468013468, + "grad_norm": 1.8356547487301496, + "learning_rate": 1.9430967286205962e-05, + "loss": 0.4334, + "step": 1700 + }, + { + "epoch": 0.13475935828877006, + "grad_norm": 1.7984240074881184, + "learning_rate": 1.9430113722670412e-05, + "loss": 0.4133, + "step": 1701 + }, + { + "epoch": 0.13483858189740544, + "grad_norm": 1.8746421845702204, + "learning_rate": 1.942925953820725e-05, + "loss": 0.4348, + "step": 1702 + }, + { + "epoch": 0.1349178055060408, + "grad_norm": 1.9845443254777095, + "learning_rate": 1.9428404732872716e-05, + "loss": 0.3993, + "step": 1703 + }, + { + "epoch": 0.13499702911467618, + "grad_norm": 2.4156276039784244, + "learning_rate": 1.94275493067231e-05, + "loss": 0.6108, + "step": 1704 + }, + { + "epoch": 0.13507625272331156, + "grad_norm": 1.809568454283512, + "learning_rate": 1.9426693259814725e-05, + "loss": 0.4645, + "step": 1705 + }, + { + "epoch": 0.1351554763319469, + "grad_norm": 1.8675048103801162, + "learning_rate": 1.9425836592203954e-05, + "loss": 0.3901, + "step": 1706 + }, + { + "epoch": 0.1352346999405823, + "grad_norm": 2.32064376753328, + "learning_rate": 1.94249793039472e-05, + "loss": 0.5329, + "step": 1707 + }, + { + "epoch": 0.13531392354921767, + "grad_norm": 1.7546194849759331, + "learning_rate": 1.9424121395100907e-05, + "loss": 0.4295, + "step": 1708 + }, + { + "epoch": 0.13539314715785303, + "grad_norm": 2.0775656344676037, + "learning_rate": 1.9423262865721567e-05, + "loss": 0.444, + "step": 1709 + }, + { + "epoch": 0.1354723707664884, + "grad_norm": 1.9905151480595324, + "learning_rate": 1.9422403715865708e-05, + "loss": 0.5129, + "step": 1710 + }, + { + "epoch": 0.1355515943751238, + "grad_norm": 1.8947681729403973, + "learning_rate": 1.9421543945589904e-05, + "loss": 0.4244, + "step": 1711 + }, + { + "epoch": 0.13563081798375917, + "grad_norm": 1.8364447732380378, + "learning_rate": 1.9420683554950765e-05, + "loss": 0.4196, + "step": 1712 + }, + { + "epoch": 0.13571004159239453, + "grad_norm": 2.497857229275654, + "learning_rate": 1.9419822544004942e-05, + "loss": 0.5594, + "step": 1713 + }, + { + "epoch": 0.1357892652010299, + "grad_norm": 2.0029454551093964, + "learning_rate": 1.941896091280913e-05, + "loss": 0.5197, + "step": 1714 + }, + { + "epoch": 0.1358684888096653, + "grad_norm": 2.408637352901282, + "learning_rate": 1.9418098661420064e-05, + "loss": 0.4725, + "step": 1715 + }, + { + "epoch": 0.13594771241830064, + "grad_norm": 2.1314088052580527, + "learning_rate": 1.9417235789894517e-05, + "loss": 0.6064, + "step": 1716 + }, + { + "epoch": 0.13602693602693602, + "grad_norm": 2.323294655439459, + "learning_rate": 1.9416372298289306e-05, + "loss": 0.4667, + "step": 1717 + }, + { + "epoch": 0.1361061596355714, + "grad_norm": 2.0042268492920785, + "learning_rate": 1.941550818666129e-05, + "loss": 0.3919, + "step": 1718 + }, + { + "epoch": 0.1361853832442068, + "grad_norm": 2.405695532988341, + "learning_rate": 1.941464345506736e-05, + "loss": 0.5169, + "step": 1719 + }, + { + "epoch": 0.13626460685284214, + "grad_norm": 1.7786793195492034, + "learning_rate": 1.9413778103564462e-05, + "loss": 0.5789, + "step": 1720 + }, + { + "epoch": 0.13634383046147752, + "grad_norm": 1.9578453884853981, + "learning_rate": 1.9412912132209573e-05, + "loss": 0.4823, + "step": 1721 + }, + { + "epoch": 0.1364230540701129, + "grad_norm": 2.1062554513907252, + "learning_rate": 1.941204554105971e-05, + "loss": 0.4481, + "step": 1722 + }, + { + "epoch": 0.13650227767874826, + "grad_norm": 1.786579999483879, + "learning_rate": 1.941117833017194e-05, + "loss": 0.4559, + "step": 1723 + }, + { + "epoch": 0.13658150128738364, + "grad_norm": 1.8123789371957015, + "learning_rate": 1.9410310499603356e-05, + "loss": 0.5092, + "step": 1724 + }, + { + "epoch": 0.13666072489601902, + "grad_norm": 1.8015305568837592, + "learning_rate": 1.9409442049411104e-05, + "loss": 0.3541, + "step": 1725 + }, + { + "epoch": 0.13673994850465437, + "grad_norm": 2.2860736215597877, + "learning_rate": 1.9408572979652373e-05, + "loss": 0.5113, + "step": 1726 + }, + { + "epoch": 0.13681917211328976, + "grad_norm": 2.069428158328441, + "learning_rate": 1.940770329038438e-05, + "loss": 0.4118, + "step": 1727 + }, + { + "epoch": 0.13689839572192514, + "grad_norm": 2.0410027061518705, + "learning_rate": 1.9406832981664392e-05, + "loss": 0.3555, + "step": 1728 + }, + { + "epoch": 0.13697761933056052, + "grad_norm": 1.9822005259271056, + "learning_rate": 1.9405962053549717e-05, + "loss": 0.5067, + "step": 1729 + }, + { + "epoch": 0.13705684293919587, + "grad_norm": 2.313602099290828, + "learning_rate": 1.9405090506097698e-05, + "loss": 0.4683, + "step": 1730 + }, + { + "epoch": 0.13713606654783125, + "grad_norm": 2.209547323402575, + "learning_rate": 1.9404218339365724e-05, + "loss": 0.3983, + "step": 1731 + }, + { + "epoch": 0.13721529015646663, + "grad_norm": 2.622615156077461, + "learning_rate": 1.940334555341122e-05, + "loss": 0.4484, + "step": 1732 + }, + { + "epoch": 0.137294513765102, + "grad_norm": 1.981892743092382, + "learning_rate": 1.940247214829166e-05, + "loss": 0.393, + "step": 1733 + }, + { + "epoch": 0.13737373737373737, + "grad_norm": 2.1220095542393382, + "learning_rate": 1.9401598124064552e-05, + "loss": 0.4899, + "step": 1734 + }, + { + "epoch": 0.13745296098237275, + "grad_norm": 1.7769057895679274, + "learning_rate": 1.9400723480787446e-05, + "loss": 0.4501, + "step": 1735 + }, + { + "epoch": 0.13753218459100813, + "grad_norm": 2.2016975897855655, + "learning_rate": 1.9399848218517927e-05, + "loss": 0.4238, + "step": 1736 + }, + { + "epoch": 0.1376114081996435, + "grad_norm": 1.6178729306084192, + "learning_rate": 1.9398972337313634e-05, + "loss": 0.3724, + "step": 1737 + }, + { + "epoch": 0.13769063180827887, + "grad_norm": 1.8502194092890032, + "learning_rate": 1.939809583723224e-05, + "loss": 0.3597, + "step": 1738 + }, + { + "epoch": 0.13776985541691425, + "grad_norm": 2.158042318325267, + "learning_rate": 1.9397218718331455e-05, + "loss": 0.6125, + "step": 1739 + }, + { + "epoch": 0.1378490790255496, + "grad_norm": 1.9061422703867463, + "learning_rate": 1.939634098066903e-05, + "loss": 0.5138, + "step": 1740 + }, + { + "epoch": 0.13792830263418498, + "grad_norm": 1.9098883872894814, + "learning_rate": 1.9395462624302768e-05, + "loss": 0.4624, + "step": 1741 + }, + { + "epoch": 0.13800752624282037, + "grad_norm": 1.7123011474279195, + "learning_rate": 1.93945836492905e-05, + "loss": 0.4588, + "step": 1742 + }, + { + "epoch": 0.13808674985145572, + "grad_norm": 2.0568765939877585, + "learning_rate": 1.93937040556901e-05, + "loss": 0.6117, + "step": 1743 + }, + { + "epoch": 0.1381659734600911, + "grad_norm": 2.861098225849574, + "learning_rate": 1.939282384355949e-05, + "loss": 0.5326, + "step": 1744 + }, + { + "epoch": 0.13824519706872648, + "grad_norm": 1.8175794975317248, + "learning_rate": 1.9391943012956623e-05, + "loss": 0.4212, + "step": 1745 + }, + { + "epoch": 0.13832442067736186, + "grad_norm": 1.838516328275049, + "learning_rate": 1.93910615639395e-05, + "loss": 0.4089, + "step": 1746 + }, + { + "epoch": 0.13840364428599722, + "grad_norm": 2.1572337500925274, + "learning_rate": 1.9390179496566162e-05, + "loss": 0.4446, + "step": 1747 + }, + { + "epoch": 0.1384828678946326, + "grad_norm": 2.418764482717121, + "learning_rate": 1.938929681089469e-05, + "loss": 0.5034, + "step": 1748 + }, + { + "epoch": 0.13856209150326798, + "grad_norm": 2.2109820477973634, + "learning_rate": 1.9388413506983196e-05, + "loss": 0.5481, + "step": 1749 + }, + { + "epoch": 0.13864131511190333, + "grad_norm": 2.136195707929235, + "learning_rate": 1.938752958488985e-05, + "loss": 0.5183, + "step": 1750 + }, + { + "epoch": 0.13872053872053872, + "grad_norm": 1.9372198262370093, + "learning_rate": 1.9386645044672848e-05, + "loss": 0.3758, + "step": 1751 + }, + { + "epoch": 0.1387997623291741, + "grad_norm": 1.968787028856834, + "learning_rate": 1.9385759886390433e-05, + "loss": 0.4315, + "step": 1752 + }, + { + "epoch": 0.13887898593780948, + "grad_norm": 2.1157436968704757, + "learning_rate": 1.9384874110100897e-05, + "loss": 0.4934, + "step": 1753 + }, + { + "epoch": 0.13895820954644483, + "grad_norm": 2.627711551232593, + "learning_rate": 1.9383987715862554e-05, + "loss": 0.4437, + "step": 1754 + }, + { + "epoch": 0.13903743315508021, + "grad_norm": 2.0759798286362887, + "learning_rate": 1.9383100703733774e-05, + "loss": 0.5854, + "step": 1755 + }, + { + "epoch": 0.1391166567637156, + "grad_norm": 2.2200247530146053, + "learning_rate": 1.9382213073772962e-05, + "loss": 0.5481, + "step": 1756 + }, + { + "epoch": 0.13919588037235095, + "grad_norm": 2.3220177650487273, + "learning_rate": 1.938132482603856e-05, + "loss": 0.5872, + "step": 1757 + }, + { + "epoch": 0.13927510398098633, + "grad_norm": 1.904674940657998, + "learning_rate": 1.9380435960589065e-05, + "loss": 0.4909, + "step": 1758 + }, + { + "epoch": 0.1393543275896217, + "grad_norm": 1.8204153175975037, + "learning_rate": 1.937954647748299e-05, + "loss": 0.4696, + "step": 1759 + }, + { + "epoch": 0.1394335511982571, + "grad_norm": 1.9541158849152975, + "learning_rate": 1.9378656376778914e-05, + "loss": 0.5564, + "step": 1760 + }, + { + "epoch": 0.13951277480689245, + "grad_norm": 1.9878054715321214, + "learning_rate": 1.9377765658535445e-05, + "loss": 0.4724, + "step": 1761 + }, + { + "epoch": 0.13959199841552783, + "grad_norm": 1.7890825933234422, + "learning_rate": 1.937687432281123e-05, + "loss": 0.2845, + "step": 1762 + }, + { + "epoch": 0.1396712220241632, + "grad_norm": 1.7643246744742345, + "learning_rate": 1.9375982369664958e-05, + "loss": 0.4345, + "step": 1763 + }, + { + "epoch": 0.13975044563279856, + "grad_norm": 2.020045225419193, + "learning_rate": 1.937508979915536e-05, + "loss": 0.4975, + "step": 1764 + }, + { + "epoch": 0.13982966924143395, + "grad_norm": 2.467895531181798, + "learning_rate": 1.9374196611341212e-05, + "loss": 0.7063, + "step": 1765 + }, + { + "epoch": 0.13990889285006933, + "grad_norm": 2.232716844846986, + "learning_rate": 1.937330280628132e-05, + "loss": 0.625, + "step": 1766 + }, + { + "epoch": 0.13998811645870468, + "grad_norm": 2.0246533429342373, + "learning_rate": 1.937240838403454e-05, + "loss": 0.4467, + "step": 1767 + }, + { + "epoch": 0.14006734006734006, + "grad_norm": 2.0035787659594453, + "learning_rate": 1.9371513344659764e-05, + "loss": 0.4551, + "step": 1768 + }, + { + "epoch": 0.14014656367597544, + "grad_norm": 1.674694238789154, + "learning_rate": 1.937061768821593e-05, + "loss": 0.3754, + "step": 1769 + }, + { + "epoch": 0.14022578728461083, + "grad_norm": 2.022031124885922, + "learning_rate": 1.936972141476201e-05, + "loss": 0.3525, + "step": 1770 + }, + { + "epoch": 0.14030501089324618, + "grad_norm": 1.9498652413194093, + "learning_rate": 1.936882452435702e-05, + "loss": 0.4232, + "step": 1771 + }, + { + "epoch": 0.14038423450188156, + "grad_norm": 1.6798047629882495, + "learning_rate": 1.936792701706001e-05, + "loss": 0.3754, + "step": 1772 + }, + { + "epoch": 0.14046345811051694, + "grad_norm": 2.0061424050036796, + "learning_rate": 1.9367028892930088e-05, + "loss": 0.383, + "step": 1773 + }, + { + "epoch": 0.1405426817191523, + "grad_norm": 2.390564364769597, + "learning_rate": 1.9366130152026378e-05, + "loss": 0.5046, + "step": 1774 + }, + { + "epoch": 0.14062190532778768, + "grad_norm": 1.624583928388353, + "learning_rate": 1.936523079440807e-05, + "loss": 0.3932, + "step": 1775 + }, + { + "epoch": 0.14070112893642306, + "grad_norm": 2.1716089822125673, + "learning_rate": 1.936433082013437e-05, + "loss": 0.5159, + "step": 1776 + }, + { + "epoch": 0.14078035254505844, + "grad_norm": 1.7393439467030505, + "learning_rate": 1.936343022926455e-05, + "loss": 0.4323, + "step": 1777 + }, + { + "epoch": 0.1408595761536938, + "grad_norm": 2.163720813492295, + "learning_rate": 1.93625290218579e-05, + "loss": 0.5656, + "step": 1778 + }, + { + "epoch": 0.14093879976232918, + "grad_norm": 1.816942167119474, + "learning_rate": 1.9361627197973767e-05, + "loss": 0.3822, + "step": 1779 + }, + { + "epoch": 0.14101802337096456, + "grad_norm": 2.0497407275673174, + "learning_rate": 1.9360724757671525e-05, + "loss": 0.4217, + "step": 1780 + }, + { + "epoch": 0.1410972469795999, + "grad_norm": 1.5973374977032029, + "learning_rate": 1.93598217010106e-05, + "loss": 0.4664, + "step": 1781 + }, + { + "epoch": 0.1411764705882353, + "grad_norm": 2.0599105378559264, + "learning_rate": 1.9358918028050453e-05, + "loss": 0.5569, + "step": 1782 + }, + { + "epoch": 0.14125569419687067, + "grad_norm": 1.6677554130424028, + "learning_rate": 1.9358013738850586e-05, + "loss": 0.43, + "step": 1783 + }, + { + "epoch": 0.14133491780550603, + "grad_norm": 2.215116502783159, + "learning_rate": 1.935710883347054e-05, + "loss": 0.4736, + "step": 1784 + }, + { + "epoch": 0.1414141414141414, + "grad_norm": 2.270922316355081, + "learning_rate": 1.9356203311969903e-05, + "loss": 0.5182, + "step": 1785 + }, + { + "epoch": 0.1414933650227768, + "grad_norm": 1.7845245648001509, + "learning_rate": 1.9355297174408298e-05, + "loss": 0.3727, + "step": 1786 + }, + { + "epoch": 0.14157258863141217, + "grad_norm": 1.7965048440585847, + "learning_rate": 1.9354390420845387e-05, + "loss": 0.4023, + "step": 1787 + }, + { + "epoch": 0.14165181224004753, + "grad_norm": 2.4450551216498755, + "learning_rate": 1.9353483051340876e-05, + "loss": 0.4747, + "step": 1788 + }, + { + "epoch": 0.1417310358486829, + "grad_norm": 2.138546589514695, + "learning_rate": 1.9352575065954515e-05, + "loss": 0.6135, + "step": 1789 + }, + { + "epoch": 0.1418102594573183, + "grad_norm": 2.131733890328888, + "learning_rate": 1.9351666464746087e-05, + "loss": 0.4813, + "step": 1790 + }, + { + "epoch": 0.14188948306595364, + "grad_norm": 1.857900351570552, + "learning_rate": 1.935075724777542e-05, + "loss": 0.4552, + "step": 1791 + }, + { + "epoch": 0.14196870667458902, + "grad_norm": 1.8538230130208877, + "learning_rate": 1.9349847415102378e-05, + "loss": 0.4836, + "step": 1792 + }, + { + "epoch": 0.1420479302832244, + "grad_norm": 1.7066402987548663, + "learning_rate": 1.9348936966786874e-05, + "loss": 0.384, + "step": 1793 + }, + { + "epoch": 0.1421271538918598, + "grad_norm": 2.1433453566457112, + "learning_rate": 1.9348025902888858e-05, + "loss": 0.5182, + "step": 1794 + }, + { + "epoch": 0.14220637750049514, + "grad_norm": 2.1828203151582346, + "learning_rate": 1.9347114223468316e-05, + "loss": 0.4587, + "step": 1795 + }, + { + "epoch": 0.14228560110913052, + "grad_norm": 2.2696694482378073, + "learning_rate": 1.9346201928585273e-05, + "loss": 0.6383, + "step": 1796 + }, + { + "epoch": 0.1423648247177659, + "grad_norm": 1.9021557835183327, + "learning_rate": 1.9345289018299807e-05, + "loss": 0.3727, + "step": 1797 + }, + { + "epoch": 0.14244404832640126, + "grad_norm": 1.9451941303057496, + "learning_rate": 1.9344375492672024e-05, + "loss": 0.4042, + "step": 1798 + }, + { + "epoch": 0.14252327193503664, + "grad_norm": 1.503029071705186, + "learning_rate": 1.934346135176208e-05, + "loss": 0.2743, + "step": 1799 + }, + { + "epoch": 0.14260249554367202, + "grad_norm": 2.0780442067088725, + "learning_rate": 1.9342546595630162e-05, + "loss": 0.4638, + "step": 1800 + }, + { + "epoch": 0.1426817191523074, + "grad_norm": 1.9043616225453697, + "learning_rate": 1.9341631224336503e-05, + "loss": 0.4801, + "step": 1801 + }, + { + "epoch": 0.14276094276094276, + "grad_norm": 2.0991769456846012, + "learning_rate": 1.934071523794138e-05, + "loss": 0.4705, + "step": 1802 + }, + { + "epoch": 0.14284016636957814, + "grad_norm": 2.035425321656529, + "learning_rate": 1.9339798636505102e-05, + "loss": 0.3996, + "step": 1803 + }, + { + "epoch": 0.14291938997821352, + "grad_norm": 2.0135250284271895, + "learning_rate": 1.9338881420088023e-05, + "loss": 0.478, + "step": 1804 + }, + { + "epoch": 0.14299861358684887, + "grad_norm": 2.220003798769769, + "learning_rate": 1.933796358875054e-05, + "loss": 0.5016, + "step": 1805 + }, + { + "epoch": 0.14307783719548425, + "grad_norm": 2.2951484077224986, + "learning_rate": 1.9337045142553085e-05, + "loss": 0.5116, + "step": 1806 + }, + { + "epoch": 0.14315706080411963, + "grad_norm": 2.0687508524836313, + "learning_rate": 1.9336126081556134e-05, + "loss": 0.4933, + "step": 1807 + }, + { + "epoch": 0.143236284412755, + "grad_norm": 2.159079806953251, + "learning_rate": 1.9335206405820208e-05, + "loss": 0.4002, + "step": 1808 + }, + { + "epoch": 0.14331550802139037, + "grad_norm": 2.1994063427890596, + "learning_rate": 1.933428611540585e-05, + "loss": 0.4737, + "step": 1809 + }, + { + "epoch": 0.14339473163002575, + "grad_norm": 2.307142705416731, + "learning_rate": 1.9333365210373668e-05, + "loss": 0.4908, + "step": 1810 + }, + { + "epoch": 0.14347395523866113, + "grad_norm": 1.685965820228709, + "learning_rate": 1.93324436907843e-05, + "loss": 0.3631, + "step": 1811 + }, + { + "epoch": 0.1435531788472965, + "grad_norm": 2.0599784451619927, + "learning_rate": 1.9331521556698415e-05, + "loss": 0.4934, + "step": 1812 + }, + { + "epoch": 0.14363240245593187, + "grad_norm": 2.0466443169909954, + "learning_rate": 1.9330598808176736e-05, + "loss": 0.4656, + "step": 1813 + }, + { + "epoch": 0.14371162606456725, + "grad_norm": 2.1132047205863733, + "learning_rate": 1.9329675445280024e-05, + "loss": 0.4509, + "step": 1814 + }, + { + "epoch": 0.1437908496732026, + "grad_norm": 1.7822908149601173, + "learning_rate": 1.9328751468069075e-05, + "loss": 0.3163, + "step": 1815 + }, + { + "epoch": 0.14387007328183798, + "grad_norm": 1.838990777874387, + "learning_rate": 1.932782687660473e-05, + "loss": 0.5313, + "step": 1816 + }, + { + "epoch": 0.14394929689047337, + "grad_norm": 2.2461276342862044, + "learning_rate": 1.9326901670947868e-05, + "loss": 0.5057, + "step": 1817 + }, + { + "epoch": 0.14402852049910875, + "grad_norm": 1.8609101306501556, + "learning_rate": 1.9325975851159406e-05, + "loss": 0.3807, + "step": 1818 + }, + { + "epoch": 0.1441077441077441, + "grad_norm": 1.981105678074427, + "learning_rate": 1.932504941730031e-05, + "loss": 0.3796, + "step": 1819 + }, + { + "epoch": 0.14418696771637948, + "grad_norm": 2.306442711644101, + "learning_rate": 1.932412236943158e-05, + "loss": 0.5326, + "step": 1820 + }, + { + "epoch": 0.14426619132501486, + "grad_norm": 1.8029045918670155, + "learning_rate": 1.9323194707614253e-05, + "loss": 0.3696, + "step": 1821 + }, + { + "epoch": 0.14434541493365022, + "grad_norm": 1.7897509339138913, + "learning_rate": 1.932226643190942e-05, + "loss": 0.4776, + "step": 1822 + }, + { + "epoch": 0.1444246385422856, + "grad_norm": 2.340700481179847, + "learning_rate": 1.9321337542378193e-05, + "loss": 0.5961, + "step": 1823 + }, + { + "epoch": 0.14450386215092098, + "grad_norm": 2.076583010875345, + "learning_rate": 1.9320408039081745e-05, + "loss": 0.447, + "step": 1824 + }, + { + "epoch": 0.14458308575955633, + "grad_norm": 1.8023699948273684, + "learning_rate": 1.9319477922081273e-05, + "loss": 0.4057, + "step": 1825 + }, + { + "epoch": 0.14466230936819172, + "grad_norm": 1.7624331472698438, + "learning_rate": 1.9318547191438018e-05, + "loss": 0.393, + "step": 1826 + }, + { + "epoch": 0.1447415329768271, + "grad_norm": 2.1307732710959746, + "learning_rate": 1.9317615847213274e-05, + "loss": 0.4143, + "step": 1827 + }, + { + "epoch": 0.14482075658546248, + "grad_norm": 2.15431151013332, + "learning_rate": 1.931668388946836e-05, + "loss": 0.4426, + "step": 1828 + }, + { + "epoch": 0.14489998019409783, + "grad_norm": 2.167344316732097, + "learning_rate": 1.9315751318264636e-05, + "loss": 0.5725, + "step": 1829 + }, + { + "epoch": 0.14497920380273321, + "grad_norm": 2.3659972389832844, + "learning_rate": 1.9314818133663516e-05, + "loss": 0.5478, + "step": 1830 + }, + { + "epoch": 0.1450584274113686, + "grad_norm": 2.2549618688831155, + "learning_rate": 1.9313884335726443e-05, + "loss": 0.476, + "step": 1831 + }, + { + "epoch": 0.14513765102000395, + "grad_norm": 1.698471366718964, + "learning_rate": 1.93129499245149e-05, + "loss": 0.4373, + "step": 1832 + }, + { + "epoch": 0.14521687462863933, + "grad_norm": 2.1759911317876854, + "learning_rate": 1.9312014900090416e-05, + "loss": 0.4952, + "step": 1833 + }, + { + "epoch": 0.1452960982372747, + "grad_norm": 2.2611368908528875, + "learning_rate": 1.931107926251456e-05, + "loss": 0.4248, + "step": 1834 + }, + { + "epoch": 0.1453753218459101, + "grad_norm": 2.0360940514568315, + "learning_rate": 1.931014301184893e-05, + "loss": 0.4159, + "step": 1835 + }, + { + "epoch": 0.14545454545454545, + "grad_norm": 2.117510818133951, + "learning_rate": 1.9309206148155188e-05, + "loss": 0.4959, + "step": 1836 + }, + { + "epoch": 0.14553376906318083, + "grad_norm": 1.8158404215969388, + "learning_rate": 1.930826867149501e-05, + "loss": 0.4113, + "step": 1837 + }, + { + "epoch": 0.1456129926718162, + "grad_norm": 2.209456584755991, + "learning_rate": 1.9307330581930127e-05, + "loss": 0.449, + "step": 1838 + }, + { + "epoch": 0.14569221628045156, + "grad_norm": 2.011804955761324, + "learning_rate": 1.930639187952231e-05, + "loss": 0.4077, + "step": 1839 + }, + { + "epoch": 0.14577143988908695, + "grad_norm": 1.9777091067568096, + "learning_rate": 1.930545256433337e-05, + "loss": 0.4008, + "step": 1840 + }, + { + "epoch": 0.14585066349772233, + "grad_norm": 2.2823522263468665, + "learning_rate": 1.930451263642515e-05, + "loss": 0.4894, + "step": 1841 + }, + { + "epoch": 0.14592988710635768, + "grad_norm": 1.989066747530846, + "learning_rate": 1.9303572095859545e-05, + "loss": 0.4688, + "step": 1842 + }, + { + "epoch": 0.14600911071499306, + "grad_norm": 1.518781692342767, + "learning_rate": 1.9302630942698487e-05, + "loss": 0.3336, + "step": 1843 + }, + { + "epoch": 0.14608833432362844, + "grad_norm": 2.1014901119358766, + "learning_rate": 1.9301689177003938e-05, + "loss": 0.4732, + "step": 1844 + }, + { + "epoch": 0.14616755793226383, + "grad_norm": 1.9598769017842552, + "learning_rate": 1.9300746798837913e-05, + "loss": 0.4883, + "step": 1845 + }, + { + "epoch": 0.14624678154089918, + "grad_norm": 2.3493637045983506, + "learning_rate": 1.9299803808262466e-05, + "loss": 0.5128, + "step": 1846 + }, + { + "epoch": 0.14632600514953456, + "grad_norm": 2.2166559785155435, + "learning_rate": 1.9298860205339685e-05, + "loss": 0.4094, + "step": 1847 + }, + { + "epoch": 0.14640522875816994, + "grad_norm": 1.999818778507222, + "learning_rate": 1.9297915990131704e-05, + "loss": 0.362, + "step": 1848 + }, + { + "epoch": 0.1464844523668053, + "grad_norm": 2.0642269726819746, + "learning_rate": 1.9296971162700696e-05, + "loss": 0.4919, + "step": 1849 + }, + { + "epoch": 0.14656367597544068, + "grad_norm": 1.847261442593121, + "learning_rate": 1.9296025723108867e-05, + "loss": 0.4321, + "step": 1850 + }, + { + "epoch": 0.14664289958407606, + "grad_norm": 1.9549805277293442, + "learning_rate": 1.9295079671418474e-05, + "loss": 0.4691, + "step": 1851 + }, + { + "epoch": 0.14672212319271144, + "grad_norm": 1.85507622284027, + "learning_rate": 1.929413300769181e-05, + "loss": 0.4271, + "step": 1852 + }, + { + "epoch": 0.1468013468013468, + "grad_norm": 2.130972390981168, + "learning_rate": 1.9293185731991212e-05, + "loss": 0.534, + "step": 1853 + }, + { + "epoch": 0.14688057040998218, + "grad_norm": 1.8403258636881825, + "learning_rate": 1.9292237844379043e-05, + "loss": 0.4804, + "step": 1854 + }, + { + "epoch": 0.14695979401861756, + "grad_norm": 2.1049113638663886, + "learning_rate": 1.929128934491773e-05, + "loss": 0.4035, + "step": 1855 + }, + { + "epoch": 0.1470390176272529, + "grad_norm": 1.8290146848525395, + "learning_rate": 1.929034023366972e-05, + "loss": 0.3942, + "step": 1856 + }, + { + "epoch": 0.1471182412358883, + "grad_norm": 2.0293885839440127, + "learning_rate": 1.92893905106975e-05, + "loss": 0.4333, + "step": 1857 + }, + { + "epoch": 0.14719746484452367, + "grad_norm": 2.143315362216103, + "learning_rate": 1.9288440176063617e-05, + "loss": 0.4627, + "step": 1858 + }, + { + "epoch": 0.14727668845315905, + "grad_norm": 2.1624607141174375, + "learning_rate": 1.9287489229830645e-05, + "loss": 0.5045, + "step": 1859 + }, + { + "epoch": 0.1473559120617944, + "grad_norm": 1.9613800936249302, + "learning_rate": 1.9286537672061192e-05, + "loss": 0.5062, + "step": 1860 + }, + { + "epoch": 0.1474351356704298, + "grad_norm": 1.9648995675783334, + "learning_rate": 1.9285585502817917e-05, + "loss": 0.3821, + "step": 1861 + }, + { + "epoch": 0.14751435927906517, + "grad_norm": 1.9786698165372079, + "learning_rate": 1.9284632722163515e-05, + "loss": 0.4164, + "step": 1862 + }, + { + "epoch": 0.14759358288770053, + "grad_norm": 1.9033845131427516, + "learning_rate": 1.9283679330160726e-05, + "loss": 0.5049, + "step": 1863 + }, + { + "epoch": 0.1476728064963359, + "grad_norm": 1.4399827623338708, + "learning_rate": 1.9282725326872324e-05, + "loss": 0.3297, + "step": 1864 + }, + { + "epoch": 0.1477520301049713, + "grad_norm": 1.860708722176362, + "learning_rate": 1.9281770712361123e-05, + "loss": 0.3911, + "step": 1865 + }, + { + "epoch": 0.14783125371360664, + "grad_norm": 1.8495892211555531, + "learning_rate": 1.928081548668998e-05, + "loss": 0.4586, + "step": 1866 + }, + { + "epoch": 0.14791047732224202, + "grad_norm": 1.8420053359851232, + "learning_rate": 1.9279859649921797e-05, + "loss": 0.476, + "step": 1867 + }, + { + "epoch": 0.1479897009308774, + "grad_norm": 1.974134326275655, + "learning_rate": 1.9278903202119508e-05, + "loss": 0.506, + "step": 1868 + }, + { + "epoch": 0.1480689245395128, + "grad_norm": 1.8203063542485634, + "learning_rate": 1.9277946143346086e-05, + "loss": 0.4331, + "step": 1869 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 1.9668506928461797, + "learning_rate": 1.9276988473664557e-05, + "loss": 0.4762, + "step": 1870 + }, + { + "epoch": 0.14822737175678352, + "grad_norm": 2.1191496630185642, + "learning_rate": 1.9276030193137974e-05, + "loss": 0.5223, + "step": 1871 + }, + { + "epoch": 0.1483065953654189, + "grad_norm": 1.7254225381192794, + "learning_rate": 1.927507130182944e-05, + "loss": 0.313, + "step": 1872 + }, + { + "epoch": 0.14838581897405426, + "grad_norm": 1.7873173208811577, + "learning_rate": 1.9274111799802084e-05, + "loss": 0.3964, + "step": 1873 + }, + { + "epoch": 0.14846504258268964, + "grad_norm": 2.2364493024943513, + "learning_rate": 1.9273151687119093e-05, + "loss": 0.4798, + "step": 1874 + }, + { + "epoch": 0.14854426619132502, + "grad_norm": 2.2831714507851073, + "learning_rate": 1.927219096384368e-05, + "loss": 0.3919, + "step": 1875 + }, + { + "epoch": 0.1486234897999604, + "grad_norm": 2.23354123887157, + "learning_rate": 1.9271229630039107e-05, + "loss": 0.4747, + "step": 1876 + }, + { + "epoch": 0.14870271340859575, + "grad_norm": 1.603236364029779, + "learning_rate": 1.9270267685768676e-05, + "loss": 0.3655, + "step": 1877 + }, + { + "epoch": 0.14878193701723114, + "grad_norm": 2.168529420784789, + "learning_rate": 1.9269305131095722e-05, + "loss": 0.3981, + "step": 1878 + }, + { + "epoch": 0.14886116062586652, + "grad_norm": 2.0085573977685494, + "learning_rate": 1.9268341966083627e-05, + "loss": 0.4899, + "step": 1879 + }, + { + "epoch": 0.14894038423450187, + "grad_norm": 2.2243447349940806, + "learning_rate": 1.9267378190795812e-05, + "loss": 0.4673, + "step": 1880 + }, + { + "epoch": 0.14901960784313725, + "grad_norm": 1.9868982898608372, + "learning_rate": 1.9266413805295732e-05, + "loss": 0.3923, + "step": 1881 + }, + { + "epoch": 0.14909883145177263, + "grad_norm": 1.7397097370050283, + "learning_rate": 1.9265448809646893e-05, + "loss": 0.3642, + "step": 1882 + }, + { + "epoch": 0.149178055060408, + "grad_norm": 1.94205580602043, + "learning_rate": 1.9264483203912826e-05, + "loss": 0.4176, + "step": 1883 + }, + { + "epoch": 0.14925727866904337, + "grad_norm": 2.3367070651409754, + "learning_rate": 1.9263516988157123e-05, + "loss": 0.5332, + "step": 1884 + }, + { + "epoch": 0.14933650227767875, + "grad_norm": 2.689015395247916, + "learning_rate": 1.92625501624434e-05, + "loss": 0.4557, + "step": 1885 + }, + { + "epoch": 0.14941572588631413, + "grad_norm": 1.7186251042579286, + "learning_rate": 1.9261582726835316e-05, + "loss": 0.3935, + "step": 1886 + }, + { + "epoch": 0.1494949494949495, + "grad_norm": 1.89128621765505, + "learning_rate": 1.926061468139657e-05, + "loss": 0.3861, + "step": 1887 + }, + { + "epoch": 0.14957417310358487, + "grad_norm": 1.8673714239430046, + "learning_rate": 1.9259646026190913e-05, + "loss": 0.3528, + "step": 1888 + }, + { + "epoch": 0.14965339671222025, + "grad_norm": 2.0300503501002876, + "learning_rate": 1.9258676761282117e-05, + "loss": 0.3455, + "step": 1889 + }, + { + "epoch": 0.1497326203208556, + "grad_norm": 1.7113529266372025, + "learning_rate": 1.9257706886734e-05, + "loss": 0.3368, + "step": 1890 + }, + { + "epoch": 0.14981184392949098, + "grad_norm": 1.8920505991175196, + "learning_rate": 1.9256736402610437e-05, + "loss": 0.3931, + "step": 1891 + }, + { + "epoch": 0.14989106753812637, + "grad_norm": 2.0710426675113234, + "learning_rate": 1.9255765308975322e-05, + "loss": 0.3905, + "step": 1892 + }, + { + "epoch": 0.14997029114676175, + "grad_norm": 2.3245060771205983, + "learning_rate": 1.9254793605892596e-05, + "loss": 0.5304, + "step": 1893 + }, + { + "epoch": 0.1500495147553971, + "grad_norm": 1.762178470731661, + "learning_rate": 1.9253821293426242e-05, + "loss": 0.43, + "step": 1894 + }, + { + "epoch": 0.15012873836403248, + "grad_norm": 1.9129250808745424, + "learning_rate": 1.9252848371640284e-05, + "loss": 0.4565, + "step": 1895 + }, + { + "epoch": 0.15020796197266786, + "grad_norm": 1.868247456948084, + "learning_rate": 1.925187484059878e-05, + "loss": 0.4063, + "step": 1896 + }, + { + "epoch": 0.15028718558130322, + "grad_norm": 2.0298751791691827, + "learning_rate": 1.9250900700365837e-05, + "loss": 0.5094, + "step": 1897 + }, + { + "epoch": 0.1503664091899386, + "grad_norm": 1.6624511402079098, + "learning_rate": 1.9249925951005593e-05, + "loss": 0.3305, + "step": 1898 + }, + { + "epoch": 0.15044563279857398, + "grad_norm": 1.8351561038727953, + "learning_rate": 1.9248950592582235e-05, + "loss": 0.479, + "step": 1899 + }, + { + "epoch": 0.15052485640720936, + "grad_norm": 1.7694296875548985, + "learning_rate": 1.9247974625159983e-05, + "loss": 0.3434, + "step": 1900 + }, + { + "epoch": 0.15060408001584472, + "grad_norm": 2.032063298012281, + "learning_rate": 1.92469980488031e-05, + "loss": 0.4861, + "step": 1901 + }, + { + "epoch": 0.1506833036244801, + "grad_norm": 2.224114977159279, + "learning_rate": 1.924602086357589e-05, + "loss": 0.5321, + "step": 1902 + }, + { + "epoch": 0.15076252723311548, + "grad_norm": 1.8468503194943078, + "learning_rate": 1.9245043069542696e-05, + "loss": 0.4219, + "step": 1903 + }, + { + "epoch": 0.15084175084175083, + "grad_norm": 1.8128736638197256, + "learning_rate": 1.92440646667679e-05, + "loss": 0.3802, + "step": 1904 + }, + { + "epoch": 0.1509209744503862, + "grad_norm": 1.9532221440247153, + "learning_rate": 1.9243085655315924e-05, + "loss": 0.5408, + "step": 1905 + }, + { + "epoch": 0.1510001980590216, + "grad_norm": 1.820683007173657, + "learning_rate": 1.924210603525123e-05, + "loss": 0.4453, + "step": 1906 + }, + { + "epoch": 0.15107942166765695, + "grad_norm": 2.304764565458607, + "learning_rate": 1.924112580663833e-05, + "loss": 0.431, + "step": 1907 + }, + { + "epoch": 0.15115864527629233, + "grad_norm": 2.1196869594549077, + "learning_rate": 1.9240144969541754e-05, + "loss": 0.4517, + "step": 1908 + }, + { + "epoch": 0.1512378688849277, + "grad_norm": 1.9947772795376557, + "learning_rate": 1.9239163524026097e-05, + "loss": 0.4483, + "step": 1909 + }, + { + "epoch": 0.1513170924935631, + "grad_norm": 1.684761660778757, + "learning_rate": 1.9238181470155978e-05, + "loss": 0.4046, + "step": 1910 + }, + { + "epoch": 0.15139631610219845, + "grad_norm": 2.130858326720803, + "learning_rate": 1.923719880799606e-05, + "loss": 0.5245, + "step": 1911 + }, + { + "epoch": 0.15147553971083383, + "grad_norm": 2.3276100423522386, + "learning_rate": 1.9236215537611044e-05, + "loss": 0.4491, + "step": 1912 + }, + { + "epoch": 0.1515547633194692, + "grad_norm": 1.89893713295418, + "learning_rate": 1.923523165906568e-05, + "loss": 0.4959, + "step": 1913 + }, + { + "epoch": 0.15163398692810456, + "grad_norm": 1.914576502782711, + "learning_rate": 1.923424717242475e-05, + "loss": 0.4415, + "step": 1914 + }, + { + "epoch": 0.15171321053673995, + "grad_norm": 1.843912458264141, + "learning_rate": 1.923326207775307e-05, + "loss": 0.4721, + "step": 1915 + }, + { + "epoch": 0.15179243414537533, + "grad_norm": 1.8891873478621077, + "learning_rate": 1.9232276375115517e-05, + "loss": 0.4514, + "step": 1916 + }, + { + "epoch": 0.1518716577540107, + "grad_norm": 2.106252079033078, + "learning_rate": 1.9231290064576985e-05, + "loss": 0.4155, + "step": 1917 + }, + { + "epoch": 0.15195088136264606, + "grad_norm": 1.695659540473576, + "learning_rate": 1.923030314620242e-05, + "loss": 0.4002, + "step": 1918 + }, + { + "epoch": 0.15203010497128144, + "grad_norm": 2.28437970043238, + "learning_rate": 1.9229315620056805e-05, + "loss": 0.5403, + "step": 1919 + }, + { + "epoch": 0.15210932857991682, + "grad_norm": 2.183536217192937, + "learning_rate": 1.9228327486205166e-05, + "loss": 0.5063, + "step": 1920 + }, + { + "epoch": 0.15218855218855218, + "grad_norm": 1.9305139961497506, + "learning_rate": 1.9227338744712565e-05, + "loss": 0.4946, + "step": 1921 + }, + { + "epoch": 0.15226777579718756, + "grad_norm": 1.942805626215468, + "learning_rate": 1.9226349395644106e-05, + "loss": 0.5179, + "step": 1922 + }, + { + "epoch": 0.15234699940582294, + "grad_norm": 2.187066801878686, + "learning_rate": 1.9225359439064934e-05, + "loss": 0.4609, + "step": 1923 + }, + { + "epoch": 0.1524262230144583, + "grad_norm": 1.7678449602437118, + "learning_rate": 1.9224368875040235e-05, + "loss": 0.4618, + "step": 1924 + }, + { + "epoch": 0.15250544662309368, + "grad_norm": 1.8783593693212375, + "learning_rate": 1.922337770363523e-05, + "loss": 0.3983, + "step": 1925 + }, + { + "epoch": 0.15258467023172906, + "grad_norm": 1.6685539071365887, + "learning_rate": 1.922238592491518e-05, + "loss": 0.4266, + "step": 1926 + }, + { + "epoch": 0.15266389384036444, + "grad_norm": 1.83736043297227, + "learning_rate": 1.9221393538945397e-05, + "loss": 0.5038, + "step": 1927 + }, + { + "epoch": 0.1527431174489998, + "grad_norm": 2.065717585181778, + "learning_rate": 1.9220400545791216e-05, + "loss": 0.4098, + "step": 1928 + }, + { + "epoch": 0.15282234105763517, + "grad_norm": 2.194196464387683, + "learning_rate": 1.9219406945518028e-05, + "loss": 0.503, + "step": 1929 + }, + { + "epoch": 0.15290156466627056, + "grad_norm": 1.6921395712465421, + "learning_rate": 1.921841273819125e-05, + "loss": 0.4009, + "step": 1930 + }, + { + "epoch": 0.1529807882749059, + "grad_norm": 2.3982264520278855, + "learning_rate": 1.9217417923876352e-05, + "loss": 0.6454, + "step": 1931 + }, + { + "epoch": 0.1530600118835413, + "grad_norm": 2.434815057296822, + "learning_rate": 1.9216422502638836e-05, + "loss": 0.4516, + "step": 1932 + }, + { + "epoch": 0.15313923549217667, + "grad_norm": 1.5625483543103351, + "learning_rate": 1.9215426474544242e-05, + "loss": 0.3211, + "step": 1933 + }, + { + "epoch": 0.15321845910081205, + "grad_norm": 1.6859384136778546, + "learning_rate": 1.9214429839658156e-05, + "loss": 0.3147, + "step": 1934 + }, + { + "epoch": 0.1532976827094474, + "grad_norm": 1.8659018671223904, + "learning_rate": 1.9213432598046205e-05, + "loss": 0.4216, + "step": 1935 + }, + { + "epoch": 0.1533769063180828, + "grad_norm": 1.950069094228493, + "learning_rate": 1.9212434749774048e-05, + "loss": 0.3841, + "step": 1936 + }, + { + "epoch": 0.15345612992671817, + "grad_norm": 1.9437979084349992, + "learning_rate": 1.921143629490739e-05, + "loss": 0.4203, + "step": 1937 + }, + { + "epoch": 0.15353535353535352, + "grad_norm": 2.132083856118412, + "learning_rate": 1.9210437233511974e-05, + "loss": 0.3643, + "step": 1938 + }, + { + "epoch": 0.1536145771439889, + "grad_norm": 2.0621296382892886, + "learning_rate": 1.9209437565653587e-05, + "loss": 0.3834, + "step": 1939 + }, + { + "epoch": 0.1536938007526243, + "grad_norm": 2.3378011508499488, + "learning_rate": 1.9208437291398045e-05, + "loss": 0.5445, + "step": 1940 + }, + { + "epoch": 0.15377302436125967, + "grad_norm": 1.8673915945006168, + "learning_rate": 1.920743641081122e-05, + "loss": 0.4372, + "step": 1941 + }, + { + "epoch": 0.15385224796989502, + "grad_norm": 1.559541513346412, + "learning_rate": 1.920643492395901e-05, + "loss": 0.3706, + "step": 1942 + }, + { + "epoch": 0.1539314715785304, + "grad_norm": 1.9106238247941263, + "learning_rate": 1.9205432830907353e-05, + "loss": 0.4876, + "step": 1943 + }, + { + "epoch": 0.15401069518716579, + "grad_norm": 1.8895165530515177, + "learning_rate": 1.9204430131722243e-05, + "loss": 0.4114, + "step": 1944 + }, + { + "epoch": 0.15408991879580114, + "grad_norm": 2.1240288066456925, + "learning_rate": 1.9203426826469695e-05, + "loss": 0.464, + "step": 1945 + }, + { + "epoch": 0.15416914240443652, + "grad_norm": 2.2132213437258508, + "learning_rate": 1.9202422915215777e-05, + "loss": 0.4162, + "step": 1946 + }, + { + "epoch": 0.1542483660130719, + "grad_norm": 1.9889021149191182, + "learning_rate": 1.920141839802659e-05, + "loss": 0.4147, + "step": 1947 + }, + { + "epoch": 0.15432758962170726, + "grad_norm": 2.0598692042699227, + "learning_rate": 1.9200413274968276e-05, + "loss": 0.3997, + "step": 1948 + }, + { + "epoch": 0.15440681323034264, + "grad_norm": 2.0430074413352903, + "learning_rate": 1.9199407546107014e-05, + "loss": 0.4796, + "step": 1949 + }, + { + "epoch": 0.15448603683897802, + "grad_norm": 2.3283017916561928, + "learning_rate": 1.919840121150903e-05, + "loss": 0.3975, + "step": 1950 + }, + { + "epoch": 0.1545652604476134, + "grad_norm": 2.3288207185637506, + "learning_rate": 1.9197394271240587e-05, + "loss": 0.5509, + "step": 1951 + }, + { + "epoch": 0.15464448405624875, + "grad_norm": 1.972040550521486, + "learning_rate": 1.919638672536799e-05, + "loss": 0.4874, + "step": 1952 + }, + { + "epoch": 0.15472370766488414, + "grad_norm": 2.2835353620288976, + "learning_rate": 1.9195378573957574e-05, + "loss": 0.4073, + "step": 1953 + }, + { + "epoch": 0.15480293127351952, + "grad_norm": 2.3055396461467703, + "learning_rate": 1.9194369817075725e-05, + "loss": 0.5435, + "step": 1954 + }, + { + "epoch": 0.15488215488215487, + "grad_norm": 1.7468913721066341, + "learning_rate": 1.9193360454788864e-05, + "loss": 0.4197, + "step": 1955 + }, + { + "epoch": 0.15496137849079025, + "grad_norm": 1.8901320146770375, + "learning_rate": 1.919235048716345e-05, + "loss": 0.3991, + "step": 1956 + }, + { + "epoch": 0.15504060209942563, + "grad_norm": 1.8694540215549391, + "learning_rate": 1.919133991426599e-05, + "loss": 0.4191, + "step": 1957 + }, + { + "epoch": 0.15511982570806102, + "grad_norm": 2.111349903660736, + "learning_rate": 1.919032873616302e-05, + "loss": 0.5526, + "step": 1958 + }, + { + "epoch": 0.15519904931669637, + "grad_norm": 1.8547085662579925, + "learning_rate": 1.918931695292113e-05, + "loss": 0.5017, + "step": 1959 + }, + { + "epoch": 0.15527827292533175, + "grad_norm": 1.8235165024553577, + "learning_rate": 1.918830456460693e-05, + "loss": 0.4983, + "step": 1960 + }, + { + "epoch": 0.15535749653396713, + "grad_norm": 2.154537625909922, + "learning_rate": 1.9187291571287088e-05, + "loss": 0.4045, + "step": 1961 + }, + { + "epoch": 0.15543672014260249, + "grad_norm": 1.7149052536133882, + "learning_rate": 1.91862779730283e-05, + "loss": 0.3957, + "step": 1962 + }, + { + "epoch": 0.15551594375123787, + "grad_norm": 2.0026327116423372, + "learning_rate": 1.918526376989731e-05, + "loss": 0.4579, + "step": 1963 + }, + { + "epoch": 0.15559516735987325, + "grad_norm": 1.7958870638110085, + "learning_rate": 1.9184248961960895e-05, + "loss": 0.4649, + "step": 1964 + }, + { + "epoch": 0.1556743909685086, + "grad_norm": 1.9881404823689581, + "learning_rate": 1.918323354928588e-05, + "loss": 0.5235, + "step": 1965 + }, + { + "epoch": 0.15575361457714398, + "grad_norm": 1.6403889304204737, + "learning_rate": 1.918221753193912e-05, + "loss": 0.4186, + "step": 1966 + }, + { + "epoch": 0.15583283818577937, + "grad_norm": 1.8395212180387734, + "learning_rate": 1.9181200909987524e-05, + "loss": 0.5151, + "step": 1967 + }, + { + "epoch": 0.15591206179441475, + "grad_norm": 1.8932017186535837, + "learning_rate": 1.918018368349802e-05, + "loss": 0.461, + "step": 1968 + }, + { + "epoch": 0.1559912854030501, + "grad_norm": 1.9612754693647139, + "learning_rate": 1.9179165852537596e-05, + "loss": 0.4759, + "step": 1969 + }, + { + "epoch": 0.15607050901168548, + "grad_norm": 2.142376187267396, + "learning_rate": 1.9178147417173265e-05, + "loss": 0.5838, + "step": 1970 + }, + { + "epoch": 0.15614973262032086, + "grad_norm": 2.072107257313081, + "learning_rate": 1.917712837747209e-05, + "loss": 0.4795, + "step": 1971 + }, + { + "epoch": 0.15622895622895622, + "grad_norm": 2.1710238774166597, + "learning_rate": 1.917610873350117e-05, + "loss": 0.4474, + "step": 1972 + }, + { + "epoch": 0.1563081798375916, + "grad_norm": 2.1051730723179514, + "learning_rate": 1.917508848532764e-05, + "loss": 0.4373, + "step": 1973 + }, + { + "epoch": 0.15638740344622698, + "grad_norm": 1.7526336787370713, + "learning_rate": 1.9174067633018682e-05, + "loss": 0.4352, + "step": 1974 + }, + { + "epoch": 0.15646662705486236, + "grad_norm": 1.8533115293675426, + "learning_rate": 1.9173046176641515e-05, + "loss": 0.3838, + "step": 1975 + }, + { + "epoch": 0.15654585066349772, + "grad_norm": 2.019297783913294, + "learning_rate": 1.917202411626339e-05, + "loss": 0.3946, + "step": 1976 + }, + { + "epoch": 0.1566250742721331, + "grad_norm": 1.9613658949742254, + "learning_rate": 1.9171001451951616e-05, + "loss": 0.4629, + "step": 1977 + }, + { + "epoch": 0.15670429788076848, + "grad_norm": 2.316414806630757, + "learning_rate": 1.916997818377352e-05, + "loss": 0.476, + "step": 1978 + }, + { + "epoch": 0.15678352148940383, + "grad_norm": 2.0416898656544595, + "learning_rate": 1.9168954311796487e-05, + "loss": 0.4168, + "step": 1979 + }, + { + "epoch": 0.1568627450980392, + "grad_norm": 1.8983919071654909, + "learning_rate": 1.9167929836087932e-05, + "loss": 0.4465, + "step": 1980 + }, + { + "epoch": 0.1569419687066746, + "grad_norm": 1.8743570587315996, + "learning_rate": 1.9166904756715307e-05, + "loss": 0.4146, + "step": 1981 + }, + { + "epoch": 0.15702119231530995, + "grad_norm": 2.2083717297570997, + "learning_rate": 1.9165879073746112e-05, + "loss": 0.6243, + "step": 1982 + }, + { + "epoch": 0.15710041592394533, + "grad_norm": 1.617666459264679, + "learning_rate": 1.9164852787247887e-05, + "loss": 0.4738, + "step": 1983 + }, + { + "epoch": 0.1571796395325807, + "grad_norm": 1.875389729531953, + "learning_rate": 1.91638258972882e-05, + "loss": 0.42, + "step": 1984 + }, + { + "epoch": 0.1572588631412161, + "grad_norm": 1.8603775762284498, + "learning_rate": 1.916279840393467e-05, + "loss": 0.4651, + "step": 1985 + }, + { + "epoch": 0.15733808674985145, + "grad_norm": 2.231502971943156, + "learning_rate": 1.916177030725496e-05, + "loss": 0.5369, + "step": 1986 + }, + { + "epoch": 0.15741731035848683, + "grad_norm": 1.7585322187350656, + "learning_rate": 1.9160741607316755e-05, + "loss": 0.402, + "step": 1987 + }, + { + "epoch": 0.1574965339671222, + "grad_norm": 1.9824088280783905, + "learning_rate": 1.9159712304187795e-05, + "loss": 0.4537, + "step": 1988 + }, + { + "epoch": 0.15757575757575756, + "grad_norm": 1.7478757597559542, + "learning_rate": 1.9158682397935852e-05, + "loss": 0.3418, + "step": 1989 + }, + { + "epoch": 0.15765498118439294, + "grad_norm": 2.0178883333436697, + "learning_rate": 1.9157651888628744e-05, + "loss": 0.4895, + "step": 1990 + }, + { + "epoch": 0.15773420479302833, + "grad_norm": 2.004160681129957, + "learning_rate": 1.915662077633432e-05, + "loss": 0.4939, + "step": 1991 + }, + { + "epoch": 0.1578134284016637, + "grad_norm": 1.596888270010446, + "learning_rate": 1.915558906112048e-05, + "loss": 0.361, + "step": 1992 + }, + { + "epoch": 0.15789265201029906, + "grad_norm": 1.7962284825872572, + "learning_rate": 1.915455674305515e-05, + "loss": 0.3198, + "step": 1993 + }, + { + "epoch": 0.15797187561893444, + "grad_norm": 2.3687125225646013, + "learning_rate": 1.9153523822206312e-05, + "loss": 0.435, + "step": 1994 + }, + { + "epoch": 0.15805109922756982, + "grad_norm": 2.018520997961375, + "learning_rate": 1.9152490298641973e-05, + "loss": 0.4507, + "step": 1995 + }, + { + "epoch": 0.15813032283620518, + "grad_norm": 1.9315785147219517, + "learning_rate": 1.9151456172430186e-05, + "loss": 0.4238, + "step": 1996 + }, + { + "epoch": 0.15820954644484056, + "grad_norm": 1.99264143017953, + "learning_rate": 1.9150421443639045e-05, + "loss": 0.4224, + "step": 1997 + }, + { + "epoch": 0.15828877005347594, + "grad_norm": 2.4910885234633198, + "learning_rate": 1.9149386112336682e-05, + "loss": 0.4694, + "step": 1998 + }, + { + "epoch": 0.15836799366211132, + "grad_norm": 1.8709733923205423, + "learning_rate": 1.9148350178591264e-05, + "loss": 0.3645, + "step": 1999 + }, + { + "epoch": 0.15844721727074668, + "grad_norm": 1.9463881954475855, + "learning_rate": 1.914731364247101e-05, + "loss": 0.4791, + "step": 2000 + }, + { + "epoch": 0.15852644087938206, + "grad_norm": 2.017351418628687, + "learning_rate": 1.914627650404416e-05, + "loss": 0.459, + "step": 2001 + }, + { + "epoch": 0.15860566448801744, + "grad_norm": 2.4404235710193176, + "learning_rate": 1.9145238763379016e-05, + "loss": 0.4703, + "step": 2002 + }, + { + "epoch": 0.1586848880966528, + "grad_norm": 2.159753927165498, + "learning_rate": 1.9144200420543905e-05, + "loss": 0.4714, + "step": 2003 + }, + { + "epoch": 0.15876411170528817, + "grad_norm": 1.841447863879372, + "learning_rate": 1.9143161475607194e-05, + "loss": 0.4276, + "step": 2004 + }, + { + "epoch": 0.15884333531392356, + "grad_norm": 2.019703730189138, + "learning_rate": 1.9142121928637292e-05, + "loss": 0.4465, + "step": 2005 + }, + { + "epoch": 0.1589225589225589, + "grad_norm": 1.7159889083633495, + "learning_rate": 1.914108177970265e-05, + "loss": 0.3886, + "step": 2006 + }, + { + "epoch": 0.1590017825311943, + "grad_norm": 2.138110793928573, + "learning_rate": 1.914004102887176e-05, + "loss": 0.4477, + "step": 2007 + }, + { + "epoch": 0.15908100613982967, + "grad_norm": 1.9513403361777697, + "learning_rate": 1.9138999676213146e-05, + "loss": 0.5159, + "step": 2008 + }, + { + "epoch": 0.15916022974846505, + "grad_norm": 2.0677284631210897, + "learning_rate": 1.9137957721795376e-05, + "loss": 0.4755, + "step": 2009 + }, + { + "epoch": 0.1592394533571004, + "grad_norm": 2.2030131498243293, + "learning_rate": 1.913691516568706e-05, + "loss": 0.5203, + "step": 2010 + }, + { + "epoch": 0.1593186769657358, + "grad_norm": 1.9704861041232478, + "learning_rate": 1.9135872007956846e-05, + "loss": 0.4124, + "step": 2011 + }, + { + "epoch": 0.15939790057437117, + "grad_norm": 1.8426520951644911, + "learning_rate": 1.9134828248673415e-05, + "loss": 0.3733, + "step": 2012 + }, + { + "epoch": 0.15947712418300652, + "grad_norm": 2.0257096131206302, + "learning_rate": 1.9133783887905502e-05, + "loss": 0.5265, + "step": 2013 + }, + { + "epoch": 0.1595563477916419, + "grad_norm": 1.6164377945708577, + "learning_rate": 1.913273892572187e-05, + "loss": 0.3541, + "step": 2014 + }, + { + "epoch": 0.1596355714002773, + "grad_norm": 2.0665921569618275, + "learning_rate": 1.9131693362191318e-05, + "loss": 0.4187, + "step": 2015 + }, + { + "epoch": 0.15971479500891267, + "grad_norm": 2.3161746982649007, + "learning_rate": 1.91306471973827e-05, + "loss": 0.4786, + "step": 2016 + }, + { + "epoch": 0.15979401861754802, + "grad_norm": 2.2556797725522566, + "learning_rate": 1.91296004313649e-05, + "loss": 0.4511, + "step": 2017 + }, + { + "epoch": 0.1598732422261834, + "grad_norm": 2.256879583081843, + "learning_rate": 1.9128553064206835e-05, + "loss": 0.4731, + "step": 2018 + }, + { + "epoch": 0.15995246583481879, + "grad_norm": 1.9858378208952217, + "learning_rate": 1.9127505095977483e-05, + "loss": 0.5413, + "step": 2019 + }, + { + "epoch": 0.16003168944345414, + "grad_norm": 2.214168557110572, + "learning_rate": 1.9126456526745833e-05, + "loss": 0.4752, + "step": 2020 + }, + { + "epoch": 0.16011091305208952, + "grad_norm": 1.998546795067003, + "learning_rate": 1.9125407356580932e-05, + "loss": 0.4276, + "step": 2021 + }, + { + "epoch": 0.1601901366607249, + "grad_norm": 2.1717770604291156, + "learning_rate": 1.9124357585551872e-05, + "loss": 0.5396, + "step": 2022 + }, + { + "epoch": 0.16026936026936026, + "grad_norm": 1.8499838317063686, + "learning_rate": 1.9123307213727764e-05, + "loss": 0.4366, + "step": 2023 + }, + { + "epoch": 0.16034858387799564, + "grad_norm": 2.069710774423531, + "learning_rate": 1.9122256241177776e-05, + "loss": 0.4577, + "step": 2024 + }, + { + "epoch": 0.16042780748663102, + "grad_norm": 1.9744168513151665, + "learning_rate": 1.9121204667971107e-05, + "loss": 0.4787, + "step": 2025 + }, + { + "epoch": 0.1605070310952664, + "grad_norm": 2.1224475634786195, + "learning_rate": 1.9120152494177e-05, + "loss": 0.4443, + "step": 2026 + }, + { + "epoch": 0.16058625470390175, + "grad_norm": 1.5698929079671524, + "learning_rate": 1.9119099719864735e-05, + "loss": 0.4655, + "step": 2027 + }, + { + "epoch": 0.16066547831253714, + "grad_norm": 2.3474613616238753, + "learning_rate": 1.911804634510363e-05, + "loss": 0.435, + "step": 2028 + }, + { + "epoch": 0.16074470192117252, + "grad_norm": 2.032901151342805, + "learning_rate": 1.911699236996305e-05, + "loss": 0.4536, + "step": 2029 + }, + { + "epoch": 0.16082392552980787, + "grad_norm": 1.511792177635146, + "learning_rate": 1.911593779451239e-05, + "loss": 0.3334, + "step": 2030 + }, + { + "epoch": 0.16090314913844325, + "grad_norm": 2.0327297256623638, + "learning_rate": 1.911488261882109e-05, + "loss": 0.5448, + "step": 2031 + }, + { + "epoch": 0.16098237274707863, + "grad_norm": 2.029856153994575, + "learning_rate": 1.911382684295862e-05, + "loss": 0.4393, + "step": 2032 + }, + { + "epoch": 0.16106159635571402, + "grad_norm": 2.2739213651011267, + "learning_rate": 1.911277046699451e-05, + "loss": 0.4683, + "step": 2033 + }, + { + "epoch": 0.16114081996434937, + "grad_norm": 1.6415975123062514, + "learning_rate": 1.9111713490998316e-05, + "loss": 0.3008, + "step": 2034 + }, + { + "epoch": 0.16122004357298475, + "grad_norm": 1.8590560329796424, + "learning_rate": 1.911065591503963e-05, + "loss": 0.483, + "step": 2035 + }, + { + "epoch": 0.16129926718162013, + "grad_norm": 1.703794023995196, + "learning_rate": 1.9109597739188088e-05, + "loss": 0.4398, + "step": 2036 + }, + { + "epoch": 0.16137849079025549, + "grad_norm": 2.079177596949361, + "learning_rate": 1.9108538963513366e-05, + "loss": 0.4747, + "step": 2037 + }, + { + "epoch": 0.16145771439889087, + "grad_norm": 2.0291583347975672, + "learning_rate": 1.9107479588085182e-05, + "loss": 0.3828, + "step": 2038 + }, + { + "epoch": 0.16153693800752625, + "grad_norm": 1.9075007919021594, + "learning_rate": 1.910641961297329e-05, + "loss": 0.4762, + "step": 2039 + }, + { + "epoch": 0.16161616161616163, + "grad_norm": 2.2221860674462475, + "learning_rate": 1.9105359038247484e-05, + "loss": 0.4476, + "step": 2040 + }, + { + "epoch": 0.16169538522479698, + "grad_norm": 1.9098952614991478, + "learning_rate": 1.9104297863977595e-05, + "loss": 0.4687, + "step": 2041 + }, + { + "epoch": 0.16177460883343237, + "grad_norm": 2.2812926157485798, + "learning_rate": 1.9103236090233507e-05, + "loss": 0.4885, + "step": 2042 + }, + { + "epoch": 0.16185383244206775, + "grad_norm": 1.903474218188879, + "learning_rate": 1.9102173717085114e-05, + "loss": 0.4055, + "step": 2043 + }, + { + "epoch": 0.1619330560507031, + "grad_norm": 2.3890631174378254, + "learning_rate": 1.9101110744602384e-05, + "loss": 0.4792, + "step": 2044 + }, + { + "epoch": 0.16201227965933848, + "grad_norm": 2.2426265314474194, + "learning_rate": 1.9100047172855306e-05, + "loss": 0.5375, + "step": 2045 + }, + { + "epoch": 0.16209150326797386, + "grad_norm": 1.64981596146523, + "learning_rate": 1.9098983001913903e-05, + "loss": 0.3908, + "step": 2046 + }, + { + "epoch": 0.16217072687660922, + "grad_norm": 1.9634646988029925, + "learning_rate": 1.909791823184825e-05, + "loss": 0.4321, + "step": 2047 + }, + { + "epoch": 0.1622499504852446, + "grad_norm": 1.8697621624055465, + "learning_rate": 1.909685286272846e-05, + "loss": 0.4648, + "step": 2048 + }, + { + "epoch": 0.16232917409387998, + "grad_norm": 1.7425545839507501, + "learning_rate": 1.9095786894624685e-05, + "loss": 0.4435, + "step": 2049 + }, + { + "epoch": 0.16240839770251536, + "grad_norm": 1.9943424804669774, + "learning_rate": 1.9094720327607102e-05, + "loss": 0.4894, + "step": 2050 + }, + { + "epoch": 0.16248762131115072, + "grad_norm": 1.803249905651218, + "learning_rate": 1.909365316174595e-05, + "loss": 0.4681, + "step": 2051 + }, + { + "epoch": 0.1625668449197861, + "grad_norm": 1.5984663033682562, + "learning_rate": 1.9092585397111492e-05, + "loss": 0.2981, + "step": 2052 + }, + { + "epoch": 0.16264606852842148, + "grad_norm": 2.2081089528667444, + "learning_rate": 1.9091517033774038e-05, + "loss": 0.5272, + "step": 2053 + }, + { + "epoch": 0.16272529213705683, + "grad_norm": 1.8387584633591254, + "learning_rate": 1.9090448071803932e-05, + "loss": 0.4184, + "step": 2054 + }, + { + "epoch": 0.1628045157456922, + "grad_norm": 1.7075200875404835, + "learning_rate": 1.908937851127156e-05, + "loss": 0.4085, + "step": 2055 + }, + { + "epoch": 0.1628837393543276, + "grad_norm": 1.6881624219173819, + "learning_rate": 1.908830835224735e-05, + "loss": 0.3926, + "step": 2056 + }, + { + "epoch": 0.16296296296296298, + "grad_norm": 1.8902781401712885, + "learning_rate": 1.9087237594801762e-05, + "loss": 0.4746, + "step": 2057 + }, + { + "epoch": 0.16304218657159833, + "grad_norm": 2.1921377278812217, + "learning_rate": 1.9086166239005305e-05, + "loss": 0.5582, + "step": 2058 + }, + { + "epoch": 0.1631214101802337, + "grad_norm": 1.8514505977716071, + "learning_rate": 1.908509428492852e-05, + "loss": 0.3789, + "step": 2059 + }, + { + "epoch": 0.1632006337888691, + "grad_norm": 1.8186435845825861, + "learning_rate": 1.9084021732641994e-05, + "loss": 0.4499, + "step": 2060 + }, + { + "epoch": 0.16327985739750445, + "grad_norm": 1.9738670012443507, + "learning_rate": 1.9082948582216344e-05, + "loss": 0.4885, + "step": 2061 + }, + { + "epoch": 0.16335908100613983, + "grad_norm": 2.3557387508860597, + "learning_rate": 1.9081874833722234e-05, + "loss": 0.4721, + "step": 2062 + }, + { + "epoch": 0.1634383046147752, + "grad_norm": 2.283746955716872, + "learning_rate": 1.908080048723037e-05, + "loss": 0.5479, + "step": 2063 + }, + { + "epoch": 0.16351752822341056, + "grad_norm": 1.8879144260508687, + "learning_rate": 1.9079725542811484e-05, + "loss": 0.4572, + "step": 2064 + }, + { + "epoch": 0.16359675183204594, + "grad_norm": 1.8688000657124575, + "learning_rate": 1.907865000053636e-05, + "loss": 0.4312, + "step": 2065 + }, + { + "epoch": 0.16367597544068133, + "grad_norm": 1.7126821263919434, + "learning_rate": 1.9077573860475815e-05, + "loss": 0.4382, + "step": 2066 + }, + { + "epoch": 0.1637551990493167, + "grad_norm": 1.9439573365348657, + "learning_rate": 1.9076497122700713e-05, + "loss": 0.4075, + "step": 2067 + }, + { + "epoch": 0.16383442265795206, + "grad_norm": 1.7635609787546924, + "learning_rate": 1.9075419787281948e-05, + "loss": 0.4871, + "step": 2068 + }, + { + "epoch": 0.16391364626658744, + "grad_norm": 1.9910760462873718, + "learning_rate": 1.9074341854290458e-05, + "loss": 0.3818, + "step": 2069 + }, + { + "epoch": 0.16399286987522282, + "grad_norm": 1.7261206009996528, + "learning_rate": 1.907326332379722e-05, + "loss": 0.3531, + "step": 2070 + }, + { + "epoch": 0.16407209348385818, + "grad_norm": 1.772590764593171, + "learning_rate": 1.9072184195873248e-05, + "loss": 0.3428, + "step": 2071 + }, + { + "epoch": 0.16415131709249356, + "grad_norm": 1.7374034073598783, + "learning_rate": 1.9071104470589603e-05, + "loss": 0.3992, + "step": 2072 + }, + { + "epoch": 0.16423054070112894, + "grad_norm": 2.137588648482301, + "learning_rate": 1.9070024148017375e-05, + "loss": 0.4891, + "step": 2073 + }, + { + "epoch": 0.16430976430976432, + "grad_norm": 1.8073878554775773, + "learning_rate": 1.9068943228227695e-05, + "loss": 0.3693, + "step": 2074 + }, + { + "epoch": 0.16438898791839968, + "grad_norm": 2.0353060672381114, + "learning_rate": 1.9067861711291744e-05, + "loss": 0.404, + "step": 2075 + }, + { + "epoch": 0.16446821152703506, + "grad_norm": 1.8381303585687634, + "learning_rate": 1.906677959728073e-05, + "loss": 0.434, + "step": 2076 + }, + { + "epoch": 0.16454743513567044, + "grad_norm": 1.7978064015330337, + "learning_rate": 1.9065696886265906e-05, + "loss": 0.4621, + "step": 2077 + }, + { + "epoch": 0.1646266587443058, + "grad_norm": 1.9251036960685879, + "learning_rate": 1.9064613578318564e-05, + "loss": 0.334, + "step": 2078 + }, + { + "epoch": 0.16470588235294117, + "grad_norm": 1.734373021904076, + "learning_rate": 1.9063529673510036e-05, + "loss": 0.299, + "step": 2079 + }, + { + "epoch": 0.16478510596157656, + "grad_norm": 1.9358714590221786, + "learning_rate": 1.9062445171911688e-05, + "loss": 0.469, + "step": 2080 + }, + { + "epoch": 0.1648643295702119, + "grad_norm": 1.9175926311797722, + "learning_rate": 1.9061360073594933e-05, + "loss": 0.4203, + "step": 2081 + }, + { + "epoch": 0.1649435531788473, + "grad_norm": 2.081708246257512, + "learning_rate": 1.9060274378631215e-05, + "loss": 0.4775, + "step": 2082 + }, + { + "epoch": 0.16502277678748267, + "grad_norm": 2.3725404126742777, + "learning_rate": 1.9059188087092025e-05, + "loss": 0.5039, + "step": 2083 + }, + { + "epoch": 0.16510200039611805, + "grad_norm": 2.1449702764161676, + "learning_rate": 1.905810119904889e-05, + "loss": 0.3308, + "step": 2084 + }, + { + "epoch": 0.1651812240047534, + "grad_norm": 1.9393740275276423, + "learning_rate": 1.9057013714573375e-05, + "loss": 0.4446, + "step": 2085 + }, + { + "epoch": 0.1652604476133888, + "grad_norm": 2.0224354747188973, + "learning_rate": 1.9055925633737088e-05, + "loss": 0.453, + "step": 2086 + }, + { + "epoch": 0.16533967122202417, + "grad_norm": 2.0719452740776876, + "learning_rate": 1.905483695661167e-05, + "loss": 0.458, + "step": 2087 + }, + { + "epoch": 0.16541889483065952, + "grad_norm": 1.7243595554861604, + "learning_rate": 1.905374768326881e-05, + "loss": 0.4261, + "step": 2088 + }, + { + "epoch": 0.1654981184392949, + "grad_norm": 2.1132599060514368, + "learning_rate": 1.9052657813780226e-05, + "loss": 0.5107, + "step": 2089 + }, + { + "epoch": 0.1655773420479303, + "grad_norm": 1.764948832320103, + "learning_rate": 1.9051567348217686e-05, + "loss": 0.3729, + "step": 2090 + }, + { + "epoch": 0.16565656565656567, + "grad_norm": 1.7770507985614987, + "learning_rate": 1.905047628665299e-05, + "loss": 0.3085, + "step": 2091 + }, + { + "epoch": 0.16573578926520102, + "grad_norm": 1.9481151339485177, + "learning_rate": 1.9049384629157974e-05, + "loss": 0.4357, + "step": 2092 + }, + { + "epoch": 0.1658150128738364, + "grad_norm": 2.08903763187559, + "learning_rate": 1.9048292375804527e-05, + "loss": 0.5798, + "step": 2093 + }, + { + "epoch": 0.16589423648247179, + "grad_norm": 1.912963993717447, + "learning_rate": 1.9047199526664565e-05, + "loss": 0.5083, + "step": 2094 + }, + { + "epoch": 0.16597346009110714, + "grad_norm": 1.8950488329303423, + "learning_rate": 1.9046106081810047e-05, + "loss": 0.4688, + "step": 2095 + }, + { + "epoch": 0.16605268369974252, + "grad_norm": 1.8322721980137766, + "learning_rate": 1.9045012041312966e-05, + "loss": 0.4893, + "step": 2096 + }, + { + "epoch": 0.1661319073083779, + "grad_norm": 1.6160660139896603, + "learning_rate": 1.904391740524537e-05, + "loss": 0.3787, + "step": 2097 + }, + { + "epoch": 0.16621113091701328, + "grad_norm": 1.748887779213861, + "learning_rate": 1.9042822173679325e-05, + "loss": 0.4242, + "step": 2098 + }, + { + "epoch": 0.16629035452564864, + "grad_norm": 1.6657150099368654, + "learning_rate": 1.9041726346686952e-05, + "loss": 0.3845, + "step": 2099 + }, + { + "epoch": 0.16636957813428402, + "grad_norm": 2.080950012323449, + "learning_rate": 1.9040629924340406e-05, + "loss": 0.4616, + "step": 2100 + }, + { + "epoch": 0.1664488017429194, + "grad_norm": 1.944120818298703, + "learning_rate": 1.903953290671188e-05, + "loss": 0.5048, + "step": 2101 + }, + { + "epoch": 0.16652802535155475, + "grad_norm": 1.9155882044296995, + "learning_rate": 1.903843529387361e-05, + "loss": 0.4695, + "step": 2102 + }, + { + "epoch": 0.16660724896019014, + "grad_norm": 1.7618387395184265, + "learning_rate": 1.903733708589786e-05, + "loss": 0.4762, + "step": 2103 + }, + { + "epoch": 0.16668647256882552, + "grad_norm": 2.220991776497542, + "learning_rate": 1.9036238282856952e-05, + "loss": 0.6267, + "step": 2104 + }, + { + "epoch": 0.16676569617746087, + "grad_norm": 1.9177135363531168, + "learning_rate": 1.903513888482323e-05, + "loss": 0.4821, + "step": 2105 + }, + { + "epoch": 0.16684491978609625, + "grad_norm": 1.530413558382587, + "learning_rate": 1.903403889186909e-05, + "loss": 0.3674, + "step": 2106 + }, + { + "epoch": 0.16692414339473163, + "grad_norm": 1.9414858457253144, + "learning_rate": 1.903293830406696e-05, + "loss": 0.5525, + "step": 2107 + }, + { + "epoch": 0.16700336700336701, + "grad_norm": 2.0608747651108983, + "learning_rate": 1.9031837121489303e-05, + "loss": 0.518, + "step": 2108 + }, + { + "epoch": 0.16708259061200237, + "grad_norm": 1.7072264140131648, + "learning_rate": 1.903073534420863e-05, + "loss": 0.4484, + "step": 2109 + }, + { + "epoch": 0.16716181422063775, + "grad_norm": 1.6015691891846717, + "learning_rate": 1.9029632972297488e-05, + "loss": 0.3609, + "step": 2110 + }, + { + "epoch": 0.16724103782927313, + "grad_norm": 1.8441132771224842, + "learning_rate": 1.9028530005828462e-05, + "loss": 0.4383, + "step": 2111 + }, + { + "epoch": 0.16732026143790849, + "grad_norm": 1.9824977295494668, + "learning_rate": 1.9027426444874177e-05, + "loss": 0.5459, + "step": 2112 + }, + { + "epoch": 0.16739948504654387, + "grad_norm": 1.7882499964851173, + "learning_rate": 1.90263222895073e-05, + "loss": 0.3986, + "step": 2113 + }, + { + "epoch": 0.16747870865517925, + "grad_norm": 2.0990733023350563, + "learning_rate": 1.902521753980053e-05, + "loss": 0.5134, + "step": 2114 + }, + { + "epoch": 0.16755793226381463, + "grad_norm": 2.3701923845142927, + "learning_rate": 1.9024112195826614e-05, + "loss": 0.465, + "step": 2115 + }, + { + "epoch": 0.16763715587244998, + "grad_norm": 1.8813385397244755, + "learning_rate": 1.902300625765833e-05, + "loss": 0.3947, + "step": 2116 + }, + { + "epoch": 0.16771637948108536, + "grad_norm": 1.8744528326878467, + "learning_rate": 1.9021899725368498e-05, + "loss": 0.346, + "step": 2117 + }, + { + "epoch": 0.16779560308972075, + "grad_norm": 2.2864476645229317, + "learning_rate": 1.902079259902998e-05, + "loss": 0.4422, + "step": 2118 + }, + { + "epoch": 0.1678748266983561, + "grad_norm": 1.7235774467986877, + "learning_rate": 1.901968487871568e-05, + "loss": 0.3719, + "step": 2119 + }, + { + "epoch": 0.16795405030699148, + "grad_norm": 1.7635146709495053, + "learning_rate": 1.9018576564498527e-05, + "loss": 0.4148, + "step": 2120 + }, + { + "epoch": 0.16803327391562686, + "grad_norm": 2.3921189681760775, + "learning_rate": 1.9017467656451498e-05, + "loss": 0.5032, + "step": 2121 + }, + { + "epoch": 0.16811249752426222, + "grad_norm": 1.8731702615933892, + "learning_rate": 1.9016358154647618e-05, + "loss": 0.4955, + "step": 2122 + }, + { + "epoch": 0.1681917211328976, + "grad_norm": 2.0615634278881028, + "learning_rate": 1.9015248059159937e-05, + "loss": 0.4585, + "step": 2123 + }, + { + "epoch": 0.16827094474153298, + "grad_norm": 2.104829927777596, + "learning_rate": 1.901413737006155e-05, + "loss": 0.4737, + "step": 2124 + }, + { + "epoch": 0.16835016835016836, + "grad_norm": 2.1271160020437585, + "learning_rate": 1.901302608742559e-05, + "loss": 0.4759, + "step": 2125 + }, + { + "epoch": 0.16842939195880371, + "grad_norm": 1.9955275056730297, + "learning_rate": 1.9011914211325225e-05, + "loss": 0.4339, + "step": 2126 + }, + { + "epoch": 0.1685086155674391, + "grad_norm": 2.0453468189057378, + "learning_rate": 1.9010801741833678e-05, + "loss": 0.3957, + "step": 2127 + }, + { + "epoch": 0.16858783917607448, + "grad_norm": 2.0515103807351527, + "learning_rate": 1.900968867902419e-05, + "loss": 0.4234, + "step": 2128 + }, + { + "epoch": 0.16866706278470983, + "grad_norm": 1.8188145322226423, + "learning_rate": 1.900857502297006e-05, + "loss": 0.3967, + "step": 2129 + }, + { + "epoch": 0.1687462863933452, + "grad_norm": 1.6357454774535003, + "learning_rate": 1.9007460773744605e-05, + "loss": 0.3148, + "step": 2130 + }, + { + "epoch": 0.1688255100019806, + "grad_norm": 1.672921216228675, + "learning_rate": 1.90063459314212e-05, + "loss": 0.3223, + "step": 2131 + }, + { + "epoch": 0.16890473361061598, + "grad_norm": 1.7180600630608942, + "learning_rate": 1.9005230496073256e-05, + "loss": 0.3086, + "step": 2132 + }, + { + "epoch": 0.16898395721925133, + "grad_norm": 1.8721917325562165, + "learning_rate": 1.900411446777421e-05, + "loss": 0.4684, + "step": 2133 + }, + { + "epoch": 0.1690631808278867, + "grad_norm": 1.645104471147386, + "learning_rate": 1.900299784659755e-05, + "loss": 0.388, + "step": 2134 + }, + { + "epoch": 0.1691424044365221, + "grad_norm": 2.22111790445004, + "learning_rate": 1.9001880632616806e-05, + "loss": 0.5104, + "step": 2135 + }, + { + "epoch": 0.16922162804515745, + "grad_norm": 1.8280414426193994, + "learning_rate": 1.9000762825905535e-05, + "loss": 0.4788, + "step": 2136 + }, + { + "epoch": 0.16930085165379283, + "grad_norm": 1.7297646239170181, + "learning_rate": 1.899964442653734e-05, + "loss": 0.3546, + "step": 2137 + }, + { + "epoch": 0.1693800752624282, + "grad_norm": 1.8872502511626468, + "learning_rate": 1.8998525434585862e-05, + "loss": 0.4687, + "step": 2138 + }, + { + "epoch": 0.1694592988710636, + "grad_norm": 1.6529560957691534, + "learning_rate": 1.8997405850124786e-05, + "loss": 0.3574, + "step": 2139 + }, + { + "epoch": 0.16953852247969894, + "grad_norm": 1.8860667886597298, + "learning_rate": 1.8996285673227826e-05, + "loss": 0.4449, + "step": 2140 + }, + { + "epoch": 0.16961774608833433, + "grad_norm": 1.9445173946011252, + "learning_rate": 1.899516490396874e-05, + "loss": 0.5066, + "step": 2141 + }, + { + "epoch": 0.1696969696969697, + "grad_norm": 1.6820637648159442, + "learning_rate": 1.8994043542421328e-05, + "loss": 0.4024, + "step": 2142 + }, + { + "epoch": 0.16977619330560506, + "grad_norm": 1.9563454784742638, + "learning_rate": 1.8992921588659424e-05, + "loss": 0.4198, + "step": 2143 + }, + { + "epoch": 0.16985541691424044, + "grad_norm": 1.9033491209101583, + "learning_rate": 1.8991799042756906e-05, + "loss": 0.3132, + "step": 2144 + }, + { + "epoch": 0.16993464052287582, + "grad_norm": 1.8633506564375288, + "learning_rate": 1.8990675904787688e-05, + "loss": 0.3507, + "step": 2145 + }, + { + "epoch": 0.17001386413151118, + "grad_norm": 2.146509904970231, + "learning_rate": 1.898955217482572e-05, + "loss": 0.4511, + "step": 2146 + }, + { + "epoch": 0.17009308774014656, + "grad_norm": 1.8306317274246138, + "learning_rate": 1.8988427852944997e-05, + "loss": 0.3992, + "step": 2147 + }, + { + "epoch": 0.17017231134878194, + "grad_norm": 1.9880179087695329, + "learning_rate": 1.898730293921955e-05, + "loss": 0.4553, + "step": 2148 + }, + { + "epoch": 0.17025153495741732, + "grad_norm": 2.448597104416439, + "learning_rate": 1.8986177433723446e-05, + "loss": 0.4902, + "step": 2149 + }, + { + "epoch": 0.17033075856605268, + "grad_norm": 1.9976358383312918, + "learning_rate": 1.89850513365308e-05, + "loss": 0.4545, + "step": 2150 + }, + { + "epoch": 0.17040998217468806, + "grad_norm": 2.202723894983435, + "learning_rate": 1.8983924647715756e-05, + "loss": 0.513, + "step": 2151 + }, + { + "epoch": 0.17048920578332344, + "grad_norm": 1.70808889054141, + "learning_rate": 1.89827973673525e-05, + "loss": 0.3365, + "step": 2152 + }, + { + "epoch": 0.1705684293919588, + "grad_norm": 2.098168455340224, + "learning_rate": 1.8981669495515264e-05, + "loss": 0.5016, + "step": 2153 + }, + { + "epoch": 0.17064765300059417, + "grad_norm": 1.9088495150572768, + "learning_rate": 1.8980541032278302e-05, + "loss": 0.5515, + "step": 2154 + }, + { + "epoch": 0.17072687660922956, + "grad_norm": 1.9722023800135497, + "learning_rate": 1.8979411977715928e-05, + "loss": 0.4859, + "step": 2155 + }, + { + "epoch": 0.17080610021786494, + "grad_norm": 1.8623219955418344, + "learning_rate": 1.8978282331902483e-05, + "loss": 0.3516, + "step": 2156 + }, + { + "epoch": 0.1708853238265003, + "grad_norm": 2.1371829738240833, + "learning_rate": 1.8977152094912346e-05, + "loss": 0.504, + "step": 2157 + }, + { + "epoch": 0.17096454743513567, + "grad_norm": 1.843012693582406, + "learning_rate": 1.897602126681994e-05, + "loss": 0.3696, + "step": 2158 + }, + { + "epoch": 0.17104377104377105, + "grad_norm": 1.926789243703306, + "learning_rate": 1.897488984769972e-05, + "loss": 0.3904, + "step": 2159 + }, + { + "epoch": 0.1711229946524064, + "grad_norm": 2.0824324285525537, + "learning_rate": 1.8973757837626193e-05, + "loss": 0.3856, + "step": 2160 + }, + { + "epoch": 0.1712022182610418, + "grad_norm": 1.8621294104250126, + "learning_rate": 1.8972625236673887e-05, + "loss": 0.3375, + "step": 2161 + }, + { + "epoch": 0.17128144186967717, + "grad_norm": 1.737299237088058, + "learning_rate": 1.8971492044917386e-05, + "loss": 0.4387, + "step": 2162 + }, + { + "epoch": 0.17136066547831252, + "grad_norm": 1.989813285249737, + "learning_rate": 1.8970358262431297e-05, + "loss": 0.5958, + "step": 2163 + }, + { + "epoch": 0.1714398890869479, + "grad_norm": 2.022091188701207, + "learning_rate": 1.8969223889290283e-05, + "loss": 0.5144, + "step": 2164 + }, + { + "epoch": 0.1715191126955833, + "grad_norm": 2.388596662473868, + "learning_rate": 1.8968088925569032e-05, + "loss": 0.5097, + "step": 2165 + }, + { + "epoch": 0.17159833630421867, + "grad_norm": 1.8249422246841323, + "learning_rate": 1.896695337134228e-05, + "loss": 0.3813, + "step": 2166 + }, + { + "epoch": 0.17167755991285402, + "grad_norm": 1.402784176995135, + "learning_rate": 1.8965817226684794e-05, + "loss": 0.3669, + "step": 2167 + }, + { + "epoch": 0.1717567835214894, + "grad_norm": 2.0367399975948417, + "learning_rate": 1.896468049167138e-05, + "loss": 0.3647, + "step": 2168 + }, + { + "epoch": 0.17183600713012478, + "grad_norm": 1.4095077651953454, + "learning_rate": 1.896354316637689e-05, + "loss": 0.3241, + "step": 2169 + }, + { + "epoch": 0.17191523073876014, + "grad_norm": 1.8942162604065602, + "learning_rate": 1.8962405250876218e-05, + "loss": 0.3848, + "step": 2170 + }, + { + "epoch": 0.17199445434739552, + "grad_norm": 1.7726567893062553, + "learning_rate": 1.896126674524428e-05, + "loss": 0.3696, + "step": 2171 + }, + { + "epoch": 0.1720736779560309, + "grad_norm": 1.6502099044234704, + "learning_rate": 1.896012764955605e-05, + "loss": 0.3754, + "step": 2172 + }, + { + "epoch": 0.17215290156466628, + "grad_norm": 1.864638018255017, + "learning_rate": 1.8958987963886526e-05, + "loss": 0.3498, + "step": 2173 + }, + { + "epoch": 0.17223212517330164, + "grad_norm": 1.783206741932751, + "learning_rate": 1.8957847688310752e-05, + "loss": 0.4461, + "step": 2174 + }, + { + "epoch": 0.17231134878193702, + "grad_norm": 2.047774975163439, + "learning_rate": 1.8956706822903812e-05, + "loss": 0.3177, + "step": 2175 + }, + { + "epoch": 0.1723905723905724, + "grad_norm": 2.035527189904597, + "learning_rate": 1.8955565367740824e-05, + "loss": 0.4246, + "step": 2176 + }, + { + "epoch": 0.17246979599920775, + "grad_norm": 1.7519343645244982, + "learning_rate": 1.8954423322896944e-05, + "loss": 0.3945, + "step": 2177 + }, + { + "epoch": 0.17254901960784313, + "grad_norm": 1.831292065828642, + "learning_rate": 1.895328068844738e-05, + "loss": 0.4323, + "step": 2178 + }, + { + "epoch": 0.17262824321647852, + "grad_norm": 1.8576091695913817, + "learning_rate": 1.8952137464467358e-05, + "loss": 0.4357, + "step": 2179 + }, + { + "epoch": 0.1727074668251139, + "grad_norm": 1.9984153169578844, + "learning_rate": 1.895099365103216e-05, + "loss": 0.3837, + "step": 2180 + }, + { + "epoch": 0.17278669043374925, + "grad_norm": 1.7284041636644616, + "learning_rate": 1.89498492482171e-05, + "loss": 0.3989, + "step": 2181 + }, + { + "epoch": 0.17286591404238463, + "grad_norm": 2.022492091073739, + "learning_rate": 1.8948704256097533e-05, + "loss": 0.483, + "step": 2182 + }, + { + "epoch": 0.17294513765102001, + "grad_norm": 1.6676594627685584, + "learning_rate": 1.8947558674748844e-05, + "loss": 0.3916, + "step": 2183 + }, + { + "epoch": 0.17302436125965537, + "grad_norm": 1.8786959682061501, + "learning_rate": 1.8946412504246474e-05, + "loss": 0.3982, + "step": 2184 + }, + { + "epoch": 0.17310358486829075, + "grad_norm": 1.7347395440781879, + "learning_rate": 1.8945265744665886e-05, + "loss": 0.4129, + "step": 2185 + }, + { + "epoch": 0.17318280847692613, + "grad_norm": 1.839906345040177, + "learning_rate": 1.8944118396082594e-05, + "loss": 0.4373, + "step": 2186 + }, + { + "epoch": 0.17326203208556148, + "grad_norm": 1.8808715706556598, + "learning_rate": 1.8942970458572138e-05, + "loss": 0.3705, + "step": 2187 + }, + { + "epoch": 0.17334125569419687, + "grad_norm": 1.465523771651254, + "learning_rate": 1.894182193221011e-05, + "loss": 0.3839, + "step": 2188 + }, + { + "epoch": 0.17342047930283225, + "grad_norm": 2.045584338401776, + "learning_rate": 1.894067281707213e-05, + "loss": 0.438, + "step": 2189 + }, + { + "epoch": 0.17349970291146763, + "grad_norm": 1.7244714163141794, + "learning_rate": 1.893952311323387e-05, + "loss": 0.3122, + "step": 2190 + }, + { + "epoch": 0.17357892652010298, + "grad_norm": 2.087892759180738, + "learning_rate": 1.8938372820771024e-05, + "loss": 0.4992, + "step": 2191 + }, + { + "epoch": 0.17365815012873836, + "grad_norm": 1.5864066017656742, + "learning_rate": 1.8937221939759334e-05, + "loss": 0.3433, + "step": 2192 + }, + { + "epoch": 0.17373737373737375, + "grad_norm": 1.810884640645776, + "learning_rate": 1.8936070470274587e-05, + "loss": 0.3559, + "step": 2193 + }, + { + "epoch": 0.1738165973460091, + "grad_norm": 2.034336914336385, + "learning_rate": 1.8934918412392596e-05, + "loss": 0.4277, + "step": 2194 + }, + { + "epoch": 0.17389582095464448, + "grad_norm": 2.170366699440581, + "learning_rate": 1.893376576618922e-05, + "loss": 0.4649, + "step": 2195 + }, + { + "epoch": 0.17397504456327986, + "grad_norm": 2.1226685532874807, + "learning_rate": 1.8932612531740354e-05, + "loss": 0.4181, + "step": 2196 + }, + { + "epoch": 0.17405426817191524, + "grad_norm": 1.9851458292958541, + "learning_rate": 1.893145870912193e-05, + "loss": 0.3979, + "step": 2197 + }, + { + "epoch": 0.1741334917805506, + "grad_norm": 1.853482817886539, + "learning_rate": 1.8930304298409933e-05, + "loss": 0.3808, + "step": 2198 + }, + { + "epoch": 0.17421271538918598, + "grad_norm": 1.6820010225809787, + "learning_rate": 1.8929149299680364e-05, + "loss": 0.3381, + "step": 2199 + }, + { + "epoch": 0.17429193899782136, + "grad_norm": 1.7140596998394408, + "learning_rate": 1.8927993713009275e-05, + "loss": 0.3823, + "step": 2200 + }, + { + "epoch": 0.17437116260645671, + "grad_norm": 1.8606979503810697, + "learning_rate": 1.892683753847276e-05, + "loss": 0.407, + "step": 2201 + }, + { + "epoch": 0.1744503862150921, + "grad_norm": 1.7655200320709468, + "learning_rate": 1.892568077614695e-05, + "loss": 0.3668, + "step": 2202 + }, + { + "epoch": 0.17452960982372748, + "grad_norm": 2.07433274871448, + "learning_rate": 1.892452342610801e-05, + "loss": 0.5242, + "step": 2203 + }, + { + "epoch": 0.17460883343236283, + "grad_norm": 2.1122841979181217, + "learning_rate": 1.892336548843214e-05, + "loss": 0.4286, + "step": 2204 + }, + { + "epoch": 0.1746880570409982, + "grad_norm": 1.6973340263343, + "learning_rate": 1.892220696319559e-05, + "loss": 0.347, + "step": 2205 + }, + { + "epoch": 0.1747672806496336, + "grad_norm": 2.055431202364881, + "learning_rate": 1.8921047850474645e-05, + "loss": 0.4514, + "step": 2206 + }, + { + "epoch": 0.17484650425826898, + "grad_norm": 1.7403543808111084, + "learning_rate": 1.891988815034562e-05, + "loss": 0.3488, + "step": 2207 + }, + { + "epoch": 0.17492572786690433, + "grad_norm": 2.1790220862502063, + "learning_rate": 1.891872786288488e-05, + "loss": 0.6366, + "step": 2208 + }, + { + "epoch": 0.1750049514755397, + "grad_norm": 1.71837675448399, + "learning_rate": 1.8917566988168826e-05, + "loss": 0.3661, + "step": 2209 + }, + { + "epoch": 0.1750841750841751, + "grad_norm": 1.6733013712942197, + "learning_rate": 1.8916405526273894e-05, + "loss": 0.4033, + "step": 2210 + }, + { + "epoch": 0.17516339869281045, + "grad_norm": 1.9810764788718538, + "learning_rate": 1.8915243477276563e-05, + "loss": 0.5247, + "step": 2211 + }, + { + "epoch": 0.17524262230144583, + "grad_norm": 1.623001545856307, + "learning_rate": 1.8914080841253348e-05, + "loss": 0.3908, + "step": 2212 + }, + { + "epoch": 0.1753218459100812, + "grad_norm": 2.406920649554192, + "learning_rate": 1.8912917618280796e-05, + "loss": 0.5585, + "step": 2213 + }, + { + "epoch": 0.1754010695187166, + "grad_norm": 2.0108397831424187, + "learning_rate": 1.8911753808435508e-05, + "loss": 0.4854, + "step": 2214 + }, + { + "epoch": 0.17548029312735194, + "grad_norm": 1.7907118992550852, + "learning_rate": 1.891058941179411e-05, + "loss": 0.3247, + "step": 2215 + }, + { + "epoch": 0.17555951673598733, + "grad_norm": 1.5781424876269412, + "learning_rate": 1.8909424428433278e-05, + "loss": 0.4004, + "step": 2216 + }, + { + "epoch": 0.1756387403446227, + "grad_norm": 1.7942504313275773, + "learning_rate": 1.8908258858429716e-05, + "loss": 0.3783, + "step": 2217 + }, + { + "epoch": 0.17571796395325806, + "grad_norm": 2.010130021109717, + "learning_rate": 1.890709270186017e-05, + "loss": 0.4597, + "step": 2218 + }, + { + "epoch": 0.17579718756189344, + "grad_norm": 1.7787478256826252, + "learning_rate": 1.890592595880143e-05, + "loss": 0.3464, + "step": 2219 + }, + { + "epoch": 0.17587641117052882, + "grad_norm": 1.8029842841402366, + "learning_rate": 1.890475862933032e-05, + "loss": 0.5667, + "step": 2220 + }, + { + "epoch": 0.17595563477916418, + "grad_norm": 1.7161573015522174, + "learning_rate": 1.8903590713523698e-05, + "loss": 0.3594, + "step": 2221 + }, + { + "epoch": 0.17603485838779956, + "grad_norm": 1.4311308529672409, + "learning_rate": 1.8902422211458466e-05, + "loss": 0.3366, + "step": 2222 + }, + { + "epoch": 0.17611408199643494, + "grad_norm": 1.6691948159474395, + "learning_rate": 1.890125312321157e-05, + "loss": 0.4429, + "step": 2223 + }, + { + "epoch": 0.17619330560507032, + "grad_norm": 1.8912113260284953, + "learning_rate": 1.8900083448859986e-05, + "loss": 0.3895, + "step": 2224 + }, + { + "epoch": 0.17627252921370568, + "grad_norm": 1.64699710502356, + "learning_rate": 1.8898913188480733e-05, + "loss": 0.3213, + "step": 2225 + }, + { + "epoch": 0.17635175282234106, + "grad_norm": 2.556629372696625, + "learning_rate": 1.8897742342150863e-05, + "loss": 0.5572, + "step": 2226 + }, + { + "epoch": 0.17643097643097644, + "grad_norm": 2.045487012558576, + "learning_rate": 1.8896570909947477e-05, + "loss": 0.539, + "step": 2227 + }, + { + "epoch": 0.1765102000396118, + "grad_norm": 2.0179074922465023, + "learning_rate": 1.88953988919477e-05, + "loss": 0.3821, + "step": 2228 + }, + { + "epoch": 0.17658942364824717, + "grad_norm": 1.8211579487525722, + "learning_rate": 1.8894226288228707e-05, + "loss": 0.4121, + "step": 2229 + }, + { + "epoch": 0.17666864725688255, + "grad_norm": 2.615598291706729, + "learning_rate": 1.8893053098867714e-05, + "loss": 0.3657, + "step": 2230 + }, + { + "epoch": 0.17674787086551794, + "grad_norm": 1.9369807536093724, + "learning_rate": 1.889187932394196e-05, + "loss": 0.5303, + "step": 2231 + }, + { + "epoch": 0.1768270944741533, + "grad_norm": 1.6452689917937342, + "learning_rate": 1.889070496352874e-05, + "loss": 0.4002, + "step": 2232 + }, + { + "epoch": 0.17690631808278867, + "grad_norm": 1.8744234724362385, + "learning_rate": 1.888953001770538e-05, + "loss": 0.4721, + "step": 2233 + }, + { + "epoch": 0.17698554169142405, + "grad_norm": 2.2046879278370866, + "learning_rate": 1.8888354486549238e-05, + "loss": 0.4631, + "step": 2234 + }, + { + "epoch": 0.1770647653000594, + "grad_norm": 1.8778360823220894, + "learning_rate": 1.888717837013772e-05, + "loss": 0.3896, + "step": 2235 + }, + { + "epoch": 0.1771439889086948, + "grad_norm": 1.8536637664604478, + "learning_rate": 1.8886001668548273e-05, + "loss": 0.4214, + "step": 2236 + }, + { + "epoch": 0.17722321251733017, + "grad_norm": 1.6890399945181858, + "learning_rate": 1.8884824381858368e-05, + "loss": 0.4159, + "step": 2237 + }, + { + "epoch": 0.17730243612596555, + "grad_norm": 1.7342792892104857, + "learning_rate": 1.888364651014553e-05, + "loss": 0.3457, + "step": 2238 + }, + { + "epoch": 0.1773816597346009, + "grad_norm": 1.7625808294094478, + "learning_rate": 1.888246805348732e-05, + "loss": 0.3966, + "step": 2239 + }, + { + "epoch": 0.1774608833432363, + "grad_norm": 1.7118432210807184, + "learning_rate": 1.8881289011961323e-05, + "loss": 0.4413, + "step": 2240 + }, + { + "epoch": 0.17754010695187167, + "grad_norm": 1.838074358096047, + "learning_rate": 1.8880109385645184e-05, + "loss": 0.4436, + "step": 2241 + }, + { + "epoch": 0.17761933056050702, + "grad_norm": 1.633478721072027, + "learning_rate": 1.8878929174616566e-05, + "loss": 0.3417, + "step": 2242 + }, + { + "epoch": 0.1776985541691424, + "grad_norm": 1.9819808143150355, + "learning_rate": 1.887774837895318e-05, + "loss": 0.3732, + "step": 2243 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 1.6699514643121105, + "learning_rate": 1.887656699873279e-05, + "loss": 0.5105, + "step": 2244 + }, + { + "epoch": 0.17785700138641314, + "grad_norm": 1.7868109276929764, + "learning_rate": 1.887538503403317e-05, + "loss": 0.4515, + "step": 2245 + }, + { + "epoch": 0.17793622499504852, + "grad_norm": 2.029234407698369, + "learning_rate": 1.8874202484932148e-05, + "loss": 0.6043, + "step": 2246 + }, + { + "epoch": 0.1780154486036839, + "grad_norm": 1.9147865702362916, + "learning_rate": 1.8873019351507596e-05, + "loss": 0.501, + "step": 2247 + }, + { + "epoch": 0.17809467221231928, + "grad_norm": 1.8624715434446462, + "learning_rate": 1.887183563383741e-05, + "loss": 0.4854, + "step": 2248 + }, + { + "epoch": 0.17817389582095464, + "grad_norm": 1.556742610453188, + "learning_rate": 1.8870651331999542e-05, + "loss": 0.3313, + "step": 2249 + }, + { + "epoch": 0.17825311942959002, + "grad_norm": 1.841256103307755, + "learning_rate": 1.886946644607196e-05, + "loss": 0.3702, + "step": 2250 + }, + { + "epoch": 0.1783323430382254, + "grad_norm": 1.6772844439285954, + "learning_rate": 1.8868280976132697e-05, + "loss": 0.361, + "step": 2251 + }, + { + "epoch": 0.17841156664686075, + "grad_norm": 1.6568975210615398, + "learning_rate": 1.8867094922259798e-05, + "loss": 0.3319, + "step": 2252 + }, + { + "epoch": 0.17849079025549613, + "grad_norm": 1.5713953858385998, + "learning_rate": 1.8865908284531368e-05, + "loss": 0.3441, + "step": 2253 + }, + { + "epoch": 0.17857001386413152, + "grad_norm": 1.8025363517663684, + "learning_rate": 1.8864721063025536e-05, + "loss": 0.4084, + "step": 2254 + }, + { + "epoch": 0.1786492374727669, + "grad_norm": 1.7126671880469564, + "learning_rate": 1.8863533257820475e-05, + "loss": 0.3185, + "step": 2255 + }, + { + "epoch": 0.17872846108140225, + "grad_norm": 1.7851946841043358, + "learning_rate": 1.8862344868994395e-05, + "loss": 0.4637, + "step": 2256 + }, + { + "epoch": 0.17880768469003763, + "grad_norm": 1.5654988003945565, + "learning_rate": 1.8861155896625553e-05, + "loss": 0.3847, + "step": 2257 + }, + { + "epoch": 0.17888690829867301, + "grad_norm": 1.9567725757796477, + "learning_rate": 1.885996634079223e-05, + "loss": 0.398, + "step": 2258 + }, + { + "epoch": 0.17896613190730837, + "grad_norm": 1.6956798680506862, + "learning_rate": 1.8858776201572758e-05, + "loss": 0.4264, + "step": 2259 + }, + { + "epoch": 0.17904535551594375, + "grad_norm": 1.6010016102637785, + "learning_rate": 1.8857585479045493e-05, + "loss": 0.2754, + "step": 2260 + }, + { + "epoch": 0.17912457912457913, + "grad_norm": 2.4283995901520323, + "learning_rate": 1.8856394173288848e-05, + "loss": 0.5484, + "step": 2261 + }, + { + "epoch": 0.17920380273321448, + "grad_norm": 1.8478325552885637, + "learning_rate": 1.8855202284381264e-05, + "loss": 0.4718, + "step": 2262 + }, + { + "epoch": 0.17928302634184987, + "grad_norm": 1.6448562805599607, + "learning_rate": 1.8854009812401213e-05, + "loss": 0.355, + "step": 2263 + }, + { + "epoch": 0.17936224995048525, + "grad_norm": 1.7881800379246628, + "learning_rate": 1.885281675742722e-05, + "loss": 0.4924, + "step": 2264 + }, + { + "epoch": 0.17944147355912063, + "grad_norm": 1.4895489922092775, + "learning_rate": 1.885162311953784e-05, + "loss": 0.3156, + "step": 2265 + }, + { + "epoch": 0.17952069716775598, + "grad_norm": 1.5722179443363025, + "learning_rate": 1.885042889881167e-05, + "loss": 0.3398, + "step": 2266 + }, + { + "epoch": 0.17959992077639136, + "grad_norm": 2.073310183346257, + "learning_rate": 1.8849234095327343e-05, + "loss": 0.5563, + "step": 2267 + }, + { + "epoch": 0.17967914438502675, + "grad_norm": 2.7623455888483717, + "learning_rate": 1.884803870916353e-05, + "loss": 0.4301, + "step": 2268 + }, + { + "epoch": 0.1797583679936621, + "grad_norm": 1.8925565254847532, + "learning_rate": 1.884684274039894e-05, + "loss": 0.4399, + "step": 2269 + }, + { + "epoch": 0.17983759160229748, + "grad_norm": 2.0557675513682216, + "learning_rate": 1.8845646189112327e-05, + "loss": 0.4875, + "step": 2270 + }, + { + "epoch": 0.17991681521093286, + "grad_norm": 1.898396110131812, + "learning_rate": 1.8844449055382473e-05, + "loss": 0.3867, + "step": 2271 + }, + { + "epoch": 0.17999603881956824, + "grad_norm": 1.7200747344175484, + "learning_rate": 1.8843251339288207e-05, + "loss": 0.3697, + "step": 2272 + }, + { + "epoch": 0.1800752624282036, + "grad_norm": 1.7646438533908584, + "learning_rate": 1.884205304090839e-05, + "loss": 0.4222, + "step": 2273 + }, + { + "epoch": 0.18015448603683898, + "grad_norm": 2.07752988484997, + "learning_rate": 1.8840854160321926e-05, + "loss": 0.309, + "step": 2274 + }, + { + "epoch": 0.18023370964547436, + "grad_norm": 1.9290459038600012, + "learning_rate": 1.8839654697607756e-05, + "loss": 0.3583, + "step": 2275 + }, + { + "epoch": 0.18031293325410971, + "grad_norm": 1.9685104820992123, + "learning_rate": 1.8838454652844857e-05, + "loss": 0.3555, + "step": 2276 + }, + { + "epoch": 0.1803921568627451, + "grad_norm": 1.53254060447503, + "learning_rate": 1.8837254026112245e-05, + "loss": 0.2805, + "step": 2277 + }, + { + "epoch": 0.18047138047138048, + "grad_norm": 1.4558650278677392, + "learning_rate": 1.883605281748898e-05, + "loss": 0.3279, + "step": 2278 + }, + { + "epoch": 0.18055060408001586, + "grad_norm": 1.4837920476190676, + "learning_rate": 1.8834851027054152e-05, + "loss": 0.3208, + "step": 2279 + }, + { + "epoch": 0.1806298276886512, + "grad_norm": 1.7923882954292831, + "learning_rate": 1.8833648654886898e-05, + "loss": 0.5319, + "step": 2280 + }, + { + "epoch": 0.1807090512972866, + "grad_norm": 1.6702166683689448, + "learning_rate": 1.883244570106638e-05, + "loss": 0.3297, + "step": 2281 + }, + { + "epoch": 0.18078827490592198, + "grad_norm": 1.6943542703288907, + "learning_rate": 1.8831242165671816e-05, + "loss": 0.3677, + "step": 2282 + }, + { + "epoch": 0.18086749851455733, + "grad_norm": 1.8095776228294747, + "learning_rate": 1.8830038048782445e-05, + "loss": 0.4291, + "step": 2283 + }, + { + "epoch": 0.1809467221231927, + "grad_norm": 2.104819798483281, + "learning_rate": 1.8828833350477556e-05, + "loss": 0.3804, + "step": 2284 + }, + { + "epoch": 0.1810259457318281, + "grad_norm": 2.3131001287007913, + "learning_rate": 1.8827628070836477e-05, + "loss": 0.5896, + "step": 2285 + }, + { + "epoch": 0.18110516934046345, + "grad_norm": 1.433658407859947, + "learning_rate": 1.8826422209938563e-05, + "loss": 0.2805, + "step": 2286 + }, + { + "epoch": 0.18118439294909883, + "grad_norm": 2.161279939067316, + "learning_rate": 1.8825215767863215e-05, + "loss": 0.4456, + "step": 2287 + }, + { + "epoch": 0.1812636165577342, + "grad_norm": 2.216487613370055, + "learning_rate": 1.8824008744689873e-05, + "loss": 0.4306, + "step": 2288 + }, + { + "epoch": 0.1813428401663696, + "grad_norm": 2.0995617196942513, + "learning_rate": 1.8822801140498014e-05, + "loss": 0.4203, + "step": 2289 + }, + { + "epoch": 0.18142206377500494, + "grad_norm": 1.8671765245175258, + "learning_rate": 1.8821592955367154e-05, + "loss": 0.5573, + "step": 2290 + }, + { + "epoch": 0.18150128738364033, + "grad_norm": 2.2008061758558997, + "learning_rate": 1.8820384189376845e-05, + "loss": 0.506, + "step": 2291 + }, + { + "epoch": 0.1815805109922757, + "grad_norm": 1.8644571066637654, + "learning_rate": 1.8819174842606675e-05, + "loss": 0.3825, + "step": 2292 + }, + { + "epoch": 0.18165973460091106, + "grad_norm": 1.6594249224595061, + "learning_rate": 1.8817964915136277e-05, + "loss": 0.3296, + "step": 2293 + }, + { + "epoch": 0.18173895820954644, + "grad_norm": 2.129810835139294, + "learning_rate": 1.881675440704532e-05, + "loss": 0.3965, + "step": 2294 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 1.5873513756967357, + "learning_rate": 1.881554331841351e-05, + "loss": 0.3441, + "step": 2295 + }, + { + "epoch": 0.1818974054268172, + "grad_norm": 2.0750169233732354, + "learning_rate": 1.881433164932059e-05, + "loss": 0.4623, + "step": 2296 + }, + { + "epoch": 0.18197662903545256, + "grad_norm": 1.7549889422274445, + "learning_rate": 1.881311939984634e-05, + "loss": 0.3912, + "step": 2297 + }, + { + "epoch": 0.18205585264408794, + "grad_norm": 1.573779645385025, + "learning_rate": 1.8811906570070583e-05, + "loss": 0.3316, + "step": 2298 + }, + { + "epoch": 0.18213507625272332, + "grad_norm": 2.0709601923959, + "learning_rate": 1.8810693160073184e-05, + "loss": 0.4057, + "step": 2299 + }, + { + "epoch": 0.18221429986135868, + "grad_norm": 2.097543494234802, + "learning_rate": 1.880947916993403e-05, + "loss": 0.3673, + "step": 2300 + }, + { + "epoch": 0.18229352346999406, + "grad_norm": 1.5298726500406574, + "learning_rate": 1.8808264599733065e-05, + "loss": 0.2659, + "step": 2301 + }, + { + "epoch": 0.18237274707862944, + "grad_norm": 1.5216584193933604, + "learning_rate": 1.8807049449550254e-05, + "loss": 0.2266, + "step": 2302 + }, + { + "epoch": 0.1824519706872648, + "grad_norm": 1.9837962569782617, + "learning_rate": 1.8805833719465617e-05, + "loss": 0.4794, + "step": 2303 + }, + { + "epoch": 0.18253119429590017, + "grad_norm": 2.0585730297186937, + "learning_rate": 1.88046174095592e-05, + "loss": 0.4592, + "step": 2304 + }, + { + "epoch": 0.18261041790453555, + "grad_norm": 1.7416208978356076, + "learning_rate": 1.880340051991109e-05, + "loss": 0.3332, + "step": 2305 + }, + { + "epoch": 0.18268964151317094, + "grad_norm": 2.265459999298335, + "learning_rate": 1.8802183050601417e-05, + "loss": 0.4646, + "step": 2306 + }, + { + "epoch": 0.1827688651218063, + "grad_norm": 2.203347640226268, + "learning_rate": 1.8800965001710342e-05, + "loss": 0.5617, + "step": 2307 + }, + { + "epoch": 0.18284808873044167, + "grad_norm": 1.7305201048658656, + "learning_rate": 1.879974637331807e-05, + "loss": 0.4589, + "step": 2308 + }, + { + "epoch": 0.18292731233907705, + "grad_norm": 1.9013759269040318, + "learning_rate": 1.879852716550484e-05, + "loss": 0.4891, + "step": 2309 + }, + { + "epoch": 0.1830065359477124, + "grad_norm": 1.5753954653956812, + "learning_rate": 1.8797307378350935e-05, + "loss": 0.4267, + "step": 2310 + }, + { + "epoch": 0.1830857595563478, + "grad_norm": 1.8518004712075684, + "learning_rate": 1.8796087011936665e-05, + "loss": 0.4864, + "step": 2311 + }, + { + "epoch": 0.18316498316498317, + "grad_norm": 1.899141150680386, + "learning_rate": 1.8794866066342394e-05, + "loss": 0.4532, + "step": 2312 + }, + { + "epoch": 0.18324420677361855, + "grad_norm": 2.040495096945813, + "learning_rate": 1.879364454164851e-05, + "loss": 0.3518, + "step": 2313 + }, + { + "epoch": 0.1833234303822539, + "grad_norm": 1.7950597727607651, + "learning_rate": 1.879242243793544e-05, + "loss": 0.4003, + "step": 2314 + }, + { + "epoch": 0.18340265399088929, + "grad_norm": 1.8331303240602623, + "learning_rate": 1.8791199755283664e-05, + "loss": 0.3965, + "step": 2315 + }, + { + "epoch": 0.18348187759952467, + "grad_norm": 1.7683662036436285, + "learning_rate": 1.878997649377368e-05, + "loss": 0.4382, + "step": 2316 + }, + { + "epoch": 0.18356110120816002, + "grad_norm": 1.9893096676356994, + "learning_rate": 1.8788752653486045e-05, + "loss": 0.5534, + "step": 2317 + }, + { + "epoch": 0.1836403248167954, + "grad_norm": 2.5506165340396256, + "learning_rate": 1.878752823450133e-05, + "loss": 0.5076, + "step": 2318 + }, + { + "epoch": 0.18371954842543078, + "grad_norm": 1.6382932677607578, + "learning_rate": 1.878630323690017e-05, + "loss": 0.2987, + "step": 2319 + }, + { + "epoch": 0.18379877203406614, + "grad_norm": 1.7383952777398874, + "learning_rate": 1.8785077660763217e-05, + "loss": 0.3365, + "step": 2320 + }, + { + "epoch": 0.18387799564270152, + "grad_norm": 1.8154240396472983, + "learning_rate": 1.8783851506171166e-05, + "loss": 0.3896, + "step": 2321 + }, + { + "epoch": 0.1839572192513369, + "grad_norm": 1.7578521501095088, + "learning_rate": 1.8782624773204764e-05, + "loss": 0.3561, + "step": 2322 + }, + { + "epoch": 0.18403644285997228, + "grad_norm": 2.0584519642640684, + "learning_rate": 1.8781397461944777e-05, + "loss": 0.3873, + "step": 2323 + }, + { + "epoch": 0.18411566646860764, + "grad_norm": 1.7436913212805967, + "learning_rate": 1.8780169572472024e-05, + "loss": 0.4053, + "step": 2324 + }, + { + "epoch": 0.18419489007724302, + "grad_norm": 1.5158083241245237, + "learning_rate": 1.8778941104867347e-05, + "loss": 0.256, + "step": 2325 + }, + { + "epoch": 0.1842741136858784, + "grad_norm": 1.9183068256449582, + "learning_rate": 1.8777712059211643e-05, + "loss": 0.4625, + "step": 2326 + }, + { + "epoch": 0.18435333729451375, + "grad_norm": 1.7576413609117723, + "learning_rate": 1.8776482435585836e-05, + "loss": 0.4278, + "step": 2327 + }, + { + "epoch": 0.18443256090314913, + "grad_norm": 2.419763721225595, + "learning_rate": 1.877525223407089e-05, + "loss": 0.4879, + "step": 2328 + }, + { + "epoch": 0.18451178451178452, + "grad_norm": 1.6680153888882208, + "learning_rate": 1.877402145474781e-05, + "loss": 0.4523, + "step": 2329 + }, + { + "epoch": 0.1845910081204199, + "grad_norm": 1.9967123683697205, + "learning_rate": 1.877279009769763e-05, + "loss": 0.5596, + "step": 2330 + }, + { + "epoch": 0.18467023172905525, + "grad_norm": 1.927182763885645, + "learning_rate": 1.8771558163001438e-05, + "loss": 0.4686, + "step": 2331 + }, + { + "epoch": 0.18474945533769063, + "grad_norm": 1.747050931247662, + "learning_rate": 1.8770325650740347e-05, + "loss": 0.3459, + "step": 2332 + }, + { + "epoch": 0.184828678946326, + "grad_norm": 1.9548417004629488, + "learning_rate": 1.876909256099551e-05, + "loss": 0.4088, + "step": 2333 + }, + { + "epoch": 0.18490790255496137, + "grad_norm": 2.144296732351858, + "learning_rate": 1.876785889384812e-05, + "loss": 0.4656, + "step": 2334 + }, + { + "epoch": 0.18498712616359675, + "grad_norm": 2.031294617666716, + "learning_rate": 1.8766624649379415e-05, + "loss": 0.4511, + "step": 2335 + }, + { + "epoch": 0.18506634977223213, + "grad_norm": 2.1992577270489178, + "learning_rate": 1.8765389827670657e-05, + "loss": 0.5271, + "step": 2336 + }, + { + "epoch": 0.1851455733808675, + "grad_norm": 1.8328758846218425, + "learning_rate": 1.8764154428803155e-05, + "loss": 0.3273, + "step": 2337 + }, + { + "epoch": 0.18522479698950287, + "grad_norm": 1.7319106122120445, + "learning_rate": 1.8762918452858256e-05, + "loss": 0.3204, + "step": 2338 + }, + { + "epoch": 0.18530402059813825, + "grad_norm": 1.7051350317911467, + "learning_rate": 1.876168189991734e-05, + "loss": 0.3261, + "step": 2339 + }, + { + "epoch": 0.18538324420677363, + "grad_norm": 1.635363196019197, + "learning_rate": 1.876044477006183e-05, + "loss": 0.3294, + "step": 2340 + }, + { + "epoch": 0.18546246781540898, + "grad_norm": 1.8682510709027873, + "learning_rate": 1.8759207063373183e-05, + "loss": 0.4412, + "step": 2341 + }, + { + "epoch": 0.18554169142404436, + "grad_norm": 1.9523791002269242, + "learning_rate": 1.87579687799329e-05, + "loss": 0.3822, + "step": 2342 + }, + { + "epoch": 0.18562091503267975, + "grad_norm": 2.1880988821770866, + "learning_rate": 1.875672991982251e-05, + "loss": 0.4145, + "step": 2343 + }, + { + "epoch": 0.1857001386413151, + "grad_norm": 1.7331520326443695, + "learning_rate": 1.875549048312359e-05, + "loss": 0.3571, + "step": 2344 + }, + { + "epoch": 0.18577936224995048, + "grad_norm": 2.275734147404578, + "learning_rate": 1.8754250469917753e-05, + "loss": 0.6173, + "step": 2345 + }, + { + "epoch": 0.18585858585858586, + "grad_norm": 1.9917886538701886, + "learning_rate": 1.8753009880286647e-05, + "loss": 0.4861, + "step": 2346 + }, + { + "epoch": 0.18593780946722124, + "grad_norm": 2.156289567617249, + "learning_rate": 1.8751768714311952e-05, + "loss": 0.4442, + "step": 2347 + }, + { + "epoch": 0.1860170330758566, + "grad_norm": 1.8264008770166533, + "learning_rate": 1.87505269720754e-05, + "loss": 0.4578, + "step": 2348 + }, + { + "epoch": 0.18609625668449198, + "grad_norm": 1.687707281552878, + "learning_rate": 1.8749284653658754e-05, + "loss": 0.4556, + "step": 2349 + }, + { + "epoch": 0.18617548029312736, + "grad_norm": 1.8904634684342112, + "learning_rate": 1.874804175914381e-05, + "loss": 0.4188, + "step": 2350 + }, + { + "epoch": 0.1862547039017627, + "grad_norm": 1.5538120984870052, + "learning_rate": 1.8746798288612405e-05, + "loss": 0.3356, + "step": 2351 + }, + { + "epoch": 0.1863339275103981, + "grad_norm": 1.8939729130562217, + "learning_rate": 1.8745554242146428e-05, + "loss": 0.4321, + "step": 2352 + }, + { + "epoch": 0.18641315111903348, + "grad_norm": 1.781224552420944, + "learning_rate": 1.874430961982778e-05, + "loss": 0.4184, + "step": 2353 + }, + { + "epoch": 0.18649237472766886, + "grad_norm": 1.670711878932282, + "learning_rate": 1.874306442173842e-05, + "loss": 0.391, + "step": 2354 + }, + { + "epoch": 0.1865715983363042, + "grad_norm": 1.6724785368717014, + "learning_rate": 1.8741818647960337e-05, + "loss": 0.3589, + "step": 2355 + }, + { + "epoch": 0.1866508219449396, + "grad_norm": 1.7555861838819646, + "learning_rate": 1.8740572298575558e-05, + "loss": 0.3607, + "step": 2356 + }, + { + "epoch": 0.18673004555357497, + "grad_norm": 1.9545398762094552, + "learning_rate": 1.8739325373666152e-05, + "loss": 0.4158, + "step": 2357 + }, + { + "epoch": 0.18680926916221033, + "grad_norm": 1.7468752504306375, + "learning_rate": 1.8738077873314218e-05, + "loss": 0.3636, + "step": 2358 + }, + { + "epoch": 0.1868884927708457, + "grad_norm": 2.06051487039993, + "learning_rate": 1.8736829797601903e-05, + "loss": 0.5174, + "step": 2359 + }, + { + "epoch": 0.1869677163794811, + "grad_norm": 2.0622390544462497, + "learning_rate": 1.8735581146611387e-05, + "loss": 0.4825, + "step": 2360 + }, + { + "epoch": 0.18704693998811645, + "grad_norm": 1.8789107491141992, + "learning_rate": 1.873433192042488e-05, + "loss": 0.3824, + "step": 2361 + }, + { + "epoch": 0.18712616359675183, + "grad_norm": 1.6979003987512717, + "learning_rate": 1.8733082119124646e-05, + "loss": 0.3843, + "step": 2362 + }, + { + "epoch": 0.1872053872053872, + "grad_norm": 2.035126859611104, + "learning_rate": 1.8731831742792974e-05, + "loss": 0.5086, + "step": 2363 + }, + { + "epoch": 0.1872846108140226, + "grad_norm": 1.9832439389715169, + "learning_rate": 1.87305807915122e-05, + "loss": 0.4475, + "step": 2364 + }, + { + "epoch": 0.18736383442265794, + "grad_norm": 1.8418365032068778, + "learning_rate": 1.8729329265364685e-05, + "loss": 0.413, + "step": 2365 + }, + { + "epoch": 0.18744305803129332, + "grad_norm": 1.8788030236637507, + "learning_rate": 1.8728077164432844e-05, + "loss": 0.4368, + "step": 2366 + }, + { + "epoch": 0.1875222816399287, + "grad_norm": 1.7870952935908009, + "learning_rate": 1.872682448879912e-05, + "loss": 0.325, + "step": 2367 + }, + { + "epoch": 0.18760150524856406, + "grad_norm": 1.7754411529992522, + "learning_rate": 1.8725571238545992e-05, + "loss": 0.3682, + "step": 2368 + }, + { + "epoch": 0.18768072885719944, + "grad_norm": 1.7655766330990534, + "learning_rate": 1.872431741375598e-05, + "loss": 0.4078, + "step": 2369 + }, + { + "epoch": 0.18775995246583482, + "grad_norm": 1.6194993772848008, + "learning_rate": 1.872306301451165e-05, + "loss": 0.268, + "step": 2370 + }, + { + "epoch": 0.1878391760744702, + "grad_norm": 1.7114325339534924, + "learning_rate": 1.872180804089559e-05, + "loss": 0.3151, + "step": 2371 + }, + { + "epoch": 0.18791839968310556, + "grad_norm": 1.8473532529845424, + "learning_rate": 1.8720552492990438e-05, + "loss": 0.3935, + "step": 2372 + }, + { + "epoch": 0.18799762329174094, + "grad_norm": 1.8641753718218268, + "learning_rate": 1.8719296370878866e-05, + "loss": 0.3951, + "step": 2373 + }, + { + "epoch": 0.18807684690037632, + "grad_norm": 1.8870948327931774, + "learning_rate": 1.871803967464358e-05, + "loss": 0.3629, + "step": 2374 + }, + { + "epoch": 0.18815607050901167, + "grad_norm": 2.196962506361073, + "learning_rate": 1.8716782404367333e-05, + "loss": 0.3604, + "step": 2375 + }, + { + "epoch": 0.18823529411764706, + "grad_norm": 1.7294808074470036, + "learning_rate": 1.8715524560132906e-05, + "loss": 0.3808, + "step": 2376 + }, + { + "epoch": 0.18831451772628244, + "grad_norm": 2.0256550860295954, + "learning_rate": 1.8714266142023124e-05, + "loss": 0.4235, + "step": 2377 + }, + { + "epoch": 0.18839374133491782, + "grad_norm": 1.7510176391458774, + "learning_rate": 1.8713007150120846e-05, + "loss": 0.375, + "step": 2378 + }, + { + "epoch": 0.18847296494355317, + "grad_norm": 1.892410094595781, + "learning_rate": 1.871174758450897e-05, + "loss": 0.4023, + "step": 2379 + }, + { + "epoch": 0.18855218855218855, + "grad_norm": 1.9373251719769788, + "learning_rate": 1.8710487445270436e-05, + "loss": 0.4978, + "step": 2380 + }, + { + "epoch": 0.18863141216082394, + "grad_norm": 2.0180704993445766, + "learning_rate": 1.8709226732488216e-05, + "loss": 0.3978, + "step": 2381 + }, + { + "epoch": 0.1887106357694593, + "grad_norm": 1.7390286886625042, + "learning_rate": 1.8707965446245317e-05, + "loss": 0.3696, + "step": 2382 + }, + { + "epoch": 0.18878985937809467, + "grad_norm": 1.8298931479275429, + "learning_rate": 1.87067035866248e-05, + "loss": 0.3391, + "step": 2383 + }, + { + "epoch": 0.18886908298673005, + "grad_norm": 1.4915163930334776, + "learning_rate": 1.8705441153709742e-05, + "loss": 0.3487, + "step": 2384 + }, + { + "epoch": 0.1889483065953654, + "grad_norm": 1.7732297991573869, + "learning_rate": 1.8704178147583273e-05, + "loss": 0.4103, + "step": 2385 + }, + { + "epoch": 0.1890275302040008, + "grad_norm": 1.5095699225614836, + "learning_rate": 1.8702914568328555e-05, + "loss": 0.375, + "step": 2386 + }, + { + "epoch": 0.18910675381263617, + "grad_norm": 1.4673082236948103, + "learning_rate": 1.8701650416028788e-05, + "loss": 0.2898, + "step": 2387 + }, + { + "epoch": 0.18918597742127155, + "grad_norm": 1.7899505532539894, + "learning_rate": 1.870038569076721e-05, + "loss": 0.3584, + "step": 2388 + }, + { + "epoch": 0.1892652010299069, + "grad_norm": 1.7333453460030324, + "learning_rate": 1.86991203926271e-05, + "loss": 0.3415, + "step": 2389 + }, + { + "epoch": 0.18934442463854229, + "grad_norm": 2.0920851841960606, + "learning_rate": 1.8697854521691767e-05, + "loss": 0.4613, + "step": 2390 + }, + { + "epoch": 0.18942364824717767, + "grad_norm": 2.2864957706129863, + "learning_rate": 1.8696588078044566e-05, + "loss": 0.4501, + "step": 2391 + }, + { + "epoch": 0.18950287185581302, + "grad_norm": 2.063547774954329, + "learning_rate": 1.8695321061768886e-05, + "loss": 0.4536, + "step": 2392 + }, + { + "epoch": 0.1895820954644484, + "grad_norm": 1.6002743802277264, + "learning_rate": 1.8694053472948154e-05, + "loss": 0.3797, + "step": 2393 + }, + { + "epoch": 0.18966131907308378, + "grad_norm": 1.888273259717488, + "learning_rate": 1.8692785311665835e-05, + "loss": 0.3678, + "step": 2394 + }, + { + "epoch": 0.18974054268171917, + "grad_norm": 1.635737304559724, + "learning_rate": 1.8691516578005426e-05, + "loss": 0.3704, + "step": 2395 + }, + { + "epoch": 0.18981976629035452, + "grad_norm": 1.4483407927362657, + "learning_rate": 1.8690247272050474e-05, + "loss": 0.2832, + "step": 2396 + }, + { + "epoch": 0.1898989898989899, + "grad_norm": 1.5334735991646946, + "learning_rate": 1.8688977393884555e-05, + "loss": 0.3018, + "step": 2397 + }, + { + "epoch": 0.18997821350762528, + "grad_norm": 1.9058660957587708, + "learning_rate": 1.868770694359128e-05, + "loss": 0.3983, + "step": 2398 + }, + { + "epoch": 0.19005743711626064, + "grad_norm": 2.0388866409711555, + "learning_rate": 1.868643592125431e-05, + "loss": 0.4391, + "step": 2399 + }, + { + "epoch": 0.19013666072489602, + "grad_norm": 2.287289222407347, + "learning_rate": 1.8685164326957327e-05, + "loss": 0.5266, + "step": 2400 + }, + { + "epoch": 0.1902158843335314, + "grad_norm": 1.932741715497152, + "learning_rate": 1.8683892160784066e-05, + "loss": 0.4238, + "step": 2401 + }, + { + "epoch": 0.19029510794216675, + "grad_norm": 1.8191490322617931, + "learning_rate": 1.868261942281829e-05, + "loss": 0.4583, + "step": 2402 + }, + { + "epoch": 0.19037433155080213, + "grad_norm": 2.303602836911234, + "learning_rate": 1.86813461131438e-05, + "loss": 0.4993, + "step": 2403 + }, + { + "epoch": 0.19045355515943752, + "grad_norm": 1.7777739596052975, + "learning_rate": 1.8680072231844445e-05, + "loss": 0.3685, + "step": 2404 + }, + { + "epoch": 0.1905327787680729, + "grad_norm": 1.4140370681319647, + "learning_rate": 1.8678797779004096e-05, + "loss": 0.3052, + "step": 2405 + }, + { + "epoch": 0.19061200237670825, + "grad_norm": 1.8499230873998467, + "learning_rate": 1.8677522754706677e-05, + "loss": 0.4748, + "step": 2406 + }, + { + "epoch": 0.19069122598534363, + "grad_norm": 1.6907549028453686, + "learning_rate": 1.8676247159036132e-05, + "loss": 0.413, + "step": 2407 + }, + { + "epoch": 0.190770449593979, + "grad_norm": 1.957102796744527, + "learning_rate": 1.8674970992076465e-05, + "loss": 0.3538, + "step": 2408 + }, + { + "epoch": 0.19084967320261437, + "grad_norm": 2.1751572747829657, + "learning_rate": 1.8673694253911696e-05, + "loss": 0.5967, + "step": 2409 + }, + { + "epoch": 0.19092889681124975, + "grad_norm": 1.8679825237373502, + "learning_rate": 1.8672416944625896e-05, + "loss": 0.4704, + "step": 2410 + }, + { + "epoch": 0.19100812041988513, + "grad_norm": 1.8916616445963068, + "learning_rate": 1.867113906430317e-05, + "loss": 0.4638, + "step": 2411 + }, + { + "epoch": 0.1910873440285205, + "grad_norm": 1.5037310405266053, + "learning_rate": 1.8669860613027657e-05, + "loss": 0.343, + "step": 2412 + }, + { + "epoch": 0.19116656763715587, + "grad_norm": 2.342439312107435, + "learning_rate": 1.8668581590883544e-05, + "loss": 0.5825, + "step": 2413 + }, + { + "epoch": 0.19124579124579125, + "grad_norm": 2.116267144376701, + "learning_rate": 1.8667301997955038e-05, + "loss": 0.4826, + "step": 2414 + }, + { + "epoch": 0.19132501485442663, + "grad_norm": 1.789201147244534, + "learning_rate": 1.8666021834326404e-05, + "loss": 0.4607, + "step": 2415 + }, + { + "epoch": 0.19140423846306198, + "grad_norm": 2.1631592228638614, + "learning_rate": 1.866474110008193e-05, + "loss": 0.4965, + "step": 2416 + }, + { + "epoch": 0.19148346207169736, + "grad_norm": 1.7570021410851158, + "learning_rate": 1.8663459795305946e-05, + "loss": 0.4359, + "step": 2417 + }, + { + "epoch": 0.19156268568033274, + "grad_norm": 1.7415731620720836, + "learning_rate": 1.866217792008282e-05, + "loss": 0.4227, + "step": 2418 + }, + { + "epoch": 0.1916419092889681, + "grad_norm": 1.6731262756305025, + "learning_rate": 1.866089547449696e-05, + "loss": 0.3284, + "step": 2419 + }, + { + "epoch": 0.19172113289760348, + "grad_norm": 1.9295648640892022, + "learning_rate": 1.8659612458632802e-05, + "loss": 0.4063, + "step": 2420 + }, + { + "epoch": 0.19180035650623886, + "grad_norm": 1.6838959609857238, + "learning_rate": 1.8658328872574833e-05, + "loss": 0.3641, + "step": 2421 + }, + { + "epoch": 0.19187958011487424, + "grad_norm": 1.8323934603776113, + "learning_rate": 1.8657044716407573e-05, + "loss": 0.4275, + "step": 2422 + }, + { + "epoch": 0.1919588037235096, + "grad_norm": 1.8700447133185338, + "learning_rate": 1.865575999021557e-05, + "loss": 0.534, + "step": 2423 + }, + { + "epoch": 0.19203802733214498, + "grad_norm": 2.230692013912159, + "learning_rate": 1.8654474694083416e-05, + "loss": 0.4678, + "step": 2424 + }, + { + "epoch": 0.19211725094078036, + "grad_norm": 1.8800928379643222, + "learning_rate": 1.8653188828095754e-05, + "loss": 0.3926, + "step": 2425 + }, + { + "epoch": 0.1921964745494157, + "grad_norm": 2.064351405857103, + "learning_rate": 1.865190239233724e-05, + "loss": 0.4101, + "step": 2426 + }, + { + "epoch": 0.1922756981580511, + "grad_norm": 1.8977278493163616, + "learning_rate": 1.8650615386892587e-05, + "loss": 0.3694, + "step": 2427 + }, + { + "epoch": 0.19235492176668648, + "grad_norm": 1.909276467843947, + "learning_rate": 1.8649327811846533e-05, + "loss": 0.5535, + "step": 2428 + }, + { + "epoch": 0.19243414537532186, + "grad_norm": 1.5399040700323343, + "learning_rate": 1.8648039667283857e-05, + "loss": 0.4441, + "step": 2429 + }, + { + "epoch": 0.1925133689839572, + "grad_norm": 1.6651636991409458, + "learning_rate": 1.8646750953289384e-05, + "loss": 0.3652, + "step": 2430 + }, + { + "epoch": 0.1925925925925926, + "grad_norm": 1.6815956236704828, + "learning_rate": 1.8645461669947966e-05, + "loss": 0.3155, + "step": 2431 + }, + { + "epoch": 0.19267181620122797, + "grad_norm": 1.6718262583049504, + "learning_rate": 1.8644171817344497e-05, + "loss": 0.3481, + "step": 2432 + }, + { + "epoch": 0.19275103980986333, + "grad_norm": 2.7799199239515766, + "learning_rate": 1.8642881395563904e-05, + "loss": 0.3483, + "step": 2433 + }, + { + "epoch": 0.1928302634184987, + "grad_norm": 1.9756974054244152, + "learning_rate": 1.864159040469116e-05, + "loss": 0.4401, + "step": 2434 + }, + { + "epoch": 0.1929094870271341, + "grad_norm": 1.8071163200499114, + "learning_rate": 1.864029884481127e-05, + "loss": 0.4145, + "step": 2435 + }, + { + "epoch": 0.19298871063576947, + "grad_norm": 2.543240114206302, + "learning_rate": 1.8639006716009275e-05, + "loss": 0.5372, + "step": 2436 + }, + { + "epoch": 0.19306793424440483, + "grad_norm": 2.228588993655016, + "learning_rate": 1.8637714018370255e-05, + "loss": 0.4201, + "step": 2437 + }, + { + "epoch": 0.1931471578530402, + "grad_norm": 1.8494149784248002, + "learning_rate": 1.8636420751979328e-05, + "loss": 0.4531, + "step": 2438 + }, + { + "epoch": 0.1932263814616756, + "grad_norm": 2.0253791399897874, + "learning_rate": 1.863512691692165e-05, + "loss": 0.456, + "step": 2439 + }, + { + "epoch": 0.19330560507031094, + "grad_norm": 1.6588983512442654, + "learning_rate": 1.863383251328242e-05, + "loss": 0.424, + "step": 2440 + }, + { + "epoch": 0.19338482867894632, + "grad_norm": 1.8389834487870507, + "learning_rate": 1.8632537541146856e-05, + "loss": 0.3149, + "step": 2441 + }, + { + "epoch": 0.1934640522875817, + "grad_norm": 1.9858381854276652, + "learning_rate": 1.8631242000600235e-05, + "loss": 0.5007, + "step": 2442 + }, + { + "epoch": 0.19354327589621706, + "grad_norm": 2.266124060357365, + "learning_rate": 1.8629945891727856e-05, + "loss": 0.4956, + "step": 2443 + }, + { + "epoch": 0.19362249950485244, + "grad_norm": 1.593089901704416, + "learning_rate": 1.8628649214615066e-05, + "loss": 0.3618, + "step": 2444 + }, + { + "epoch": 0.19370172311348782, + "grad_norm": 1.679824462137367, + "learning_rate": 1.8627351969347246e-05, + "loss": 0.3759, + "step": 2445 + }, + { + "epoch": 0.1937809467221232, + "grad_norm": 1.8874286946173535, + "learning_rate": 1.8626054156009807e-05, + "loss": 0.4039, + "step": 2446 + }, + { + "epoch": 0.19386017033075856, + "grad_norm": 1.7848707595708873, + "learning_rate": 1.862475577468821e-05, + "loss": 0.3609, + "step": 2447 + }, + { + "epoch": 0.19393939393939394, + "grad_norm": 1.8519651811855438, + "learning_rate": 1.8623456825467948e-05, + "loss": 0.4146, + "step": 2448 + }, + { + "epoch": 0.19401861754802932, + "grad_norm": 1.7979330256148234, + "learning_rate": 1.8622157308434544e-05, + "loss": 0.4959, + "step": 2449 + }, + { + "epoch": 0.19409784115666467, + "grad_norm": 1.966534088434029, + "learning_rate": 1.8620857223673567e-05, + "loss": 0.4544, + "step": 2450 + }, + { + "epoch": 0.19417706476530006, + "grad_norm": 1.9639912803367112, + "learning_rate": 1.8619556571270624e-05, + "loss": 0.357, + "step": 2451 + }, + { + "epoch": 0.19425628837393544, + "grad_norm": 2.1894482909531607, + "learning_rate": 1.8618255351311355e-05, + "loss": 0.3974, + "step": 2452 + }, + { + "epoch": 0.19433551198257082, + "grad_norm": 1.7694374669375357, + "learning_rate": 1.8616953563881444e-05, + "loss": 0.4088, + "step": 2453 + }, + { + "epoch": 0.19441473559120617, + "grad_norm": 1.7142932207290247, + "learning_rate": 1.8615651209066598e-05, + "loss": 0.3559, + "step": 2454 + }, + { + "epoch": 0.19449395919984155, + "grad_norm": 1.9300198351645033, + "learning_rate": 1.8614348286952577e-05, + "loss": 0.3958, + "step": 2455 + }, + { + "epoch": 0.19457318280847694, + "grad_norm": 2.214933640628995, + "learning_rate": 1.8613044797625173e-05, + "loss": 0.4106, + "step": 2456 + }, + { + "epoch": 0.1946524064171123, + "grad_norm": 1.6909762560643709, + "learning_rate": 1.861174074117021e-05, + "loss": 0.367, + "step": 2457 + }, + { + "epoch": 0.19473163002574767, + "grad_norm": 1.9584800927859016, + "learning_rate": 1.8610436117673557e-05, + "loss": 0.3804, + "step": 2458 + }, + { + "epoch": 0.19481085363438305, + "grad_norm": 2.2511836739384603, + "learning_rate": 1.8609130927221116e-05, + "loss": 0.4379, + "step": 2459 + }, + { + "epoch": 0.1948900772430184, + "grad_norm": 1.599297769846532, + "learning_rate": 1.8607825169898827e-05, + "loss": 0.4615, + "step": 2460 + }, + { + "epoch": 0.1949693008516538, + "grad_norm": 1.901142235594116, + "learning_rate": 1.8606518845792672e-05, + "loss": 0.6061, + "step": 2461 + }, + { + "epoch": 0.19504852446028917, + "grad_norm": 2.0458999263956397, + "learning_rate": 1.860521195498866e-05, + "loss": 0.4046, + "step": 2462 + }, + { + "epoch": 0.19512774806892455, + "grad_norm": 1.759581313745163, + "learning_rate": 1.8603904497572846e-05, + "loss": 0.3766, + "step": 2463 + }, + { + "epoch": 0.1952069716775599, + "grad_norm": 1.9240944001896176, + "learning_rate": 1.8602596473631323e-05, + "loss": 0.4983, + "step": 2464 + }, + { + "epoch": 0.19528619528619529, + "grad_norm": 1.522758996737219, + "learning_rate": 1.8601287883250215e-05, + "loss": 0.3731, + "step": 2465 + }, + { + "epoch": 0.19536541889483067, + "grad_norm": 1.8265426742019597, + "learning_rate": 1.8599978726515685e-05, + "loss": 0.3485, + "step": 2466 + }, + { + "epoch": 0.19544464250346602, + "grad_norm": 2.4480816777113112, + "learning_rate": 1.8598669003513934e-05, + "loss": 0.5251, + "step": 2467 + }, + { + "epoch": 0.1955238661121014, + "grad_norm": 1.8224430982709632, + "learning_rate": 1.8597358714331207e-05, + "loss": 0.3705, + "step": 2468 + }, + { + "epoch": 0.19560308972073678, + "grad_norm": 1.7001483763139453, + "learning_rate": 1.8596047859053776e-05, + "loss": 0.4021, + "step": 2469 + }, + { + "epoch": 0.19568231332937217, + "grad_norm": 1.7204409613521898, + "learning_rate": 1.8594736437767954e-05, + "loss": 0.4864, + "step": 2470 + }, + { + "epoch": 0.19576153693800752, + "grad_norm": 2.091932585980975, + "learning_rate": 1.8593424450560094e-05, + "loss": 0.4887, + "step": 2471 + }, + { + "epoch": 0.1958407605466429, + "grad_norm": 1.7633716664478725, + "learning_rate": 1.8592111897516583e-05, + "loss": 0.4496, + "step": 2472 + }, + { + "epoch": 0.19591998415527828, + "grad_norm": 1.6653392322170877, + "learning_rate": 1.8590798778723843e-05, + "loss": 0.4546, + "step": 2473 + }, + { + "epoch": 0.19599920776391364, + "grad_norm": 1.7320120374400938, + "learning_rate": 1.8589485094268344e-05, + "loss": 0.373, + "step": 2474 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 1.6322324489444844, + "learning_rate": 1.858817084423658e-05, + "loss": 0.3944, + "step": 2475 + }, + { + "epoch": 0.1961576549811844, + "grad_norm": 1.795466811829931, + "learning_rate": 1.8586856028715087e-05, + "loss": 0.469, + "step": 2476 + }, + { + "epoch": 0.19623687858981978, + "grad_norm": 2.3937840988692707, + "learning_rate": 1.8585540647790445e-05, + "loss": 0.4013, + "step": 2477 + }, + { + "epoch": 0.19631610219845513, + "grad_norm": 1.6164375785474248, + "learning_rate": 1.858422470154926e-05, + "loss": 0.4214, + "step": 2478 + }, + { + "epoch": 0.19639532580709052, + "grad_norm": 1.9199350130245678, + "learning_rate": 1.8582908190078184e-05, + "loss": 0.5453, + "step": 2479 + }, + { + "epoch": 0.1964745494157259, + "grad_norm": 1.7824278600148742, + "learning_rate": 1.8581591113463903e-05, + "loss": 0.5164, + "step": 2480 + }, + { + "epoch": 0.19655377302436125, + "grad_norm": 1.7286654271742554, + "learning_rate": 1.858027347179314e-05, + "loss": 0.3957, + "step": 2481 + }, + { + "epoch": 0.19663299663299663, + "grad_norm": 2.0083320030792726, + "learning_rate": 1.8578955265152652e-05, + "loss": 0.374, + "step": 2482 + }, + { + "epoch": 0.196712220241632, + "grad_norm": 1.9580549598132762, + "learning_rate": 1.857763649362924e-05, + "loss": 0.5356, + "step": 2483 + }, + { + "epoch": 0.19679144385026737, + "grad_norm": 1.7602610015289, + "learning_rate": 1.857631715730974e-05, + "loss": 0.4486, + "step": 2484 + }, + { + "epoch": 0.19687066745890275, + "grad_norm": 1.6370968567701094, + "learning_rate": 1.857499725628102e-05, + "loss": 0.3566, + "step": 2485 + }, + { + "epoch": 0.19694989106753813, + "grad_norm": 1.922573016685483, + "learning_rate": 1.8573676790629988e-05, + "loss": 0.3174, + "step": 2486 + }, + { + "epoch": 0.1970291146761735, + "grad_norm": 1.4623496852922742, + "learning_rate": 1.8572355760443597e-05, + "loss": 0.2664, + "step": 2487 + }, + { + "epoch": 0.19710833828480886, + "grad_norm": 1.6989502090112747, + "learning_rate": 1.8571034165808826e-05, + "loss": 0.3267, + "step": 2488 + }, + { + "epoch": 0.19718756189344425, + "grad_norm": 1.6543509790456885, + "learning_rate": 1.85697120068127e-05, + "loss": 0.2347, + "step": 2489 + }, + { + "epoch": 0.19726678550207963, + "grad_norm": 2.162622394196891, + "learning_rate": 1.8568389283542263e-05, + "loss": 0.4326, + "step": 2490 + }, + { + "epoch": 0.19734600911071498, + "grad_norm": 3.2740540955986495, + "learning_rate": 1.8567065996084628e-05, + "loss": 0.4848, + "step": 2491 + }, + { + "epoch": 0.19742523271935036, + "grad_norm": 1.8739873451937548, + "learning_rate": 1.8565742144526917e-05, + "loss": 0.4019, + "step": 2492 + }, + { + "epoch": 0.19750445632798574, + "grad_norm": 1.7344103057979974, + "learning_rate": 1.85644177289563e-05, + "loss": 0.4771, + "step": 2493 + }, + { + "epoch": 0.19758367993662113, + "grad_norm": 2.0293785876315154, + "learning_rate": 1.856309274945999e-05, + "loss": 0.3886, + "step": 2494 + }, + { + "epoch": 0.19766290354525648, + "grad_norm": 1.876725659866697, + "learning_rate": 1.8561767206125223e-05, + "loss": 0.3855, + "step": 2495 + }, + { + "epoch": 0.19774212715389186, + "grad_norm": 2.630788697747225, + "learning_rate": 1.856044109903928e-05, + "loss": 0.4633, + "step": 2496 + }, + { + "epoch": 0.19782135076252724, + "grad_norm": 2.225504228548519, + "learning_rate": 1.8559114428289482e-05, + "loss": 0.5706, + "step": 2497 + }, + { + "epoch": 0.1979005743711626, + "grad_norm": 1.8209421607723064, + "learning_rate": 1.8557787193963184e-05, + "loss": 0.4335, + "step": 2498 + }, + { + "epoch": 0.19797979797979798, + "grad_norm": 1.962437757913541, + "learning_rate": 1.8556459396147777e-05, + "loss": 0.4608, + "step": 2499 + }, + { + "epoch": 0.19805902158843336, + "grad_norm": 2.0788544979334023, + "learning_rate": 1.8555131034930686e-05, + "loss": 0.4806, + "step": 2500 + }, + { + "epoch": 0.1981382451970687, + "grad_norm": 1.7789781442235573, + "learning_rate": 1.8553802110399385e-05, + "loss": 0.3791, + "step": 2501 + }, + { + "epoch": 0.1982174688057041, + "grad_norm": 1.8436094826417708, + "learning_rate": 1.8552472622641372e-05, + "loss": 0.3799, + "step": 2502 + }, + { + "epoch": 0.19829669241433948, + "grad_norm": 2.1843434642231268, + "learning_rate": 1.8551142571744188e-05, + "loss": 0.5619, + "step": 2503 + }, + { + "epoch": 0.19837591602297486, + "grad_norm": 1.8538465015068417, + "learning_rate": 1.854981195779541e-05, + "loss": 0.325, + "step": 2504 + }, + { + "epoch": 0.1984551396316102, + "grad_norm": 1.8608829464420031, + "learning_rate": 1.8548480780882658e-05, + "loss": 0.4484, + "step": 2505 + }, + { + "epoch": 0.1985343632402456, + "grad_norm": 2.0326954963360837, + "learning_rate": 1.8547149041093574e-05, + "loss": 0.4729, + "step": 2506 + }, + { + "epoch": 0.19861358684888097, + "grad_norm": 1.5760119244850919, + "learning_rate": 1.8545816738515855e-05, + "loss": 0.4157, + "step": 2507 + }, + { + "epoch": 0.19869281045751633, + "grad_norm": 1.497014462165689, + "learning_rate": 1.854448387323722e-05, + "loss": 0.371, + "step": 2508 + }, + { + "epoch": 0.1987720340661517, + "grad_norm": 1.7390940156536723, + "learning_rate": 1.8543150445345443e-05, + "loss": 0.3971, + "step": 2509 + }, + { + "epoch": 0.1988512576747871, + "grad_norm": 1.5371060435271777, + "learning_rate": 1.854181645492831e-05, + "loss": 0.3336, + "step": 2510 + }, + { + "epoch": 0.19893048128342247, + "grad_norm": 1.9229194455839491, + "learning_rate": 1.8540481902073664e-05, + "loss": 0.4252, + "step": 2511 + }, + { + "epoch": 0.19900970489205783, + "grad_norm": 2.2818319290416724, + "learning_rate": 1.8539146786869385e-05, + "loss": 0.4121, + "step": 2512 + }, + { + "epoch": 0.1990889285006932, + "grad_norm": 1.7807339419216095, + "learning_rate": 1.8537811109403372e-05, + "loss": 0.3523, + "step": 2513 + }, + { + "epoch": 0.1991681521093286, + "grad_norm": 2.208398926240744, + "learning_rate": 1.853647486976358e-05, + "loss": 0.547, + "step": 2514 + }, + { + "epoch": 0.19924737571796394, + "grad_norm": 1.5602544904924176, + "learning_rate": 1.8535138068037995e-05, + "loss": 0.3616, + "step": 2515 + }, + { + "epoch": 0.19932659932659932, + "grad_norm": 2.2095884820039893, + "learning_rate": 1.8533800704314633e-05, + "loss": 0.5332, + "step": 2516 + }, + { + "epoch": 0.1994058229352347, + "grad_norm": 1.7618684601616275, + "learning_rate": 1.8532462778681558e-05, + "loss": 0.3672, + "step": 2517 + }, + { + "epoch": 0.1994850465438701, + "grad_norm": 1.8240924869890305, + "learning_rate": 1.8531124291226866e-05, + "loss": 0.3825, + "step": 2518 + }, + { + "epoch": 0.19956427015250544, + "grad_norm": 1.7317104373236238, + "learning_rate": 1.8529785242038688e-05, + "loss": 0.3898, + "step": 2519 + }, + { + "epoch": 0.19964349376114082, + "grad_norm": 1.6280770333620187, + "learning_rate": 1.8528445631205195e-05, + "loss": 0.3855, + "step": 2520 + }, + { + "epoch": 0.1997227173697762, + "grad_norm": 1.7566863403512298, + "learning_rate": 1.852710545881459e-05, + "loss": 0.4474, + "step": 2521 + }, + { + "epoch": 0.19980194097841156, + "grad_norm": 1.878815093583782, + "learning_rate": 1.8525764724955123e-05, + "loss": 0.4035, + "step": 2522 + }, + { + "epoch": 0.19988116458704694, + "grad_norm": 1.8298324286185195, + "learning_rate": 1.8524423429715072e-05, + "loss": 0.3532, + "step": 2523 + }, + { + "epoch": 0.19996038819568232, + "grad_norm": 1.677771114182212, + "learning_rate": 1.8523081573182754e-05, + "loss": 0.3794, + "step": 2524 + }, + { + "epoch": 0.20003961180431767, + "grad_norm": 1.8449111661378463, + "learning_rate": 1.8521739155446527e-05, + "loss": 0.4851, + "step": 2525 + }, + { + "epoch": 0.20011883541295306, + "grad_norm": 1.686290781787805, + "learning_rate": 1.852039617659478e-05, + "loss": 0.3886, + "step": 2526 + }, + { + "epoch": 0.20019805902158844, + "grad_norm": 1.769262906449984, + "learning_rate": 1.851905263671594e-05, + "loss": 0.4177, + "step": 2527 + }, + { + "epoch": 0.20027728263022382, + "grad_norm": 2.213461401505922, + "learning_rate": 1.8517708535898477e-05, + "loss": 0.3671, + "step": 2528 + }, + { + "epoch": 0.20035650623885917, + "grad_norm": 1.8336084314787529, + "learning_rate": 1.851636387423089e-05, + "loss": 0.3184, + "step": 2529 + }, + { + "epoch": 0.20043572984749455, + "grad_norm": 1.767729823809732, + "learning_rate": 1.8515018651801723e-05, + "loss": 0.3668, + "step": 2530 + }, + { + "epoch": 0.20051495345612994, + "grad_norm": 1.72123658107767, + "learning_rate": 1.8513672868699547e-05, + "loss": 0.409, + "step": 2531 + }, + { + "epoch": 0.2005941770647653, + "grad_norm": 1.9114359791423503, + "learning_rate": 1.851232652501298e-05, + "loss": 0.3531, + "step": 2532 + }, + { + "epoch": 0.20067340067340067, + "grad_norm": 2.0924360141875415, + "learning_rate": 1.851097962083067e-05, + "loss": 0.4826, + "step": 2533 + }, + { + "epoch": 0.20075262428203605, + "grad_norm": 1.552165791094521, + "learning_rate": 1.85096321562413e-05, + "loss": 0.305, + "step": 2534 + }, + { + "epoch": 0.20083184789067143, + "grad_norm": 1.770835993604429, + "learning_rate": 1.8508284131333604e-05, + "loss": 0.3868, + "step": 2535 + }, + { + "epoch": 0.2009110714993068, + "grad_norm": 1.9056575372469093, + "learning_rate": 1.850693554619633e-05, + "loss": 0.4677, + "step": 2536 + }, + { + "epoch": 0.20099029510794217, + "grad_norm": 1.8964029082208331, + "learning_rate": 1.8505586400918288e-05, + "loss": 0.368, + "step": 2537 + }, + { + "epoch": 0.20106951871657755, + "grad_norm": 2.0384928936642686, + "learning_rate": 1.8504236695588308e-05, + "loss": 0.3827, + "step": 2538 + }, + { + "epoch": 0.2011487423252129, + "grad_norm": 1.7720548147958977, + "learning_rate": 1.8502886430295262e-05, + "loss": 0.4841, + "step": 2539 + }, + { + "epoch": 0.20122796593384829, + "grad_norm": 1.992105236078617, + "learning_rate": 1.8501535605128054e-05, + "loss": 0.3567, + "step": 2540 + }, + { + "epoch": 0.20130718954248367, + "grad_norm": 1.6796399304185092, + "learning_rate": 1.8500184220175636e-05, + "loss": 0.4534, + "step": 2541 + }, + { + "epoch": 0.20138641315111902, + "grad_norm": 1.7008230592972515, + "learning_rate": 1.8498832275526988e-05, + "loss": 0.4597, + "step": 2542 + }, + { + "epoch": 0.2014656367597544, + "grad_norm": 1.5855692661266696, + "learning_rate": 1.8497479771271125e-05, + "loss": 0.3148, + "step": 2543 + }, + { + "epoch": 0.20154486036838978, + "grad_norm": 1.7779826536689631, + "learning_rate": 1.8496126707497112e-05, + "loss": 0.407, + "step": 2544 + }, + { + "epoch": 0.20162408397702516, + "grad_norm": 1.8264384362815282, + "learning_rate": 1.849477308429403e-05, + "loss": 0.3947, + "step": 2545 + }, + { + "epoch": 0.20170330758566052, + "grad_norm": 2.158459029956277, + "learning_rate": 1.8493418901751016e-05, + "loss": 0.4558, + "step": 2546 + }, + { + "epoch": 0.2017825311942959, + "grad_norm": 1.786016789552016, + "learning_rate": 1.849206415995724e-05, + "loss": 0.4428, + "step": 2547 + }, + { + "epoch": 0.20186175480293128, + "grad_norm": 1.700575434704635, + "learning_rate": 1.8490708859001896e-05, + "loss": 0.4093, + "step": 2548 + }, + { + "epoch": 0.20194097841156664, + "grad_norm": 2.018488079044388, + "learning_rate": 1.8489352998974227e-05, + "loss": 0.4784, + "step": 2549 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.8071537132065065, + "learning_rate": 1.8487996579963515e-05, + "loss": 0.3771, + "step": 2550 + }, + { + "epoch": 0.2020994256288374, + "grad_norm": 2.0026948751367173, + "learning_rate": 1.8486639602059066e-05, + "loss": 0.4292, + "step": 2551 + }, + { + "epoch": 0.20217864923747278, + "grad_norm": 2.069044623731319, + "learning_rate": 1.8485282065350237e-05, + "loss": 0.4185, + "step": 2552 + }, + { + "epoch": 0.20225787284610813, + "grad_norm": 1.5654862434381134, + "learning_rate": 1.848392396992641e-05, + "loss": 0.3382, + "step": 2553 + }, + { + "epoch": 0.20233709645474351, + "grad_norm": 1.7818155360512213, + "learning_rate": 1.8482565315877013e-05, + "loss": 0.4563, + "step": 2554 + }, + { + "epoch": 0.2024163200633789, + "grad_norm": 1.813607767893193, + "learning_rate": 1.8481206103291506e-05, + "loss": 0.4047, + "step": 2555 + }, + { + "epoch": 0.20249554367201425, + "grad_norm": 2.014937066860306, + "learning_rate": 1.8479846332259388e-05, + "loss": 0.4206, + "step": 2556 + }, + { + "epoch": 0.20257476728064963, + "grad_norm": 2.202476476404529, + "learning_rate": 1.847848600287019e-05, + "loss": 0.4824, + "step": 2557 + }, + { + "epoch": 0.202653990889285, + "grad_norm": 1.9251910412474011, + "learning_rate": 1.8477125115213484e-05, + "loss": 0.5128, + "step": 2558 + }, + { + "epoch": 0.20273321449792037, + "grad_norm": 1.8213524522171174, + "learning_rate": 1.8475763669378878e-05, + "loss": 0.4291, + "step": 2559 + }, + { + "epoch": 0.20281243810655575, + "grad_norm": 1.6161176638402914, + "learning_rate": 1.8474401665456016e-05, + "loss": 0.3686, + "step": 2560 + }, + { + "epoch": 0.20289166171519113, + "grad_norm": 1.9537352090698643, + "learning_rate": 1.8473039103534583e-05, + "loss": 0.4901, + "step": 2561 + }, + { + "epoch": 0.2029708853238265, + "grad_norm": 1.8927710174886234, + "learning_rate": 1.8471675983704295e-05, + "loss": 0.5194, + "step": 2562 + }, + { + "epoch": 0.20305010893246186, + "grad_norm": 1.8293950759224924, + "learning_rate": 1.8470312306054903e-05, + "loss": 0.3808, + "step": 2563 + }, + { + "epoch": 0.20312933254109725, + "grad_norm": 1.4585840390039595, + "learning_rate": 1.8468948070676205e-05, + "loss": 0.3287, + "step": 2564 + }, + { + "epoch": 0.20320855614973263, + "grad_norm": 1.8672224498645469, + "learning_rate": 1.8467583277658026e-05, + "loss": 0.3643, + "step": 2565 + }, + { + "epoch": 0.20328777975836798, + "grad_norm": 1.6749160631893312, + "learning_rate": 1.8466217927090232e-05, + "loss": 0.3501, + "step": 2566 + }, + { + "epoch": 0.20336700336700336, + "grad_norm": 2.2147584958701785, + "learning_rate": 1.8464852019062726e-05, + "loss": 0.4364, + "step": 2567 + }, + { + "epoch": 0.20344622697563874, + "grad_norm": 1.822011263780815, + "learning_rate": 1.846348555366544e-05, + "loss": 0.4391, + "step": 2568 + }, + { + "epoch": 0.20352545058427413, + "grad_norm": 1.8808694610123304, + "learning_rate": 1.8462118530988356e-05, + "loss": 0.5218, + "step": 2569 + }, + { + "epoch": 0.20360467419290948, + "grad_norm": 1.7285091505358392, + "learning_rate": 1.8460750951121487e-05, + "loss": 0.4514, + "step": 2570 + }, + { + "epoch": 0.20368389780154486, + "grad_norm": 1.7192328423533971, + "learning_rate": 1.8459382814154874e-05, + "loss": 0.4094, + "step": 2571 + }, + { + "epoch": 0.20376312141018024, + "grad_norm": 1.727594784011334, + "learning_rate": 1.845801412017861e-05, + "loss": 0.3623, + "step": 2572 + }, + { + "epoch": 0.2038423450188156, + "grad_norm": 1.7380797862194362, + "learning_rate": 1.845664486928281e-05, + "loss": 0.4399, + "step": 2573 + }, + { + "epoch": 0.20392156862745098, + "grad_norm": 1.821043741521068, + "learning_rate": 1.8455275061557643e-05, + "loss": 0.346, + "step": 2574 + }, + { + "epoch": 0.20400079223608636, + "grad_norm": 1.7039374485497736, + "learning_rate": 1.845390469709329e-05, + "loss": 0.4366, + "step": 2575 + }, + { + "epoch": 0.20408001584472174, + "grad_norm": 1.6809238091037046, + "learning_rate": 1.8452533775979992e-05, + "loss": 0.3451, + "step": 2576 + }, + { + "epoch": 0.2041592394533571, + "grad_norm": 2.0909572192450225, + "learning_rate": 1.845116229830802e-05, + "loss": 0.4011, + "step": 2577 + }, + { + "epoch": 0.20423846306199248, + "grad_norm": 1.5025030242150463, + "learning_rate": 1.8449790264167672e-05, + "loss": 0.281, + "step": 2578 + }, + { + "epoch": 0.20431768667062786, + "grad_norm": 1.911453706834977, + "learning_rate": 1.8448417673649292e-05, + "loss": 0.5453, + "step": 2579 + }, + { + "epoch": 0.2043969102792632, + "grad_norm": 1.955871959095756, + "learning_rate": 1.844704452684326e-05, + "loss": 0.3655, + "step": 2580 + }, + { + "epoch": 0.2044761338878986, + "grad_norm": 1.7339672713256262, + "learning_rate": 1.844567082383999e-05, + "loss": 0.4493, + "step": 2581 + }, + { + "epoch": 0.20455535749653397, + "grad_norm": 1.6377121197368745, + "learning_rate": 1.8444296564729935e-05, + "loss": 0.4747, + "step": 2582 + }, + { + "epoch": 0.20463458110516933, + "grad_norm": 1.4724385422967265, + "learning_rate": 1.8442921749603586e-05, + "loss": 0.3875, + "step": 2583 + }, + { + "epoch": 0.2047138047138047, + "grad_norm": 1.6695951246345508, + "learning_rate": 1.8441546378551457e-05, + "loss": 0.4559, + "step": 2584 + }, + { + "epoch": 0.2047930283224401, + "grad_norm": 2.36697265882477, + "learning_rate": 1.8440170451664122e-05, + "loss": 0.4361, + "step": 2585 + }, + { + "epoch": 0.20487225193107547, + "grad_norm": 1.8857696783758962, + "learning_rate": 1.8438793969032175e-05, + "loss": 0.4893, + "step": 2586 + }, + { + "epoch": 0.20495147553971083, + "grad_norm": 1.6431798521518761, + "learning_rate": 1.8437416930746248e-05, + "loss": 0.4379, + "step": 2587 + }, + { + "epoch": 0.2050306991483462, + "grad_norm": 1.3155267301350255, + "learning_rate": 1.8436039336897015e-05, + "loss": 0.2707, + "step": 2588 + }, + { + "epoch": 0.2051099227569816, + "grad_norm": 1.8913709165051489, + "learning_rate": 1.8434661187575183e-05, + "loss": 0.4872, + "step": 2589 + }, + { + "epoch": 0.20518914636561694, + "grad_norm": 1.5169523132561975, + "learning_rate": 1.8433282482871497e-05, + "loss": 0.3853, + "step": 2590 + }, + { + "epoch": 0.20526836997425232, + "grad_norm": 1.8298331090397917, + "learning_rate": 1.8431903222876737e-05, + "loss": 0.4041, + "step": 2591 + }, + { + "epoch": 0.2053475935828877, + "grad_norm": 1.603435168941413, + "learning_rate": 1.8430523407681723e-05, + "loss": 0.379, + "step": 2592 + }, + { + "epoch": 0.2054268171915231, + "grad_norm": 2.1434324550226522, + "learning_rate": 1.8429143037377305e-05, + "loss": 0.5042, + "step": 2593 + }, + { + "epoch": 0.20550604080015844, + "grad_norm": 1.9199817324725568, + "learning_rate": 1.8427762112054378e-05, + "loss": 0.4582, + "step": 2594 + }, + { + "epoch": 0.20558526440879382, + "grad_norm": 1.447826585214332, + "learning_rate": 1.842638063180387e-05, + "loss": 0.3226, + "step": 2595 + }, + { + "epoch": 0.2056644880174292, + "grad_norm": 1.8198817247789638, + "learning_rate": 1.8424998596716743e-05, + "loss": 0.454, + "step": 2596 + }, + { + "epoch": 0.20574371162606456, + "grad_norm": 1.659408602894229, + "learning_rate": 1.8423616006883994e-05, + "loss": 0.2472, + "step": 2597 + }, + { + "epoch": 0.20582293523469994, + "grad_norm": 1.7604255855472575, + "learning_rate": 1.8422232862396663e-05, + "loss": 0.3568, + "step": 2598 + }, + { + "epoch": 0.20590215884333532, + "grad_norm": 1.9274813194205778, + "learning_rate": 1.8420849163345824e-05, + "loss": 0.4587, + "step": 2599 + }, + { + "epoch": 0.20598138245197067, + "grad_norm": 2.561083447581465, + "learning_rate": 1.8419464909822585e-05, + "loss": 0.4281, + "step": 2600 + }, + { + "epoch": 0.20606060606060606, + "grad_norm": 1.962994187728574, + "learning_rate": 1.8418080101918095e-05, + "loss": 0.4124, + "step": 2601 + }, + { + "epoch": 0.20613982966924144, + "grad_norm": 1.368983002235622, + "learning_rate": 1.8416694739723535e-05, + "loss": 0.3553, + "step": 2602 + }, + { + "epoch": 0.20621905327787682, + "grad_norm": 2.3986732703255464, + "learning_rate": 1.841530882333012e-05, + "loss": 0.5015, + "step": 2603 + }, + { + "epoch": 0.20629827688651217, + "grad_norm": 1.9263548372278987, + "learning_rate": 1.8413922352829118e-05, + "loss": 0.3741, + "step": 2604 + }, + { + "epoch": 0.20637750049514755, + "grad_norm": 1.6655041187445303, + "learning_rate": 1.8412535328311813e-05, + "loss": 0.3234, + "step": 2605 + }, + { + "epoch": 0.20645672410378293, + "grad_norm": 1.963328820654345, + "learning_rate": 1.8411147749869536e-05, + "loss": 0.3981, + "step": 2606 + }, + { + "epoch": 0.2065359477124183, + "grad_norm": 1.8272096800893565, + "learning_rate": 1.840975961759365e-05, + "loss": 0.4043, + "step": 2607 + }, + { + "epoch": 0.20661517132105367, + "grad_norm": 1.900238551080896, + "learning_rate": 1.8408370931575556e-05, + "loss": 0.4265, + "step": 2608 + }, + { + "epoch": 0.20669439492968905, + "grad_norm": 1.7544930611735488, + "learning_rate": 1.84069816919067e-05, + "loss": 0.3841, + "step": 2609 + }, + { + "epoch": 0.20677361853832443, + "grad_norm": 1.7199774777341128, + "learning_rate": 1.8405591898678546e-05, + "loss": 0.3779, + "step": 2610 + }, + { + "epoch": 0.2068528421469598, + "grad_norm": 1.9442477804276905, + "learning_rate": 1.8404201551982612e-05, + "loss": 0.4593, + "step": 2611 + }, + { + "epoch": 0.20693206575559517, + "grad_norm": 1.8343352215609086, + "learning_rate": 1.8402810651910444e-05, + "loss": 0.3389, + "step": 2612 + }, + { + "epoch": 0.20701128936423055, + "grad_norm": 1.8315102033785051, + "learning_rate": 1.840141919855363e-05, + "loss": 0.55, + "step": 2613 + }, + { + "epoch": 0.2070905129728659, + "grad_norm": 1.849525054536266, + "learning_rate": 1.8400027192003782e-05, + "loss": 0.4543, + "step": 2614 + }, + { + "epoch": 0.20716973658150128, + "grad_norm": 1.6906378152268264, + "learning_rate": 1.8398634632352562e-05, + "loss": 0.4921, + "step": 2615 + }, + { + "epoch": 0.20724896019013667, + "grad_norm": 1.6673508123235308, + "learning_rate": 1.8397241519691667e-05, + "loss": 0.3689, + "step": 2616 + }, + { + "epoch": 0.20732818379877205, + "grad_norm": 1.6086499528209952, + "learning_rate": 1.839584785411282e-05, + "loss": 0.395, + "step": 2617 + }, + { + "epoch": 0.2074074074074074, + "grad_norm": 1.6810559848762459, + "learning_rate": 1.839445363570779e-05, + "loss": 0.3798, + "step": 2618 + }, + { + "epoch": 0.20748663101604278, + "grad_norm": 1.7597778783938374, + "learning_rate": 1.8393058864568383e-05, + "loss": 0.4065, + "step": 2619 + }, + { + "epoch": 0.20756585462467816, + "grad_norm": 1.6485520861819096, + "learning_rate": 1.839166354078643e-05, + "loss": 0.3813, + "step": 2620 + }, + { + "epoch": 0.20764507823331352, + "grad_norm": 1.5647611079734072, + "learning_rate": 1.8390267664453815e-05, + "loss": 0.4694, + "step": 2621 + }, + { + "epoch": 0.2077243018419489, + "grad_norm": 1.6148699561855906, + "learning_rate": 1.8388871235662442e-05, + "loss": 0.3077, + "step": 2622 + }, + { + "epoch": 0.20780352545058428, + "grad_norm": 1.8008911503103382, + "learning_rate": 1.8387474254504265e-05, + "loss": 0.3734, + "step": 2623 + }, + { + "epoch": 0.20788274905921963, + "grad_norm": 1.9166702698900357, + "learning_rate": 1.8386076721071265e-05, + "loss": 0.5234, + "step": 2624 + }, + { + "epoch": 0.20796197266785502, + "grad_norm": 2.03191312770713, + "learning_rate": 1.8384678635455467e-05, + "loss": 0.3718, + "step": 2625 + }, + { + "epoch": 0.2080411962764904, + "grad_norm": 1.5176904417514294, + "learning_rate": 1.838327999774892e-05, + "loss": 0.2921, + "step": 2626 + }, + { + "epoch": 0.20812041988512578, + "grad_norm": 1.7179604568536229, + "learning_rate": 1.838188080804373e-05, + "loss": 0.3821, + "step": 2627 + }, + { + "epoch": 0.20819964349376113, + "grad_norm": 1.889777973718155, + "learning_rate": 1.8380481066432014e-05, + "loss": 0.3609, + "step": 2628 + }, + { + "epoch": 0.20827886710239651, + "grad_norm": 1.774755717518551, + "learning_rate": 1.8379080773005947e-05, + "loss": 0.3565, + "step": 2629 + }, + { + "epoch": 0.2083580907110319, + "grad_norm": 1.5162758369751226, + "learning_rate": 1.8377679927857727e-05, + "loss": 0.3431, + "step": 2630 + }, + { + "epoch": 0.20843731431966725, + "grad_norm": 1.3024030545869711, + "learning_rate": 1.8376278531079594e-05, + "loss": 0.325, + "step": 2631 + }, + { + "epoch": 0.20851653792830263, + "grad_norm": 1.6443629547466578, + "learning_rate": 1.8374876582763828e-05, + "loss": 0.4108, + "step": 2632 + }, + { + "epoch": 0.208595761536938, + "grad_norm": 1.962643786954383, + "learning_rate": 1.8373474083002732e-05, + "loss": 0.4563, + "step": 2633 + }, + { + "epoch": 0.2086749851455734, + "grad_norm": 1.44752969385219, + "learning_rate": 1.837207103188866e-05, + "loss": 0.3447, + "step": 2634 + }, + { + "epoch": 0.20875420875420875, + "grad_norm": 1.793074278860773, + "learning_rate": 1.8370667429513992e-05, + "loss": 0.4019, + "step": 2635 + }, + { + "epoch": 0.20883343236284413, + "grad_norm": 2.170145185577981, + "learning_rate": 1.8369263275971153e-05, + "loss": 0.4981, + "step": 2636 + }, + { + "epoch": 0.2089126559714795, + "grad_norm": 1.672341674158899, + "learning_rate": 1.8367858571352603e-05, + "loss": 0.3985, + "step": 2637 + }, + { + "epoch": 0.20899187958011486, + "grad_norm": 1.8188718279869924, + "learning_rate": 1.8366453315750822e-05, + "loss": 0.434, + "step": 2638 + }, + { + "epoch": 0.20907110318875025, + "grad_norm": 1.9095407206261972, + "learning_rate": 1.8365047509258346e-05, + "loss": 0.4344, + "step": 2639 + }, + { + "epoch": 0.20915032679738563, + "grad_norm": 1.6736104526022306, + "learning_rate": 1.8363641151967747e-05, + "loss": 0.4236, + "step": 2640 + }, + { + "epoch": 0.20922955040602098, + "grad_norm": 1.6438871798450496, + "learning_rate": 1.836223424397162e-05, + "loss": 0.3421, + "step": 2641 + }, + { + "epoch": 0.20930877401465636, + "grad_norm": 1.8883599819287094, + "learning_rate": 1.8360826785362603e-05, + "loss": 0.3268, + "step": 2642 + }, + { + "epoch": 0.20938799762329174, + "grad_norm": 1.9001925714115733, + "learning_rate": 1.835941877623337e-05, + "loss": 0.4171, + "step": 2643 + }, + { + "epoch": 0.20946722123192713, + "grad_norm": 1.5832669542194682, + "learning_rate": 1.835801021667664e-05, + "loss": 0.3351, + "step": 2644 + }, + { + "epoch": 0.20954644484056248, + "grad_norm": 1.8703361485334735, + "learning_rate": 1.8356601106785148e-05, + "loss": 0.4406, + "step": 2645 + }, + { + "epoch": 0.20962566844919786, + "grad_norm": 1.595550029336583, + "learning_rate": 1.8355191446651687e-05, + "loss": 0.3235, + "step": 2646 + }, + { + "epoch": 0.20970489205783324, + "grad_norm": 1.686953363833381, + "learning_rate": 1.8353781236369065e-05, + "loss": 0.4281, + "step": 2647 + }, + { + "epoch": 0.2097841156664686, + "grad_norm": 1.8655691053204475, + "learning_rate": 1.8352370476030147e-05, + "loss": 0.3926, + "step": 2648 + }, + { + "epoch": 0.20986333927510398, + "grad_norm": 2.191073565044196, + "learning_rate": 1.8350959165727826e-05, + "loss": 0.4275, + "step": 2649 + }, + { + "epoch": 0.20994256288373936, + "grad_norm": 1.9816194150389066, + "learning_rate": 1.8349547305555023e-05, + "loss": 0.3713, + "step": 2650 + }, + { + "epoch": 0.21002178649237474, + "grad_norm": 1.653602573804687, + "learning_rate": 1.8348134895604708e-05, + "loss": 0.3924, + "step": 2651 + }, + { + "epoch": 0.2101010101010101, + "grad_norm": 1.7238723669521951, + "learning_rate": 1.8346721935969878e-05, + "loss": 0.4094, + "step": 2652 + }, + { + "epoch": 0.21018023370964548, + "grad_norm": 1.7249791325183572, + "learning_rate": 1.8345308426743568e-05, + "loss": 0.3891, + "step": 2653 + }, + { + "epoch": 0.21025945731828086, + "grad_norm": 1.8682894618405272, + "learning_rate": 1.8343894368018854e-05, + "loss": 0.4323, + "step": 2654 + }, + { + "epoch": 0.2103386809269162, + "grad_norm": 1.4840531948415532, + "learning_rate": 1.8342479759888844e-05, + "loss": 0.2546, + "step": 2655 + }, + { + "epoch": 0.2104179045355516, + "grad_norm": 1.9482152996128668, + "learning_rate": 1.8341064602446686e-05, + "loss": 0.416, + "step": 2656 + }, + { + "epoch": 0.21049712814418697, + "grad_norm": 1.535018377750315, + "learning_rate": 1.8339648895785556e-05, + "loss": 0.2934, + "step": 2657 + }, + { + "epoch": 0.21057635175282233, + "grad_norm": 1.8747616061324945, + "learning_rate": 1.8338232639998672e-05, + "loss": 0.3534, + "step": 2658 + }, + { + "epoch": 0.2106555753614577, + "grad_norm": 1.6466467667253228, + "learning_rate": 1.8336815835179295e-05, + "loss": 0.357, + "step": 2659 + }, + { + "epoch": 0.2107347989700931, + "grad_norm": 1.637187848391672, + "learning_rate": 1.8335398481420705e-05, + "loss": 0.4285, + "step": 2660 + }, + { + "epoch": 0.21081402257872847, + "grad_norm": 1.9957554368431647, + "learning_rate": 1.8333980578816234e-05, + "loss": 0.5636, + "step": 2661 + }, + { + "epoch": 0.21089324618736383, + "grad_norm": 1.63191350712576, + "learning_rate": 1.8332562127459242e-05, + "loss": 0.4423, + "step": 2662 + }, + { + "epoch": 0.2109724697959992, + "grad_norm": 1.754744359268684, + "learning_rate": 1.833114312744313e-05, + "loss": 0.4373, + "step": 2663 + }, + { + "epoch": 0.2110516934046346, + "grad_norm": 1.6104142975151547, + "learning_rate": 1.8329723578861328e-05, + "loss": 0.2937, + "step": 2664 + }, + { + "epoch": 0.21113091701326994, + "grad_norm": 1.7110792611516918, + "learning_rate": 1.8328303481807306e-05, + "loss": 0.3618, + "step": 2665 + }, + { + "epoch": 0.21121014062190532, + "grad_norm": 1.7434590964141203, + "learning_rate": 1.832688283637458e-05, + "loss": 0.4039, + "step": 2666 + }, + { + "epoch": 0.2112893642305407, + "grad_norm": 1.6515419905805533, + "learning_rate": 1.8325461642656676e-05, + "loss": 0.3125, + "step": 2667 + }, + { + "epoch": 0.2113685878391761, + "grad_norm": 1.5834906996870692, + "learning_rate": 1.832403990074719e-05, + "loss": 0.3741, + "step": 2668 + }, + { + "epoch": 0.21144781144781144, + "grad_norm": 1.788676311711513, + "learning_rate": 1.8322617610739726e-05, + "loss": 0.3731, + "step": 2669 + }, + { + "epoch": 0.21152703505644682, + "grad_norm": 1.6559201523777785, + "learning_rate": 1.8321194772727938e-05, + "loss": 0.3825, + "step": 2670 + }, + { + "epoch": 0.2116062586650822, + "grad_norm": 1.6989067194342768, + "learning_rate": 1.8319771386805514e-05, + "loss": 0.3829, + "step": 2671 + }, + { + "epoch": 0.21168548227371756, + "grad_norm": 1.4968904444907245, + "learning_rate": 1.8318347453066176e-05, + "loss": 0.3181, + "step": 2672 + }, + { + "epoch": 0.21176470588235294, + "grad_norm": 1.686285837004851, + "learning_rate": 1.8316922971603685e-05, + "loss": 0.41, + "step": 2673 + }, + { + "epoch": 0.21184392949098832, + "grad_norm": 1.7089345831056113, + "learning_rate": 1.8315497942511836e-05, + "loss": 0.4845, + "step": 2674 + }, + { + "epoch": 0.2119231530996237, + "grad_norm": 1.6623978142932054, + "learning_rate": 1.8314072365884455e-05, + "loss": 0.4018, + "step": 2675 + }, + { + "epoch": 0.21200237670825905, + "grad_norm": 1.7047144285076845, + "learning_rate": 1.831264624181542e-05, + "loss": 0.4098, + "step": 2676 + }, + { + "epoch": 0.21208160031689444, + "grad_norm": 1.622948788936126, + "learning_rate": 1.8311219570398618e-05, + "loss": 0.4357, + "step": 2677 + }, + { + "epoch": 0.21216082392552982, + "grad_norm": 1.7635284502071558, + "learning_rate": 1.8309792351728006e-05, + "loss": 0.3783, + "step": 2678 + }, + { + "epoch": 0.21224004753416517, + "grad_norm": 2.014714300841769, + "learning_rate": 1.830836458589755e-05, + "loss": 0.4123, + "step": 2679 + }, + { + "epoch": 0.21231927114280055, + "grad_norm": 1.7231774570827907, + "learning_rate": 1.8306936273001258e-05, + "loss": 0.3878, + "step": 2680 + }, + { + "epoch": 0.21239849475143593, + "grad_norm": 1.7366222996862124, + "learning_rate": 1.830550741313319e-05, + "loss": 0.3738, + "step": 2681 + }, + { + "epoch": 0.2124777183600713, + "grad_norm": 1.6061900708391434, + "learning_rate": 1.830407800638742e-05, + "loss": 0.3153, + "step": 2682 + }, + { + "epoch": 0.21255694196870667, + "grad_norm": 1.9188884305998029, + "learning_rate": 1.830264805285807e-05, + "loss": 0.3542, + "step": 2683 + }, + { + "epoch": 0.21263616557734205, + "grad_norm": 1.7415687603133443, + "learning_rate": 1.8301217552639294e-05, + "loss": 0.3654, + "step": 2684 + }, + { + "epoch": 0.21271538918597743, + "grad_norm": 1.8435612091661784, + "learning_rate": 1.8299786505825286e-05, + "loss": 0.4261, + "step": 2685 + }, + { + "epoch": 0.2127946127946128, + "grad_norm": 1.746758974317949, + "learning_rate": 1.8298354912510273e-05, + "loss": 0.3495, + "step": 2686 + }, + { + "epoch": 0.21287383640324817, + "grad_norm": 1.824189883935391, + "learning_rate": 1.8296922772788522e-05, + "loss": 0.5282, + "step": 2687 + }, + { + "epoch": 0.21295306001188355, + "grad_norm": 2.15997463525689, + "learning_rate": 1.8295490086754325e-05, + "loss": 0.4156, + "step": 2688 + }, + { + "epoch": 0.2130322836205189, + "grad_norm": 1.850894787197073, + "learning_rate": 1.829405685450202e-05, + "loss": 0.3915, + "step": 2689 + }, + { + "epoch": 0.21311150722915428, + "grad_norm": 1.5642180755623767, + "learning_rate": 1.8292623076125983e-05, + "loss": 0.4266, + "step": 2690 + }, + { + "epoch": 0.21319073083778967, + "grad_norm": 1.610509821557913, + "learning_rate": 1.8291188751720615e-05, + "loss": 0.3243, + "step": 2691 + }, + { + "epoch": 0.21326995444642505, + "grad_norm": 1.904450189224998, + "learning_rate": 1.828975388138036e-05, + "loss": 0.4101, + "step": 2692 + }, + { + "epoch": 0.2133491780550604, + "grad_norm": 1.5324099979749521, + "learning_rate": 1.8288318465199705e-05, + "loss": 0.3095, + "step": 2693 + }, + { + "epoch": 0.21342840166369578, + "grad_norm": 1.9523717857796727, + "learning_rate": 1.8286882503273157e-05, + "loss": 0.4412, + "step": 2694 + }, + { + "epoch": 0.21350762527233116, + "grad_norm": 2.105285197083128, + "learning_rate": 1.828544599569527e-05, + "loss": 0.3897, + "step": 2695 + }, + { + "epoch": 0.21358684888096652, + "grad_norm": 1.8330348790057602, + "learning_rate": 1.8284008942560634e-05, + "loss": 0.4494, + "step": 2696 + }, + { + "epoch": 0.2136660724896019, + "grad_norm": 1.7439570106157116, + "learning_rate": 1.8282571343963865e-05, + "loss": 0.4093, + "step": 2697 + }, + { + "epoch": 0.21374529609823728, + "grad_norm": 1.571739755819498, + "learning_rate": 1.8281133199999628e-05, + "loss": 0.314, + "step": 2698 + }, + { + "epoch": 0.21382451970687263, + "grad_norm": 2.1162420098285546, + "learning_rate": 1.8279694510762616e-05, + "loss": 0.5275, + "step": 2699 + }, + { + "epoch": 0.21390374331550802, + "grad_norm": 1.8263679132809585, + "learning_rate": 1.8278255276347563e-05, + "loss": 0.3863, + "step": 2700 + }, + { + "epoch": 0.2139829669241434, + "grad_norm": 1.6512850796616958, + "learning_rate": 1.8276815496849227e-05, + "loss": 0.3041, + "step": 2701 + }, + { + "epoch": 0.21406219053277878, + "grad_norm": 2.0732941465069255, + "learning_rate": 1.827537517236242e-05, + "loss": 0.5025, + "step": 2702 + }, + { + "epoch": 0.21414141414141413, + "grad_norm": 1.6156169966806242, + "learning_rate": 1.8273934302981975e-05, + "loss": 0.3564, + "step": 2703 + }, + { + "epoch": 0.21422063775004951, + "grad_norm": 1.54888986829638, + "learning_rate": 1.8272492888802767e-05, + "loss": 0.3598, + "step": 2704 + }, + { + "epoch": 0.2142998613586849, + "grad_norm": 1.694783413216878, + "learning_rate": 1.8271050929919707e-05, + "loss": 0.3489, + "step": 2705 + }, + { + "epoch": 0.21437908496732025, + "grad_norm": 1.8513059171948416, + "learning_rate": 1.8269608426427743e-05, + "loss": 0.4858, + "step": 2706 + }, + { + "epoch": 0.21445830857595563, + "grad_norm": 1.632221511025351, + "learning_rate": 1.8268165378421852e-05, + "loss": 0.4084, + "step": 2707 + }, + { + "epoch": 0.214537532184591, + "grad_norm": 1.4226988550174064, + "learning_rate": 1.826672178599706e-05, + "loss": 0.3306, + "step": 2708 + }, + { + "epoch": 0.2146167557932264, + "grad_norm": 1.984979993246484, + "learning_rate": 1.826527764924841e-05, + "loss": 0.456, + "step": 2709 + }, + { + "epoch": 0.21469597940186175, + "grad_norm": 2.0728136191318725, + "learning_rate": 1.8263832968271e-05, + "loss": 0.4243, + "step": 2710 + }, + { + "epoch": 0.21477520301049713, + "grad_norm": 1.7036106920163208, + "learning_rate": 1.826238774315995e-05, + "loss": 0.3183, + "step": 2711 + }, + { + "epoch": 0.2148544266191325, + "grad_norm": 1.6966715810963966, + "learning_rate": 1.8260941974010425e-05, + "loss": 0.3235, + "step": 2712 + }, + { + "epoch": 0.21493365022776786, + "grad_norm": 1.7073250040361634, + "learning_rate": 1.825949566091762e-05, + "loss": 0.282, + "step": 2713 + }, + { + "epoch": 0.21501287383640325, + "grad_norm": 1.7404582039089134, + "learning_rate": 1.8258048803976763e-05, + "loss": 0.3556, + "step": 2714 + }, + { + "epoch": 0.21509209744503863, + "grad_norm": 1.7623883626518861, + "learning_rate": 1.8256601403283133e-05, + "loss": 0.3379, + "step": 2715 + }, + { + "epoch": 0.215171321053674, + "grad_norm": 1.7377620999413776, + "learning_rate": 1.8255153458932028e-05, + "loss": 0.3402, + "step": 2716 + }, + { + "epoch": 0.21525054466230936, + "grad_norm": 1.9517174103467538, + "learning_rate": 1.825370497101879e-05, + "loss": 0.3924, + "step": 2717 + }, + { + "epoch": 0.21532976827094474, + "grad_norm": 2.3478029991604417, + "learning_rate": 1.825225593963879e-05, + "loss": 0.4048, + "step": 2718 + }, + { + "epoch": 0.21540899187958013, + "grad_norm": 1.6706776008472004, + "learning_rate": 1.8250806364887446e-05, + "loss": 0.3984, + "step": 2719 + }, + { + "epoch": 0.21548821548821548, + "grad_norm": 1.6604358040092195, + "learning_rate": 1.8249356246860205e-05, + "loss": 0.3053, + "step": 2720 + }, + { + "epoch": 0.21556743909685086, + "grad_norm": 1.7716968969404763, + "learning_rate": 1.8247905585652545e-05, + "loss": 0.4634, + "step": 2721 + }, + { + "epoch": 0.21564666270548624, + "grad_norm": 1.5601521197172752, + "learning_rate": 1.824645438135999e-05, + "loss": 0.3034, + "step": 2722 + }, + { + "epoch": 0.2157258863141216, + "grad_norm": 2.037816349771718, + "learning_rate": 1.8245002634078095e-05, + "loss": 0.4467, + "step": 2723 + }, + { + "epoch": 0.21580510992275698, + "grad_norm": 1.7916226037576128, + "learning_rate": 1.8243550343902447e-05, + "loss": 0.3722, + "step": 2724 + }, + { + "epoch": 0.21588433353139236, + "grad_norm": 1.5707068563122943, + "learning_rate": 1.8242097510928672e-05, + "loss": 0.4196, + "step": 2725 + }, + { + "epoch": 0.21596355714002774, + "grad_norm": 1.564041407140951, + "learning_rate": 1.824064413525244e-05, + "loss": 0.3884, + "step": 2726 + }, + { + "epoch": 0.2160427807486631, + "grad_norm": 1.7789757202395033, + "learning_rate": 1.823919021696944e-05, + "loss": 0.3709, + "step": 2727 + }, + { + "epoch": 0.21612200435729848, + "grad_norm": 1.802203886486031, + "learning_rate": 1.8237735756175408e-05, + "loss": 0.4548, + "step": 2728 + }, + { + "epoch": 0.21620122796593386, + "grad_norm": 1.4956295081667574, + "learning_rate": 1.8236280752966115e-05, + "loss": 0.3315, + "step": 2729 + }, + { + "epoch": 0.2162804515745692, + "grad_norm": 1.8126358749315155, + "learning_rate": 1.8234825207437365e-05, + "loss": 0.3911, + "step": 2730 + }, + { + "epoch": 0.2163596751832046, + "grad_norm": 1.6198279392032298, + "learning_rate": 1.8233369119685e-05, + "loss": 0.3624, + "step": 2731 + }, + { + "epoch": 0.21643889879183997, + "grad_norm": 1.7352450828221937, + "learning_rate": 1.8231912489804893e-05, + "loss": 0.3932, + "step": 2732 + }, + { + "epoch": 0.21651812240047535, + "grad_norm": 1.7283964724711196, + "learning_rate": 1.8230455317892957e-05, + "loss": 0.2665, + "step": 2733 + }, + { + "epoch": 0.2165973460091107, + "grad_norm": 1.680340402489577, + "learning_rate": 1.822899760404514e-05, + "loss": 0.3792, + "step": 2734 + }, + { + "epoch": 0.2166765696177461, + "grad_norm": 1.7220318125592986, + "learning_rate": 1.822753934835743e-05, + "loss": 0.4716, + "step": 2735 + }, + { + "epoch": 0.21675579322638147, + "grad_norm": 1.965210521744279, + "learning_rate": 1.822608055092584e-05, + "loss": 0.4193, + "step": 2736 + }, + { + "epoch": 0.21683501683501682, + "grad_norm": 1.9109587540279396, + "learning_rate": 1.8224621211846426e-05, + "loss": 0.5519, + "step": 2737 + }, + { + "epoch": 0.2169142404436522, + "grad_norm": 1.7767288955712042, + "learning_rate": 1.8223161331215285e-05, + "loss": 0.4097, + "step": 2738 + }, + { + "epoch": 0.2169934640522876, + "grad_norm": 1.9563216952486175, + "learning_rate": 1.822170090912853e-05, + "loss": 0.3947, + "step": 2739 + }, + { + "epoch": 0.21707268766092294, + "grad_norm": 1.4339421794399885, + "learning_rate": 1.8220239945682337e-05, + "loss": 0.2488, + "step": 2740 + }, + { + "epoch": 0.21715191126955832, + "grad_norm": 1.6193319390191063, + "learning_rate": 1.8218778440972893e-05, + "loss": 0.4883, + "step": 2741 + }, + { + "epoch": 0.2172311348781937, + "grad_norm": 1.5099276519890195, + "learning_rate": 1.8217316395096438e-05, + "loss": 0.3304, + "step": 2742 + }, + { + "epoch": 0.21731035848682909, + "grad_norm": 1.978902675384766, + "learning_rate": 1.8215853808149237e-05, + "loss": 0.3174, + "step": 2743 + }, + { + "epoch": 0.21738958209546444, + "grad_norm": 1.8533347508442921, + "learning_rate": 1.8214390680227588e-05, + "loss": 0.3355, + "step": 2744 + }, + { + "epoch": 0.21746880570409982, + "grad_norm": 2.0702849627853666, + "learning_rate": 1.8212927011427847e-05, + "loss": 0.4839, + "step": 2745 + }, + { + "epoch": 0.2175480293127352, + "grad_norm": 1.7558025106203288, + "learning_rate": 1.8211462801846375e-05, + "loss": 0.4176, + "step": 2746 + }, + { + "epoch": 0.21762725292137056, + "grad_norm": 2.3722666384626603, + "learning_rate": 1.820999805157959e-05, + "loss": 0.3824, + "step": 2747 + }, + { + "epoch": 0.21770647653000594, + "grad_norm": 1.704705465251105, + "learning_rate": 1.8208532760723937e-05, + "loss": 0.3603, + "step": 2748 + }, + { + "epoch": 0.21778570013864132, + "grad_norm": 2.014080694163564, + "learning_rate": 1.82070669293759e-05, + "loss": 0.4171, + "step": 2749 + }, + { + "epoch": 0.2178649237472767, + "grad_norm": 1.8452739910897396, + "learning_rate": 1.8205600557631995e-05, + "loss": 0.4338, + "step": 2750 + }, + { + "epoch": 0.21794414735591205, + "grad_norm": 1.8242509132529519, + "learning_rate": 1.8204133645588774e-05, + "loss": 0.4571, + "step": 2751 + }, + { + "epoch": 0.21802337096454744, + "grad_norm": 2.0740125033935666, + "learning_rate": 1.8202666193342834e-05, + "loss": 0.4194, + "step": 2752 + }, + { + "epoch": 0.21810259457318282, + "grad_norm": 2.1307657354917966, + "learning_rate": 1.8201198200990787e-05, + "loss": 0.3519, + "step": 2753 + }, + { + "epoch": 0.21818181818181817, + "grad_norm": 2.1252030817159673, + "learning_rate": 1.8199729668629303e-05, + "loss": 0.4235, + "step": 2754 + }, + { + "epoch": 0.21826104179045355, + "grad_norm": 1.8031173174969104, + "learning_rate": 1.8198260596355077e-05, + "loss": 0.3511, + "step": 2755 + }, + { + "epoch": 0.21834026539908893, + "grad_norm": 1.964246706829193, + "learning_rate": 1.8196790984264835e-05, + "loss": 0.5037, + "step": 2756 + }, + { + "epoch": 0.21841948900772432, + "grad_norm": 2.284880724024758, + "learning_rate": 1.8195320832455347e-05, + "loss": 0.4277, + "step": 2757 + }, + { + "epoch": 0.21849871261635967, + "grad_norm": 1.6023385673515118, + "learning_rate": 1.819385014102342e-05, + "loss": 0.3717, + "step": 2758 + }, + { + "epoch": 0.21857793622499505, + "grad_norm": 2.2202046474705766, + "learning_rate": 1.8192378910065882e-05, + "loss": 0.3839, + "step": 2759 + }, + { + "epoch": 0.21865715983363043, + "grad_norm": 1.6111925963514728, + "learning_rate": 1.8190907139679614e-05, + "loss": 0.364, + "step": 2760 + }, + { + "epoch": 0.21873638344226579, + "grad_norm": 2.3162717874605097, + "learning_rate": 1.8189434829961525e-05, + "loss": 0.4177, + "step": 2761 + }, + { + "epoch": 0.21881560705090117, + "grad_norm": 1.8462321518533145, + "learning_rate": 1.8187961981008554e-05, + "loss": 0.3343, + "step": 2762 + }, + { + "epoch": 0.21889483065953655, + "grad_norm": 1.8098231600461183, + "learning_rate": 1.8186488592917686e-05, + "loss": 0.3498, + "step": 2763 + }, + { + "epoch": 0.2189740542681719, + "grad_norm": 1.9612214156109713, + "learning_rate": 1.8185014665785936e-05, + "loss": 0.4596, + "step": 2764 + }, + { + "epoch": 0.21905327787680728, + "grad_norm": 1.7332165589616413, + "learning_rate": 1.8183540199710354e-05, + "loss": 0.3906, + "step": 2765 + }, + { + "epoch": 0.21913250148544267, + "grad_norm": 1.82242732245487, + "learning_rate": 1.8182065194788024e-05, + "loss": 0.4079, + "step": 2766 + }, + { + "epoch": 0.21921172509407805, + "grad_norm": 2.221807084570096, + "learning_rate": 1.8180589651116073e-05, + "loss": 0.4526, + "step": 2767 + }, + { + "epoch": 0.2192909487027134, + "grad_norm": 1.7656027807902084, + "learning_rate": 1.8179113568791656e-05, + "loss": 0.4196, + "step": 2768 + }, + { + "epoch": 0.21937017231134878, + "grad_norm": 2.0644646990812494, + "learning_rate": 1.8177636947911964e-05, + "loss": 0.5511, + "step": 2769 + }, + { + "epoch": 0.21944939591998416, + "grad_norm": 1.9476187845744724, + "learning_rate": 1.817615978857423e-05, + "loss": 0.4359, + "step": 2770 + }, + { + "epoch": 0.21952861952861952, + "grad_norm": 1.9106731416280978, + "learning_rate": 1.8174682090875713e-05, + "loss": 0.5247, + "step": 2771 + }, + { + "epoch": 0.2196078431372549, + "grad_norm": 1.6803802827016179, + "learning_rate": 1.8173203854913714e-05, + "loss": 0.361, + "step": 2772 + }, + { + "epoch": 0.21968706674589028, + "grad_norm": 1.6536349493697018, + "learning_rate": 1.817172508078557e-05, + "loss": 0.3716, + "step": 2773 + }, + { + "epoch": 0.21976629035452566, + "grad_norm": 1.8612504702984671, + "learning_rate": 1.817024576858865e-05, + "loss": 0.4081, + "step": 2774 + }, + { + "epoch": 0.21984551396316102, + "grad_norm": 2.1841022591359147, + "learning_rate": 1.8168765918420358e-05, + "loss": 0.4454, + "step": 2775 + }, + { + "epoch": 0.2199247375717964, + "grad_norm": 1.8181876620385793, + "learning_rate": 1.8167285530378134e-05, + "loss": 0.3965, + "step": 2776 + }, + { + "epoch": 0.22000396118043178, + "grad_norm": 1.6906033996564962, + "learning_rate": 1.8165804604559455e-05, + "loss": 0.3544, + "step": 2777 + }, + { + "epoch": 0.22008318478906713, + "grad_norm": 1.6687408918256246, + "learning_rate": 1.816432314106184e-05, + "loss": 0.3091, + "step": 2778 + }, + { + "epoch": 0.2201624083977025, + "grad_norm": 1.5988539486597675, + "learning_rate": 1.8162841139982827e-05, + "loss": 0.4036, + "step": 2779 + }, + { + "epoch": 0.2202416320063379, + "grad_norm": 1.8254445729767457, + "learning_rate": 1.816135860142e-05, + "loss": 0.378, + "step": 2780 + }, + { + "epoch": 0.22032085561497325, + "grad_norm": 1.634245166269344, + "learning_rate": 1.8159875525470984e-05, + "loss": 0.368, + "step": 2781 + }, + { + "epoch": 0.22040007922360863, + "grad_norm": 1.7144973469948466, + "learning_rate": 1.815839191223342e-05, + "loss": 0.3423, + "step": 2782 + }, + { + "epoch": 0.220479302832244, + "grad_norm": 2.1038388548711833, + "learning_rate": 1.815690776180501e-05, + "loss": 0.4847, + "step": 2783 + }, + { + "epoch": 0.2205585264408794, + "grad_norm": 1.9930947548245068, + "learning_rate": 1.815542307428347e-05, + "loss": 0.4618, + "step": 2784 + }, + { + "epoch": 0.22063775004951475, + "grad_norm": 1.8684329676605897, + "learning_rate": 1.8153937849766567e-05, + "loss": 0.3254, + "step": 2785 + }, + { + "epoch": 0.22071697365815013, + "grad_norm": 1.7788005760488583, + "learning_rate": 1.8152452088352084e-05, + "loss": 0.4312, + "step": 2786 + }, + { + "epoch": 0.2207961972667855, + "grad_norm": 1.9557187726641876, + "learning_rate": 1.8150965790137863e-05, + "loss": 0.4146, + "step": 2787 + }, + { + "epoch": 0.22087542087542086, + "grad_norm": 1.7557325322986894, + "learning_rate": 1.814947895522176e-05, + "loss": 0.3931, + "step": 2788 + }, + { + "epoch": 0.22095464448405625, + "grad_norm": 1.8565829578528064, + "learning_rate": 1.8147991583701685e-05, + "loss": 0.442, + "step": 2789 + }, + { + "epoch": 0.22103386809269163, + "grad_norm": 1.6727126799031429, + "learning_rate": 1.8146503675675568e-05, + "loss": 0.3671, + "step": 2790 + }, + { + "epoch": 0.221113091701327, + "grad_norm": 1.7328901425564518, + "learning_rate": 1.814501523124138e-05, + "loss": 0.4493, + "step": 2791 + }, + { + "epoch": 0.22119231530996236, + "grad_norm": 1.7676771472000006, + "learning_rate": 1.8143526250497134e-05, + "loss": 0.3841, + "step": 2792 + }, + { + "epoch": 0.22127153891859774, + "grad_norm": 1.74560570394591, + "learning_rate": 1.8142036733540868e-05, + "loss": 0.3427, + "step": 2793 + }, + { + "epoch": 0.22135076252723312, + "grad_norm": 2.306174376238809, + "learning_rate": 1.814054668047066e-05, + "loss": 0.5288, + "step": 2794 + }, + { + "epoch": 0.22142998613586848, + "grad_norm": 1.850007955548742, + "learning_rate": 1.8139056091384623e-05, + "loss": 0.4305, + "step": 2795 + }, + { + "epoch": 0.22150920974450386, + "grad_norm": 1.5887679853443888, + "learning_rate": 1.8137564966380905e-05, + "loss": 0.3317, + "step": 2796 + }, + { + "epoch": 0.22158843335313924, + "grad_norm": 2.2075788649659263, + "learning_rate": 1.813607330555769e-05, + "loss": 0.5342, + "step": 2797 + }, + { + "epoch": 0.2216676569617746, + "grad_norm": 1.9089197191582707, + "learning_rate": 1.8134581109013193e-05, + "loss": 0.4259, + "step": 2798 + }, + { + "epoch": 0.22174688057040998, + "grad_norm": 1.6996826694914087, + "learning_rate": 1.8133088376845675e-05, + "loss": 0.4196, + "step": 2799 + }, + { + "epoch": 0.22182610417904536, + "grad_norm": 1.6764364786181034, + "learning_rate": 1.8131595109153416e-05, + "loss": 0.3646, + "step": 2800 + }, + { + "epoch": 0.22190532778768074, + "grad_norm": 1.6130005178902656, + "learning_rate": 1.813010130603475e-05, + "loss": 0.3698, + "step": 2801 + }, + { + "epoch": 0.2219845513963161, + "grad_norm": 1.6594515205964468, + "learning_rate": 1.812860696758803e-05, + "loss": 0.3955, + "step": 2802 + }, + { + "epoch": 0.22206377500495147, + "grad_norm": 1.8965793100215345, + "learning_rate": 1.8127112093911655e-05, + "loss": 0.3717, + "step": 2803 + }, + { + "epoch": 0.22214299861358686, + "grad_norm": 1.926036864584367, + "learning_rate": 1.8125616685104055e-05, + "loss": 0.4084, + "step": 2804 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 2.416660042537055, + "learning_rate": 1.8124120741263692e-05, + "loss": 0.525, + "step": 2805 + }, + { + "epoch": 0.2223014458308576, + "grad_norm": 2.344228003804821, + "learning_rate": 1.812262426248907e-05, + "loss": 0.3643, + "step": 2806 + }, + { + "epoch": 0.22238066943949297, + "grad_norm": 1.4144655318403223, + "learning_rate": 1.8121127248878726e-05, + "loss": 0.3615, + "step": 2807 + }, + { + "epoch": 0.22245989304812835, + "grad_norm": 2.033237364665356, + "learning_rate": 1.8119629700531228e-05, + "loss": 0.4994, + "step": 2808 + }, + { + "epoch": 0.2225391166567637, + "grad_norm": 1.532489518273933, + "learning_rate": 1.8118131617545183e-05, + "loss": 0.3889, + "step": 2809 + }, + { + "epoch": 0.2226183402653991, + "grad_norm": 1.8949282816876172, + "learning_rate": 1.8116633000019233e-05, + "loss": 0.4014, + "step": 2810 + }, + { + "epoch": 0.22269756387403447, + "grad_norm": 1.7934510997309172, + "learning_rate": 1.8115133848052052e-05, + "loss": 0.4296, + "step": 2811 + }, + { + "epoch": 0.22277678748266982, + "grad_norm": 1.6066842324156256, + "learning_rate": 1.8113634161742356e-05, + "loss": 0.3598, + "step": 2812 + }, + { + "epoch": 0.2228560110913052, + "grad_norm": 1.6098163215461445, + "learning_rate": 1.8112133941188892e-05, + "loss": 0.3267, + "step": 2813 + }, + { + "epoch": 0.2229352346999406, + "grad_norm": 1.5888658804890845, + "learning_rate": 1.811063318649044e-05, + "loss": 0.4491, + "step": 2814 + }, + { + "epoch": 0.22301445830857597, + "grad_norm": 1.8857632465348089, + "learning_rate": 1.8109131897745823e-05, + "loss": 0.6655, + "step": 2815 + }, + { + "epoch": 0.22309368191721132, + "grad_norm": 1.4956635509226806, + "learning_rate": 1.8107630075053883e-05, + "loss": 0.3003, + "step": 2816 + }, + { + "epoch": 0.2231729055258467, + "grad_norm": 1.774628240460687, + "learning_rate": 1.810612771851352e-05, + "loss": 0.4392, + "step": 2817 + }, + { + "epoch": 0.22325212913448209, + "grad_norm": 1.6533402235959829, + "learning_rate": 1.8104624828223644e-05, + "loss": 0.3316, + "step": 2818 + }, + { + "epoch": 0.22333135274311744, + "grad_norm": 1.6262057208466414, + "learning_rate": 1.8103121404283222e-05, + "loss": 0.2931, + "step": 2819 + }, + { + "epoch": 0.22341057635175282, + "grad_norm": 2.148579006569893, + "learning_rate": 1.8101617446791248e-05, + "loss": 0.4501, + "step": 2820 + }, + { + "epoch": 0.2234897999603882, + "grad_norm": 1.870992128770809, + "learning_rate": 1.8100112955846746e-05, + "loss": 0.4275, + "step": 2821 + }, + { + "epoch": 0.22356902356902356, + "grad_norm": 1.539892464142909, + "learning_rate": 1.8098607931548782e-05, + "loss": 0.3532, + "step": 2822 + }, + { + "epoch": 0.22364824717765894, + "grad_norm": 1.9338981499494745, + "learning_rate": 1.8097102373996453e-05, + "loss": 0.3456, + "step": 2823 + }, + { + "epoch": 0.22372747078629432, + "grad_norm": 2.076227453999808, + "learning_rate": 1.809559628328889e-05, + "loss": 0.4136, + "step": 2824 + }, + { + "epoch": 0.2238066943949297, + "grad_norm": 1.7753133288999525, + "learning_rate": 1.8094089659525274e-05, + "loss": 0.3267, + "step": 2825 + }, + { + "epoch": 0.22388591800356505, + "grad_norm": 1.7890022175119427, + "learning_rate": 1.8092582502804793e-05, + "loss": 0.3531, + "step": 2826 + }, + { + "epoch": 0.22396514161220044, + "grad_norm": 1.8333210772697812, + "learning_rate": 1.8091074813226696e-05, + "loss": 0.3815, + "step": 2827 + }, + { + "epoch": 0.22404436522083582, + "grad_norm": 1.748560068670978, + "learning_rate": 1.8089566590890253e-05, + "loss": 0.3506, + "step": 2828 + }, + { + "epoch": 0.22412358882947117, + "grad_norm": 1.5594139065430026, + "learning_rate": 1.8088057835894775e-05, + "loss": 0.2637, + "step": 2829 + }, + { + "epoch": 0.22420281243810655, + "grad_norm": 1.4579240095448367, + "learning_rate": 1.8086548548339604e-05, + "loss": 0.3963, + "step": 2830 + }, + { + "epoch": 0.22428203604674193, + "grad_norm": 1.805063003062504, + "learning_rate": 1.8085038728324123e-05, + "loss": 0.5023, + "step": 2831 + }, + { + "epoch": 0.22436125965537732, + "grad_norm": 1.5288625833991882, + "learning_rate": 1.8083528375947744e-05, + "loss": 0.3372, + "step": 2832 + }, + { + "epoch": 0.22444048326401267, + "grad_norm": 1.8536161647210274, + "learning_rate": 1.808201749130992e-05, + "loss": 0.4984, + "step": 2833 + }, + { + "epoch": 0.22451970687264805, + "grad_norm": 1.7108498254541658, + "learning_rate": 1.8080506074510128e-05, + "loss": 0.4324, + "step": 2834 + }, + { + "epoch": 0.22459893048128343, + "grad_norm": 1.6716518554392972, + "learning_rate": 1.8078994125647896e-05, + "loss": 0.3309, + "step": 2835 + }, + { + "epoch": 0.22467815408991879, + "grad_norm": 1.6324568286415593, + "learning_rate": 1.807748164482277e-05, + "loss": 0.2995, + "step": 2836 + }, + { + "epoch": 0.22475737769855417, + "grad_norm": 1.75501913264354, + "learning_rate": 1.8075968632134343e-05, + "loss": 0.3935, + "step": 2837 + }, + { + "epoch": 0.22483660130718955, + "grad_norm": 1.409315770587425, + "learning_rate": 1.8074455087682247e-05, + "loss": 0.3316, + "step": 2838 + }, + { + "epoch": 0.2249158249158249, + "grad_norm": 1.6932029895477951, + "learning_rate": 1.8072941011566133e-05, + "loss": 0.3507, + "step": 2839 + }, + { + "epoch": 0.22499504852446028, + "grad_norm": 2.0566322095679914, + "learning_rate": 1.8071426403885698e-05, + "loss": 0.4278, + "step": 2840 + }, + { + "epoch": 0.22507427213309567, + "grad_norm": 1.9683848925073446, + "learning_rate": 1.8069911264740667e-05, + "loss": 0.4925, + "step": 2841 + }, + { + "epoch": 0.22515349574173105, + "grad_norm": 1.3995831132540693, + "learning_rate": 1.8068395594230815e-05, + "loss": 0.3162, + "step": 2842 + }, + { + "epoch": 0.2252327193503664, + "grad_norm": 1.5986686953594085, + "learning_rate": 1.8066879392455932e-05, + "loss": 0.3469, + "step": 2843 + }, + { + "epoch": 0.22531194295900178, + "grad_norm": 1.7270814562069732, + "learning_rate": 1.8065362659515856e-05, + "loss": 0.3735, + "step": 2844 + }, + { + "epoch": 0.22539116656763716, + "grad_norm": 1.3954595344249512, + "learning_rate": 1.806384539551046e-05, + "loss": 0.2429, + "step": 2845 + }, + { + "epoch": 0.22547039017627252, + "grad_norm": 1.7833586740106087, + "learning_rate": 1.8062327600539643e-05, + "loss": 0.4897, + "step": 2846 + }, + { + "epoch": 0.2255496137849079, + "grad_norm": 1.4157658693692063, + "learning_rate": 1.8060809274703352e-05, + "loss": 0.3503, + "step": 2847 + }, + { + "epoch": 0.22562883739354328, + "grad_norm": 1.5738266902854912, + "learning_rate": 1.805929041810155e-05, + "loss": 0.3345, + "step": 2848 + }, + { + "epoch": 0.22570806100217866, + "grad_norm": 1.6559105230974744, + "learning_rate": 1.8057771030834255e-05, + "loss": 0.3446, + "step": 2849 + }, + { + "epoch": 0.22578728461081402, + "grad_norm": 1.6637403875726988, + "learning_rate": 1.8056251113001508e-05, + "loss": 0.4226, + "step": 2850 + }, + { + "epoch": 0.2258665082194494, + "grad_norm": 1.9436512872529066, + "learning_rate": 1.8054730664703393e-05, + "loss": 0.4183, + "step": 2851 + }, + { + "epoch": 0.22594573182808478, + "grad_norm": 2.232681603583807, + "learning_rate": 1.8053209686040017e-05, + "loss": 0.434, + "step": 2852 + }, + { + "epoch": 0.22602495543672013, + "grad_norm": 1.4864096813534597, + "learning_rate": 1.8051688177111532e-05, + "loss": 0.2568, + "step": 2853 + }, + { + "epoch": 0.2261041790453555, + "grad_norm": 1.3801866271700975, + "learning_rate": 1.805016613801813e-05, + "loss": 0.2735, + "step": 2854 + }, + { + "epoch": 0.2261834026539909, + "grad_norm": 1.6727955446182847, + "learning_rate": 1.8048643568860015e-05, + "loss": 0.4645, + "step": 2855 + }, + { + "epoch": 0.22626262626262628, + "grad_norm": 1.6927497343517535, + "learning_rate": 1.804712046973745e-05, + "loss": 0.408, + "step": 2856 + }, + { + "epoch": 0.22634184987126163, + "grad_norm": 2.1394470654309297, + "learning_rate": 1.8045596840750722e-05, + "loss": 0.5207, + "step": 2857 + }, + { + "epoch": 0.226421073479897, + "grad_norm": 1.9842827002618435, + "learning_rate": 1.804407268200016e-05, + "loss": 0.6046, + "step": 2858 + }, + { + "epoch": 0.2265002970885324, + "grad_norm": 1.6780279488009289, + "learning_rate": 1.8042547993586114e-05, + "loss": 0.3369, + "step": 2859 + }, + { + "epoch": 0.22657952069716775, + "grad_norm": 1.704333753821867, + "learning_rate": 1.8041022775608977e-05, + "loss": 0.3989, + "step": 2860 + }, + { + "epoch": 0.22665874430580313, + "grad_norm": 1.7536418479352935, + "learning_rate": 1.803949702816919e-05, + "loss": 0.4659, + "step": 2861 + }, + { + "epoch": 0.2267379679144385, + "grad_norm": 1.6839339653323206, + "learning_rate": 1.80379707513672e-05, + "loss": 0.439, + "step": 2862 + }, + { + "epoch": 0.22681719152307386, + "grad_norm": 1.7368961407423944, + "learning_rate": 1.8036443945303514e-05, + "loss": 0.3599, + "step": 2863 + }, + { + "epoch": 0.22689641513170924, + "grad_norm": 1.8001728866348243, + "learning_rate": 1.8034916610078665e-05, + "loss": 0.4409, + "step": 2864 + }, + { + "epoch": 0.22697563874034463, + "grad_norm": 1.6520122965800768, + "learning_rate": 1.8033388745793218e-05, + "loss": 0.4133, + "step": 2865 + }, + { + "epoch": 0.22705486234898, + "grad_norm": 2.157868520460293, + "learning_rate": 1.8031860352547777e-05, + "loss": 0.5633, + "step": 2866 + }, + { + "epoch": 0.22713408595761536, + "grad_norm": 1.9273442565279941, + "learning_rate": 1.8030331430442974e-05, + "loss": 0.4675, + "step": 2867 + }, + { + "epoch": 0.22721330956625074, + "grad_norm": 1.5075170720673747, + "learning_rate": 1.8028801979579487e-05, + "loss": 0.4169, + "step": 2868 + }, + { + "epoch": 0.22729253317488612, + "grad_norm": 2.1337842069408945, + "learning_rate": 1.8027272000058028e-05, + "loss": 0.4258, + "step": 2869 + }, + { + "epoch": 0.22737175678352148, + "grad_norm": 1.5442012032111099, + "learning_rate": 1.8025741491979326e-05, + "loss": 0.3592, + "step": 2870 + }, + { + "epoch": 0.22745098039215686, + "grad_norm": 1.5826346077276343, + "learning_rate": 1.8024210455444168e-05, + "loss": 0.4305, + "step": 2871 + }, + { + "epoch": 0.22753020400079224, + "grad_norm": 1.6139297922178206, + "learning_rate": 1.8022678890553364e-05, + "loss": 0.4016, + "step": 2872 + }, + { + "epoch": 0.22760942760942762, + "grad_norm": 1.6565453135192383, + "learning_rate": 1.8021146797407752e-05, + "loss": 0.397, + "step": 2873 + }, + { + "epoch": 0.22768865121806298, + "grad_norm": 1.7703330209919277, + "learning_rate": 1.801961417610822e-05, + "loss": 0.4182, + "step": 2874 + }, + { + "epoch": 0.22776787482669836, + "grad_norm": 2.0172280662088165, + "learning_rate": 1.801808102675568e-05, + "loss": 0.461, + "step": 2875 + }, + { + "epoch": 0.22784709843533374, + "grad_norm": 1.5261982453685994, + "learning_rate": 1.801654734945109e-05, + "loss": 0.293, + "step": 2876 + }, + { + "epoch": 0.2279263220439691, + "grad_norm": 1.7126641685235788, + "learning_rate": 1.801501314429543e-05, + "loss": 0.3706, + "step": 2877 + }, + { + "epoch": 0.22800554565260447, + "grad_norm": 1.698682478180822, + "learning_rate": 1.801347841138972e-05, + "loss": 0.3803, + "step": 2878 + }, + { + "epoch": 0.22808476926123986, + "grad_norm": 1.7174235401223599, + "learning_rate": 1.8011943150835013e-05, + "loss": 0.3983, + "step": 2879 + }, + { + "epoch": 0.2281639928698752, + "grad_norm": 1.8542662705192385, + "learning_rate": 1.80104073627324e-05, + "loss": 0.3471, + "step": 2880 + }, + { + "epoch": 0.2282432164785106, + "grad_norm": 1.936275162929922, + "learning_rate": 1.8008871047183005e-05, + "loss": 0.392, + "step": 2881 + }, + { + "epoch": 0.22832244008714597, + "grad_norm": 1.5733888253616128, + "learning_rate": 1.800733420428799e-05, + "loss": 0.2975, + "step": 2882 + }, + { + "epoch": 0.22840166369578135, + "grad_norm": 1.8543446428176829, + "learning_rate": 1.8005796834148545e-05, + "loss": 0.2992, + "step": 2883 + }, + { + "epoch": 0.2284808873044167, + "grad_norm": 1.7376819187485757, + "learning_rate": 1.8004258936865902e-05, + "loss": 0.4457, + "step": 2884 + }, + { + "epoch": 0.2285601109130521, + "grad_norm": 1.856015607302783, + "learning_rate": 1.800272051254132e-05, + "loss": 0.3806, + "step": 2885 + }, + { + "epoch": 0.22863933452168747, + "grad_norm": 1.9483049095172358, + "learning_rate": 1.80011815612761e-05, + "loss": 0.4488, + "step": 2886 + }, + { + "epoch": 0.22871855813032282, + "grad_norm": 1.5645732638142311, + "learning_rate": 1.7999642083171576e-05, + "loss": 0.3392, + "step": 2887 + }, + { + "epoch": 0.2287977817389582, + "grad_norm": 1.5651473662952953, + "learning_rate": 1.799810207832911e-05, + "loss": 0.3407, + "step": 2888 + }, + { + "epoch": 0.2288770053475936, + "grad_norm": 1.7473844301113626, + "learning_rate": 1.7996561546850105e-05, + "loss": 0.399, + "step": 2889 + }, + { + "epoch": 0.22895622895622897, + "grad_norm": 1.4118640375580034, + "learning_rate": 1.7995020488836e-05, + "loss": 0.2476, + "step": 2890 + }, + { + "epoch": 0.22903545256486432, + "grad_norm": 1.9536733110432936, + "learning_rate": 1.799347890438827e-05, + "loss": 0.4379, + "step": 2891 + }, + { + "epoch": 0.2291146761734997, + "grad_norm": 1.677645126704345, + "learning_rate": 1.799193679360841e-05, + "loss": 0.3418, + "step": 2892 + }, + { + "epoch": 0.22919389978213509, + "grad_norm": 1.3672033850914929, + "learning_rate": 1.799039415659797e-05, + "loss": 0.314, + "step": 2893 + }, + { + "epoch": 0.22927312339077044, + "grad_norm": 1.7380627763572427, + "learning_rate": 1.798885099345852e-05, + "loss": 0.3811, + "step": 2894 + }, + { + "epoch": 0.22935234699940582, + "grad_norm": 1.9222401112288479, + "learning_rate": 1.7987307304291676e-05, + "loss": 0.3831, + "step": 2895 + }, + { + "epoch": 0.2294315706080412, + "grad_norm": 1.758899882712589, + "learning_rate": 1.7985763089199073e-05, + "loss": 0.4029, + "step": 2896 + }, + { + "epoch": 0.22951079421667656, + "grad_norm": 1.4954132267643259, + "learning_rate": 1.79842183482824e-05, + "loss": 0.3289, + "step": 2897 + }, + { + "epoch": 0.22959001782531194, + "grad_norm": 1.4396245519738804, + "learning_rate": 1.7982673081643364e-05, + "loss": 0.254, + "step": 2898 + }, + { + "epoch": 0.22966924143394732, + "grad_norm": 1.610872647853128, + "learning_rate": 1.7981127289383718e-05, + "loss": 0.3171, + "step": 2899 + }, + { + "epoch": 0.2297484650425827, + "grad_norm": 1.9379536913485538, + "learning_rate": 1.797958097160524e-05, + "loss": 0.4497, + "step": 2900 + }, + { + "epoch": 0.22982768865121805, + "grad_norm": 1.7177603196683155, + "learning_rate": 1.797803412840975e-05, + "loss": 0.3667, + "step": 2901 + }, + { + "epoch": 0.22990691225985344, + "grad_norm": 1.5813242064588038, + "learning_rate": 1.7976486759899103e-05, + "loss": 0.3622, + "step": 2902 + }, + { + "epoch": 0.22998613586848882, + "grad_norm": 1.9633317059023883, + "learning_rate": 1.797493886617518e-05, + "loss": 0.3923, + "step": 2903 + }, + { + "epoch": 0.23006535947712417, + "grad_norm": 1.4550866977961818, + "learning_rate": 1.797339044733991e-05, + "loss": 0.3863, + "step": 2904 + }, + { + "epoch": 0.23014458308575955, + "grad_norm": 1.7266205264993428, + "learning_rate": 1.797184150349524e-05, + "loss": 0.4353, + "step": 2905 + }, + { + "epoch": 0.23022380669439493, + "grad_norm": 1.5303695192337168, + "learning_rate": 1.7970292034743172e-05, + "loss": 0.3354, + "step": 2906 + }, + { + "epoch": 0.23030303030303031, + "grad_norm": 1.7882675421821683, + "learning_rate": 1.7968742041185718e-05, + "loss": 0.3862, + "step": 2907 + }, + { + "epoch": 0.23038225391166567, + "grad_norm": 1.9859746256308541, + "learning_rate": 1.7967191522924946e-05, + "loss": 0.4517, + "step": 2908 + }, + { + "epoch": 0.23046147752030105, + "grad_norm": 1.730100149151367, + "learning_rate": 1.7965640480062945e-05, + "loss": 0.4266, + "step": 2909 + }, + { + "epoch": 0.23054070112893643, + "grad_norm": 1.6802458168425851, + "learning_rate": 1.796408891270185e-05, + "loss": 0.4475, + "step": 2910 + }, + { + "epoch": 0.23061992473757179, + "grad_norm": 1.860548057412297, + "learning_rate": 1.7962536820943822e-05, + "loss": 0.3543, + "step": 2911 + }, + { + "epoch": 0.23069914834620717, + "grad_norm": 2.192719843354401, + "learning_rate": 1.7960984204891055e-05, + "loss": 0.578, + "step": 2912 + }, + { + "epoch": 0.23077837195484255, + "grad_norm": 1.436751565769236, + "learning_rate": 1.7959431064645786e-05, + "loss": 0.3694, + "step": 2913 + }, + { + "epoch": 0.23085759556347793, + "grad_norm": 1.8045081227975395, + "learning_rate": 1.7957877400310275e-05, + "loss": 0.4476, + "step": 2914 + }, + { + "epoch": 0.23093681917211328, + "grad_norm": 1.808268774958562, + "learning_rate": 1.7956323211986833e-05, + "loss": 0.3806, + "step": 2915 + }, + { + "epoch": 0.23101604278074866, + "grad_norm": 1.6331590983584408, + "learning_rate": 1.795476849977779e-05, + "loss": 0.3581, + "step": 2916 + }, + { + "epoch": 0.23109526638938405, + "grad_norm": 1.8663096504901544, + "learning_rate": 1.7953213263785513e-05, + "loss": 0.3988, + "step": 2917 + }, + { + "epoch": 0.2311744899980194, + "grad_norm": 1.4572632980616669, + "learning_rate": 1.7951657504112416e-05, + "loss": 0.3965, + "step": 2918 + }, + { + "epoch": 0.23125371360665478, + "grad_norm": 1.926773228579224, + "learning_rate": 1.795010122086093e-05, + "loss": 0.4426, + "step": 2919 + }, + { + "epoch": 0.23133293721529016, + "grad_norm": 1.5085816174035294, + "learning_rate": 1.7948544414133534e-05, + "loss": 0.3722, + "step": 2920 + }, + { + "epoch": 0.23141216082392552, + "grad_norm": 1.3529200149568625, + "learning_rate": 1.7946987084032733e-05, + "loss": 0.3335, + "step": 2921 + }, + { + "epoch": 0.2314913844325609, + "grad_norm": 1.7407449660299779, + "learning_rate": 1.794542923066107e-05, + "loss": 0.3561, + "step": 2922 + }, + { + "epoch": 0.23157060804119628, + "grad_norm": 1.7414893197132069, + "learning_rate": 1.7943870854121126e-05, + "loss": 0.3892, + "step": 2923 + }, + { + "epoch": 0.23164983164983166, + "grad_norm": 1.7620768326162055, + "learning_rate": 1.794231195451551e-05, + "loss": 0.3609, + "step": 2924 + }, + { + "epoch": 0.23172905525846701, + "grad_norm": 2.054803373558721, + "learning_rate": 1.7940752531946867e-05, + "loss": 0.4284, + "step": 2925 + }, + { + "epoch": 0.2318082788671024, + "grad_norm": 1.6818791261292518, + "learning_rate": 1.793919258651788e-05, + "loss": 0.4182, + "step": 2926 + }, + { + "epoch": 0.23188750247573778, + "grad_norm": 1.7027782079665075, + "learning_rate": 1.7937632118331255e-05, + "loss": 0.3961, + "step": 2927 + }, + { + "epoch": 0.23196672608437313, + "grad_norm": 2.065425586137854, + "learning_rate": 1.7936071127489755e-05, + "loss": 0.4461, + "step": 2928 + }, + { + "epoch": 0.2320459496930085, + "grad_norm": 1.5259608841093395, + "learning_rate": 1.7934509614096156e-05, + "loss": 0.328, + "step": 2929 + }, + { + "epoch": 0.2321251733016439, + "grad_norm": 1.7155426004156702, + "learning_rate": 1.7932947578253273e-05, + "loss": 0.3728, + "step": 2930 + }, + { + "epoch": 0.23220439691027928, + "grad_norm": 1.6718865487410788, + "learning_rate": 1.793138502006397e-05, + "loss": 0.3523, + "step": 2931 + }, + { + "epoch": 0.23228362051891463, + "grad_norm": 1.4028943894615473, + "learning_rate": 1.792982193963112e-05, + "loss": 0.2667, + "step": 2932 + }, + { + "epoch": 0.23236284412755, + "grad_norm": 1.9196503526629345, + "learning_rate": 1.7928258337057657e-05, + "loss": 0.4449, + "step": 2933 + }, + { + "epoch": 0.2324420677361854, + "grad_norm": 1.7750915914790288, + "learning_rate": 1.792669421244653e-05, + "loss": 0.3895, + "step": 2934 + }, + { + "epoch": 0.23252129134482075, + "grad_norm": 1.9564237157450146, + "learning_rate": 1.7925129565900728e-05, + "loss": 0.4574, + "step": 2935 + }, + { + "epoch": 0.23260051495345613, + "grad_norm": 1.7732758780228715, + "learning_rate": 1.792356439752328e-05, + "loss": 0.4024, + "step": 2936 + }, + { + "epoch": 0.2326797385620915, + "grad_norm": 1.5978501727914904, + "learning_rate": 1.792199870741724e-05, + "loss": 0.3584, + "step": 2937 + }, + { + "epoch": 0.23275896217072686, + "grad_norm": 1.7155700712566424, + "learning_rate": 1.79204324956857e-05, + "loss": 0.4193, + "step": 2938 + }, + { + "epoch": 0.23283818577936224, + "grad_norm": 1.5635593332620223, + "learning_rate": 1.7918865762431794e-05, + "loss": 0.3368, + "step": 2939 + }, + { + "epoch": 0.23291740938799763, + "grad_norm": 1.5212810402499088, + "learning_rate": 1.7917298507758684e-05, + "loss": 0.3131, + "step": 2940 + }, + { + "epoch": 0.232996632996633, + "grad_norm": 1.569751981873757, + "learning_rate": 1.7915730731769558e-05, + "loss": 0.3247, + "step": 2941 + }, + { + "epoch": 0.23307585660526836, + "grad_norm": 1.7934693993413744, + "learning_rate": 1.7914162434567653e-05, + "loss": 0.4306, + "step": 2942 + }, + { + "epoch": 0.23315508021390374, + "grad_norm": 1.8631704579174753, + "learning_rate": 1.791259361625623e-05, + "loss": 0.4466, + "step": 2943 + }, + { + "epoch": 0.23323430382253912, + "grad_norm": 1.5854774008709904, + "learning_rate": 1.7911024276938595e-05, + "loss": 0.3638, + "step": 2944 + }, + { + "epoch": 0.23331352743117448, + "grad_norm": 1.8713170820793896, + "learning_rate": 1.7909454416718075e-05, + "loss": 0.3622, + "step": 2945 + }, + { + "epoch": 0.23339275103980986, + "grad_norm": 1.7270872899188183, + "learning_rate": 1.790788403569804e-05, + "loss": 0.419, + "step": 2946 + }, + { + "epoch": 0.23347197464844524, + "grad_norm": 1.7276439062271296, + "learning_rate": 1.7906313133981887e-05, + "loss": 0.4048, + "step": 2947 + }, + { + "epoch": 0.23355119825708062, + "grad_norm": 1.830198475896751, + "learning_rate": 1.7904741711673064e-05, + "loss": 0.4446, + "step": 2948 + }, + { + "epoch": 0.23363042186571598, + "grad_norm": 1.2875092236252006, + "learning_rate": 1.790316976887503e-05, + "loss": 0.3, + "step": 2949 + }, + { + "epoch": 0.23370964547435136, + "grad_norm": 1.6726229280516625, + "learning_rate": 1.7901597305691294e-05, + "loss": 0.3287, + "step": 2950 + }, + { + "epoch": 0.23378886908298674, + "grad_norm": 1.6690163118178876, + "learning_rate": 1.7900024322225394e-05, + "loss": 0.3246, + "step": 2951 + }, + { + "epoch": 0.2338680926916221, + "grad_norm": 1.8325950046398163, + "learning_rate": 1.789845081858091e-05, + "loss": 0.4545, + "step": 2952 + }, + { + "epoch": 0.23394731630025747, + "grad_norm": 1.5981268543349332, + "learning_rate": 1.7896876794861443e-05, + "loss": 0.3624, + "step": 2953 + }, + { + "epoch": 0.23402653990889286, + "grad_norm": 1.5063216184604684, + "learning_rate": 1.7895302251170636e-05, + "loss": 0.3761, + "step": 2954 + }, + { + "epoch": 0.23410576351752824, + "grad_norm": 1.6326728612627766, + "learning_rate": 1.789372718761216e-05, + "loss": 0.3614, + "step": 2955 + }, + { + "epoch": 0.2341849871261636, + "grad_norm": 1.7282467328344246, + "learning_rate": 1.7892151604289738e-05, + "loss": 0.5239, + "step": 2956 + }, + { + "epoch": 0.23426421073479897, + "grad_norm": 1.701786411258173, + "learning_rate": 1.7890575501307105e-05, + "loss": 0.3916, + "step": 2957 + }, + { + "epoch": 0.23434343434343435, + "grad_norm": 1.5787622454027879, + "learning_rate": 1.7888998878768045e-05, + "loss": 0.3971, + "step": 2958 + }, + { + "epoch": 0.2344226579520697, + "grad_norm": 1.5276091746722464, + "learning_rate": 1.7887421736776364e-05, + "loss": 0.2233, + "step": 2959 + }, + { + "epoch": 0.2345018815607051, + "grad_norm": 1.7421391705065867, + "learning_rate": 1.7885844075435915e-05, + "loss": 0.3938, + "step": 2960 + }, + { + "epoch": 0.23458110516934047, + "grad_norm": 1.8126659099638778, + "learning_rate": 1.788426589485058e-05, + "loss": 0.4188, + "step": 2961 + }, + { + "epoch": 0.23466032877797582, + "grad_norm": 1.9265067213446623, + "learning_rate": 1.788268719512427e-05, + "loss": 0.4063, + "step": 2962 + }, + { + "epoch": 0.2347395523866112, + "grad_norm": 1.4853653300698935, + "learning_rate": 1.788110797636094e-05, + "loss": 0.3921, + "step": 2963 + }, + { + "epoch": 0.2348187759952466, + "grad_norm": 2.1927828779338037, + "learning_rate": 1.7879528238664567e-05, + "loss": 0.4026, + "step": 2964 + }, + { + "epoch": 0.23489799960388197, + "grad_norm": 1.7671482596742614, + "learning_rate": 1.7877947982139177e-05, + "loss": 0.437, + "step": 2965 + }, + { + "epoch": 0.23497722321251732, + "grad_norm": 2.0796720775994446, + "learning_rate": 1.7876367206888817e-05, + "loss": 0.4492, + "step": 2966 + }, + { + "epoch": 0.2350564468211527, + "grad_norm": 1.9580159398190233, + "learning_rate": 1.7874785913017575e-05, + "loss": 0.3741, + "step": 2967 + }, + { + "epoch": 0.23513567042978809, + "grad_norm": 1.7011607919402372, + "learning_rate": 1.7873204100629572e-05, + "loss": 0.3527, + "step": 2968 + }, + { + "epoch": 0.23521489403842344, + "grad_norm": 1.9644319886555555, + "learning_rate": 1.7871621769828965e-05, + "loss": 0.5007, + "step": 2969 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 1.6525374695924118, + "learning_rate": 1.7870038920719935e-05, + "loss": 0.2847, + "step": 2970 + }, + { + "epoch": 0.2353733412556942, + "grad_norm": 2.253349584992222, + "learning_rate": 1.7868455553406713e-05, + "loss": 0.5174, + "step": 2971 + }, + { + "epoch": 0.23545256486432958, + "grad_norm": 1.8088971622140557, + "learning_rate": 1.7866871667993554e-05, + "loss": 0.4381, + "step": 2972 + }, + { + "epoch": 0.23553178847296494, + "grad_norm": 1.967009078398569, + "learning_rate": 1.786528726458475e-05, + "loss": 0.3866, + "step": 2973 + }, + { + "epoch": 0.23561101208160032, + "grad_norm": 1.469212006549645, + "learning_rate": 1.786370234328462e-05, + "loss": 0.3598, + "step": 2974 + }, + { + "epoch": 0.2356902356902357, + "grad_norm": 1.7371199479170925, + "learning_rate": 1.7862116904197534e-05, + "loss": 0.3473, + "step": 2975 + }, + { + "epoch": 0.23576945929887105, + "grad_norm": 1.913597333169268, + "learning_rate": 1.7860530947427878e-05, + "loss": 0.5021, + "step": 2976 + }, + { + "epoch": 0.23584868290750644, + "grad_norm": 1.8952711321426534, + "learning_rate": 1.785894447308008e-05, + "loss": 0.5066, + "step": 2977 + }, + { + "epoch": 0.23592790651614182, + "grad_norm": 1.4181389132884858, + "learning_rate": 1.7857357481258603e-05, + "loss": 0.2819, + "step": 2978 + }, + { + "epoch": 0.23600713012477717, + "grad_norm": 1.4438255022898114, + "learning_rate": 1.7855769972067944e-05, + "loss": 0.2693, + "step": 2979 + }, + { + "epoch": 0.23608635373341255, + "grad_norm": 1.9154337634779175, + "learning_rate": 1.785418194561263e-05, + "loss": 0.4551, + "step": 2980 + }, + { + "epoch": 0.23616557734204793, + "grad_norm": 1.8723701004890083, + "learning_rate": 1.7852593401997232e-05, + "loss": 0.3461, + "step": 2981 + }, + { + "epoch": 0.23624480095068331, + "grad_norm": 2.0466176268691534, + "learning_rate": 1.785100434132634e-05, + "loss": 0.4119, + "step": 2982 + }, + { + "epoch": 0.23632402455931867, + "grad_norm": 1.6920718337267746, + "learning_rate": 1.7849414763704587e-05, + "loss": 0.3585, + "step": 2983 + }, + { + "epoch": 0.23640324816795405, + "grad_norm": 1.7472886952156605, + "learning_rate": 1.7847824669236643e-05, + "loss": 0.3363, + "step": 2984 + }, + { + "epoch": 0.23648247177658943, + "grad_norm": 1.8665463353039125, + "learning_rate": 1.7846234058027207e-05, + "loss": 0.3498, + "step": 2985 + }, + { + "epoch": 0.23656169538522479, + "grad_norm": 1.7971249645359173, + "learning_rate": 1.7844642930181008e-05, + "loss": 0.4522, + "step": 2986 + }, + { + "epoch": 0.23664091899386017, + "grad_norm": 1.6625591585831032, + "learning_rate": 1.7843051285802823e-05, + "loss": 0.4483, + "step": 2987 + }, + { + "epoch": 0.23672014260249555, + "grad_norm": 2.0507712255392403, + "learning_rate": 1.7841459124997445e-05, + "loss": 0.4121, + "step": 2988 + }, + { + "epoch": 0.23679936621113093, + "grad_norm": 1.7118752008905507, + "learning_rate": 1.7839866447869717e-05, + "loss": 0.3981, + "step": 2989 + }, + { + "epoch": 0.23687858981976628, + "grad_norm": 1.5289274953893544, + "learning_rate": 1.7838273254524505e-05, + "loss": 0.3272, + "step": 2990 + }, + { + "epoch": 0.23695781342840166, + "grad_norm": 1.9012071846949739, + "learning_rate": 1.7836679545066712e-05, + "loss": 0.3739, + "step": 2991 + }, + { + "epoch": 0.23703703703703705, + "grad_norm": 2.2138697661068454, + "learning_rate": 1.7835085319601283e-05, + "loss": 0.4013, + "step": 2992 + }, + { + "epoch": 0.2371162606456724, + "grad_norm": 1.6662270080382835, + "learning_rate": 1.783349057823318e-05, + "loss": 0.3968, + "step": 2993 + }, + { + "epoch": 0.23719548425430778, + "grad_norm": 1.742504282670805, + "learning_rate": 1.783189532106742e-05, + "loss": 0.4317, + "step": 2994 + }, + { + "epoch": 0.23727470786294316, + "grad_norm": 1.7391509132236265, + "learning_rate": 1.783029954820904e-05, + "loss": 0.51, + "step": 2995 + }, + { + "epoch": 0.23735393147157854, + "grad_norm": 1.486493206822086, + "learning_rate": 1.7828703259763107e-05, + "loss": 0.2872, + "step": 2996 + }, + { + "epoch": 0.2374331550802139, + "grad_norm": 1.6780228412202491, + "learning_rate": 1.782710645583473e-05, + "loss": 0.3689, + "step": 2997 + }, + { + "epoch": 0.23751237868884928, + "grad_norm": 1.8037090424314197, + "learning_rate": 1.7825509136529065e-05, + "loss": 0.4099, + "step": 2998 + }, + { + "epoch": 0.23759160229748466, + "grad_norm": 1.4838606463914465, + "learning_rate": 1.782391130195127e-05, + "loss": 0.4174, + "step": 2999 + }, + { + "epoch": 0.23767082590612001, + "grad_norm": 1.7007785737085417, + "learning_rate": 1.7822312952206565e-05, + "loss": 0.4335, + "step": 3000 + }, + { + "epoch": 0.2377500495147554, + "grad_norm": 1.5601984226245849, + "learning_rate": 1.782071408740019e-05, + "loss": 0.4329, + "step": 3001 + }, + { + "epoch": 0.23782927312339078, + "grad_norm": 1.9321930118327455, + "learning_rate": 1.781911470763742e-05, + "loss": 0.4113, + "step": 3002 + }, + { + "epoch": 0.23790849673202613, + "grad_norm": 1.6775462521648703, + "learning_rate": 1.7817514813023577e-05, + "loss": 0.4919, + "step": 3003 + }, + { + "epoch": 0.2379877203406615, + "grad_norm": 1.6890843011783088, + "learning_rate": 1.781591440366399e-05, + "loss": 0.3913, + "step": 3004 + }, + { + "epoch": 0.2380669439492969, + "grad_norm": 1.6815843237143824, + "learning_rate": 1.7814313479664054e-05, + "loss": 0.3549, + "step": 3005 + }, + { + "epoch": 0.23814616755793228, + "grad_norm": 1.630168929724418, + "learning_rate": 1.781271204112917e-05, + "loss": 0.402, + "step": 3006 + }, + { + "epoch": 0.23822539116656763, + "grad_norm": 1.8758683036216994, + "learning_rate": 1.7811110088164797e-05, + "loss": 0.3401, + "step": 3007 + }, + { + "epoch": 0.238304614775203, + "grad_norm": 1.7197732446151963, + "learning_rate": 1.7809507620876406e-05, + "loss": 0.3072, + "step": 3008 + }, + { + "epoch": 0.2383838383838384, + "grad_norm": 1.775250984240573, + "learning_rate": 1.7807904639369512e-05, + "loss": 0.5199, + "step": 3009 + }, + { + "epoch": 0.23846306199247375, + "grad_norm": 1.5126074857013367, + "learning_rate": 1.7806301143749672e-05, + "loss": 0.3181, + "step": 3010 + }, + { + "epoch": 0.23854228560110913, + "grad_norm": 1.610330348021287, + "learning_rate": 1.780469713412246e-05, + "loss": 0.3136, + "step": 3011 + }, + { + "epoch": 0.2386215092097445, + "grad_norm": 1.7474691127532895, + "learning_rate": 1.78030926105935e-05, + "loss": 0.4079, + "step": 3012 + }, + { + "epoch": 0.2387007328183799, + "grad_norm": 1.8788682983942224, + "learning_rate": 1.7801487573268433e-05, + "loss": 0.3756, + "step": 3013 + }, + { + "epoch": 0.23877995642701524, + "grad_norm": 1.545590004326463, + "learning_rate": 1.7799882022252948e-05, + "loss": 0.3509, + "step": 3014 + }, + { + "epoch": 0.23885918003565063, + "grad_norm": 1.8924300162713565, + "learning_rate": 1.7798275957652764e-05, + "loss": 0.4448, + "step": 3015 + }, + { + "epoch": 0.238938403644286, + "grad_norm": 1.9670811136959074, + "learning_rate": 1.779666937957363e-05, + "loss": 0.4056, + "step": 3016 + }, + { + "epoch": 0.23901762725292136, + "grad_norm": 1.5933054658189323, + "learning_rate": 1.7795062288121335e-05, + "loss": 0.3795, + "step": 3017 + }, + { + "epoch": 0.23909685086155674, + "grad_norm": 1.7144014119415023, + "learning_rate": 1.7793454683401692e-05, + "loss": 0.3452, + "step": 3018 + }, + { + "epoch": 0.23917607447019212, + "grad_norm": 1.815888246301984, + "learning_rate": 1.779184656552056e-05, + "loss": 0.3239, + "step": 3019 + }, + { + "epoch": 0.23925529807882748, + "grad_norm": 1.5971836595814837, + "learning_rate": 1.7790237934583824e-05, + "loss": 0.3462, + "step": 3020 + }, + { + "epoch": 0.23933452168746286, + "grad_norm": 1.9646704316874855, + "learning_rate": 1.7788628790697404e-05, + "loss": 0.3804, + "step": 3021 + }, + { + "epoch": 0.23941374529609824, + "grad_norm": 1.6747798166136836, + "learning_rate": 1.7787019133967252e-05, + "loss": 0.3547, + "step": 3022 + }, + { + "epoch": 0.23949296890473362, + "grad_norm": 1.9264676329828028, + "learning_rate": 1.778540896449936e-05, + "loss": 0.4613, + "step": 3023 + }, + { + "epoch": 0.23957219251336898, + "grad_norm": 1.6132476701095293, + "learning_rate": 1.778379828239975e-05, + "loss": 0.4111, + "step": 3024 + }, + { + "epoch": 0.23965141612200436, + "grad_norm": 1.4309644043664016, + "learning_rate": 1.778218708777448e-05, + "loss": 0.2995, + "step": 3025 + }, + { + "epoch": 0.23973063973063974, + "grad_norm": 1.6887063840388452, + "learning_rate": 1.7780575380729626e-05, + "loss": 0.371, + "step": 3026 + }, + { + "epoch": 0.2398098633392751, + "grad_norm": 1.6360906298003395, + "learning_rate": 1.777896316137133e-05, + "loss": 0.3086, + "step": 3027 + }, + { + "epoch": 0.23988908694791047, + "grad_norm": 1.6362852227495086, + "learning_rate": 1.7777350429805734e-05, + "loss": 0.3738, + "step": 3028 + }, + { + "epoch": 0.23996831055654586, + "grad_norm": 2.0635917716487584, + "learning_rate": 1.777573718613904e-05, + "loss": 0.4594, + "step": 3029 + }, + { + "epoch": 0.24004753416518124, + "grad_norm": 1.525494481033945, + "learning_rate": 1.7774123430477464e-05, + "loss": 0.3678, + "step": 3030 + }, + { + "epoch": 0.2401267577738166, + "grad_norm": 2.013675252469297, + "learning_rate": 1.7772509162927266e-05, + "loss": 0.3841, + "step": 3031 + }, + { + "epoch": 0.24020598138245197, + "grad_norm": 1.7014065313927174, + "learning_rate": 1.7770894383594737e-05, + "loss": 0.3373, + "step": 3032 + }, + { + "epoch": 0.24028520499108735, + "grad_norm": 1.5976352232612612, + "learning_rate": 1.7769279092586205e-05, + "loss": 0.3187, + "step": 3033 + }, + { + "epoch": 0.2403644285997227, + "grad_norm": 1.3225149809736216, + "learning_rate": 1.776766329000803e-05, + "loss": 0.2451, + "step": 3034 + }, + { + "epoch": 0.2404436522083581, + "grad_norm": 1.4403620008154954, + "learning_rate": 1.7766046975966603e-05, + "loss": 0.3146, + "step": 3035 + }, + { + "epoch": 0.24052287581699347, + "grad_norm": 1.9039601851649306, + "learning_rate": 1.7764430150568347e-05, + "loss": 0.4125, + "step": 3036 + }, + { + "epoch": 0.24060209942562882, + "grad_norm": 1.779349077077079, + "learning_rate": 1.776281281391973e-05, + "loss": 0.4304, + "step": 3037 + }, + { + "epoch": 0.2406813230342642, + "grad_norm": 1.7453775037164776, + "learning_rate": 1.776119496612724e-05, + "loss": 0.4112, + "step": 3038 + }, + { + "epoch": 0.2407605466428996, + "grad_norm": 1.9217349353308082, + "learning_rate": 1.7759576607297405e-05, + "loss": 0.401, + "step": 3039 + }, + { + "epoch": 0.24083977025153497, + "grad_norm": 1.8946796518967006, + "learning_rate": 1.7757957737536785e-05, + "loss": 0.3424, + "step": 3040 + }, + { + "epoch": 0.24091899386017032, + "grad_norm": 2.1324307793213104, + "learning_rate": 1.775633835695198e-05, + "loss": 0.4827, + "step": 3041 + }, + { + "epoch": 0.2409982174688057, + "grad_norm": 1.9373241398516923, + "learning_rate": 1.7754718465649618e-05, + "loss": 0.386, + "step": 3042 + }, + { + "epoch": 0.24107744107744108, + "grad_norm": 1.716237797792207, + "learning_rate": 1.7753098063736355e-05, + "loss": 0.3793, + "step": 3043 + }, + { + "epoch": 0.24115666468607644, + "grad_norm": 1.7525684948363685, + "learning_rate": 1.775147715131889e-05, + "loss": 0.4361, + "step": 3044 + }, + { + "epoch": 0.24123588829471182, + "grad_norm": 1.612076879378781, + "learning_rate": 1.7749855728503952e-05, + "loss": 0.3075, + "step": 3045 + }, + { + "epoch": 0.2413151119033472, + "grad_norm": 1.7327545991172746, + "learning_rate": 1.7748233795398308e-05, + "loss": 0.4417, + "step": 3046 + }, + { + "epoch": 0.24139433551198258, + "grad_norm": 1.7073355632547802, + "learning_rate": 1.7746611352108744e-05, + "loss": 0.3665, + "step": 3047 + }, + { + "epoch": 0.24147355912061794, + "grad_norm": 1.5098829303663128, + "learning_rate": 1.7744988398742102e-05, + "loss": 0.346, + "step": 3048 + }, + { + "epoch": 0.24155278272925332, + "grad_norm": 1.6746666098265324, + "learning_rate": 1.7743364935405238e-05, + "loss": 0.4539, + "step": 3049 + }, + { + "epoch": 0.2416320063378887, + "grad_norm": 1.570900488242882, + "learning_rate": 1.7741740962205053e-05, + "loss": 0.3007, + "step": 3050 + }, + { + "epoch": 0.24171122994652405, + "grad_norm": 1.8628105363132954, + "learning_rate": 1.7740116479248474e-05, + "loss": 0.4446, + "step": 3051 + }, + { + "epoch": 0.24179045355515943, + "grad_norm": 1.8509043499665498, + "learning_rate": 1.773849148664247e-05, + "loss": 0.4163, + "step": 3052 + }, + { + "epoch": 0.24186967716379482, + "grad_norm": 1.900746073434233, + "learning_rate": 1.773686598449404e-05, + "loss": 0.3962, + "step": 3053 + }, + { + "epoch": 0.2419489007724302, + "grad_norm": 1.7785979488267507, + "learning_rate": 1.7735239972910208e-05, + "loss": 0.3946, + "step": 3054 + }, + { + "epoch": 0.24202812438106555, + "grad_norm": 1.6141601899754905, + "learning_rate": 1.7733613451998043e-05, + "loss": 0.3045, + "step": 3055 + }, + { + "epoch": 0.24210734798970093, + "grad_norm": 1.6990711528937419, + "learning_rate": 1.7731986421864645e-05, + "loss": 0.4688, + "step": 3056 + }, + { + "epoch": 0.24218657159833631, + "grad_norm": 1.7152333008636496, + "learning_rate": 1.7730358882617148e-05, + "loss": 0.4961, + "step": 3057 + }, + { + "epoch": 0.24226579520697167, + "grad_norm": 1.7344426087816067, + "learning_rate": 1.772873083436271e-05, + "loss": 0.3569, + "step": 3058 + }, + { + "epoch": 0.24234501881560705, + "grad_norm": 1.700652081978969, + "learning_rate": 1.7727102277208538e-05, + "loss": 0.3215, + "step": 3059 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 2.014740699375079, + "learning_rate": 1.772547321126186e-05, + "loss": 0.3551, + "step": 3060 + }, + { + "epoch": 0.24250346603287778, + "grad_norm": 1.5192035457231223, + "learning_rate": 1.7723843636629945e-05, + "loss": 0.3115, + "step": 3061 + }, + { + "epoch": 0.24258268964151317, + "grad_norm": 1.6300265788745925, + "learning_rate": 1.772221355342009e-05, + "loss": 0.364, + "step": 3062 + }, + { + "epoch": 0.24266191325014855, + "grad_norm": 1.4934620481195857, + "learning_rate": 1.7720582961739628e-05, + "loss": 0.4145, + "step": 3063 + }, + { + "epoch": 0.24274113685878393, + "grad_norm": 1.514277909480501, + "learning_rate": 1.771895186169593e-05, + "loss": 0.2893, + "step": 3064 + }, + { + "epoch": 0.24282036046741928, + "grad_norm": 1.4571352183046191, + "learning_rate": 1.7717320253396393e-05, + "loss": 0.3356, + "step": 3065 + }, + { + "epoch": 0.24289958407605466, + "grad_norm": 1.6276661897388882, + "learning_rate": 1.771568813694845e-05, + "loss": 0.3816, + "step": 3066 + }, + { + "epoch": 0.24297880768469005, + "grad_norm": 1.591625283464681, + "learning_rate": 1.771405551245957e-05, + "loss": 0.3179, + "step": 3067 + }, + { + "epoch": 0.2430580312933254, + "grad_norm": 1.961787620391812, + "learning_rate": 1.771242238003725e-05, + "loss": 0.4525, + "step": 3068 + }, + { + "epoch": 0.24313725490196078, + "grad_norm": 1.9240313938905484, + "learning_rate": 1.7710788739789025e-05, + "loss": 0.3429, + "step": 3069 + }, + { + "epoch": 0.24321647851059616, + "grad_norm": 1.6863130474500412, + "learning_rate": 1.7709154591822466e-05, + "loss": 0.3725, + "step": 3070 + }, + { + "epoch": 0.24329570211923154, + "grad_norm": 2.001387621587086, + "learning_rate": 1.770751993624517e-05, + "loss": 0.4086, + "step": 3071 + }, + { + "epoch": 0.2433749257278669, + "grad_norm": 1.5142532804538547, + "learning_rate": 1.770588477316477e-05, + "loss": 0.299, + "step": 3072 + }, + { + "epoch": 0.24345414933650228, + "grad_norm": 1.857428628276661, + "learning_rate": 1.770424910268894e-05, + "loss": 0.3723, + "step": 3073 + }, + { + "epoch": 0.24353337294513766, + "grad_norm": 1.7462419498643962, + "learning_rate": 1.7702612924925377e-05, + "loss": 0.3731, + "step": 3074 + }, + { + "epoch": 0.24361259655377301, + "grad_norm": 1.8710645082225665, + "learning_rate": 1.7700976239981815e-05, + "loss": 0.3863, + "step": 3075 + }, + { + "epoch": 0.2436918201624084, + "grad_norm": 1.5347069367482018, + "learning_rate": 1.769933904796602e-05, + "loss": 0.3049, + "step": 3076 + }, + { + "epoch": 0.24377104377104378, + "grad_norm": 1.8587373108909016, + "learning_rate": 1.76977013489858e-05, + "loss": 0.4202, + "step": 3077 + }, + { + "epoch": 0.24385026737967913, + "grad_norm": 1.619430852618424, + "learning_rate": 1.7696063143148982e-05, + "loss": 0.3421, + "step": 3078 + }, + { + "epoch": 0.2439294909883145, + "grad_norm": 1.6464314393924742, + "learning_rate": 1.7694424430563436e-05, + "loss": 0.4486, + "step": 3079 + }, + { + "epoch": 0.2440087145969499, + "grad_norm": 1.6898695277486135, + "learning_rate": 1.769278521133707e-05, + "loss": 0.364, + "step": 3080 + }, + { + "epoch": 0.24408793820558528, + "grad_norm": 1.6357901919241997, + "learning_rate": 1.769114548557781e-05, + "loss": 0.2768, + "step": 3081 + }, + { + "epoch": 0.24416716181422063, + "grad_norm": 1.4944061972271023, + "learning_rate": 1.768950525339362e-05, + "loss": 0.3361, + "step": 3082 + }, + { + "epoch": 0.244246385422856, + "grad_norm": 1.4083502535547918, + "learning_rate": 1.7687864514892516e-05, + "loss": 0.3275, + "step": 3083 + }, + { + "epoch": 0.2443256090314914, + "grad_norm": 2.167969114380632, + "learning_rate": 1.7686223270182524e-05, + "loss": 0.4334, + "step": 3084 + }, + { + "epoch": 0.24440483264012675, + "grad_norm": 2.0972020452737086, + "learning_rate": 1.7684581519371714e-05, + "loss": 0.3704, + "step": 3085 + }, + { + "epoch": 0.24448405624876213, + "grad_norm": 1.5880282774029748, + "learning_rate": 1.768293926256819e-05, + "loss": 0.4288, + "step": 3086 + }, + { + "epoch": 0.2445632798573975, + "grad_norm": 1.6393280801288992, + "learning_rate": 1.7681296499880077e-05, + "loss": 0.3685, + "step": 3087 + }, + { + "epoch": 0.2446425034660329, + "grad_norm": 1.7906286050299878, + "learning_rate": 1.767965323141555e-05, + "loss": 0.3569, + "step": 3088 + }, + { + "epoch": 0.24472172707466824, + "grad_norm": 1.4097848857249107, + "learning_rate": 1.7678009457282816e-05, + "loss": 0.3159, + "step": 3089 + }, + { + "epoch": 0.24480095068330363, + "grad_norm": 1.7815584433961174, + "learning_rate": 1.7676365177590097e-05, + "loss": 0.3152, + "step": 3090 + }, + { + "epoch": 0.244880174291939, + "grad_norm": 1.652073647775185, + "learning_rate": 1.7674720392445672e-05, + "loss": 0.329, + "step": 3091 + }, + { + "epoch": 0.24495939790057436, + "grad_norm": 1.7839111747695318, + "learning_rate": 1.7673075101957837e-05, + "loss": 0.4997, + "step": 3092 + }, + { + "epoch": 0.24503862150920974, + "grad_norm": 1.5159647073828213, + "learning_rate": 1.7671429306234924e-05, + "loss": 0.356, + "step": 3093 + }, + { + "epoch": 0.24511784511784512, + "grad_norm": 1.6781157454696225, + "learning_rate": 1.7669783005385305e-05, + "loss": 0.3434, + "step": 3094 + }, + { + "epoch": 0.2451970687264805, + "grad_norm": 1.9992196666224964, + "learning_rate": 1.766813619951738e-05, + "loss": 0.4459, + "step": 3095 + }, + { + "epoch": 0.24527629233511586, + "grad_norm": 1.6531818851633784, + "learning_rate": 1.7666488888739587e-05, + "loss": 0.3399, + "step": 3096 + }, + { + "epoch": 0.24535551594375124, + "grad_norm": 1.686594843713856, + "learning_rate": 1.7664841073160383e-05, + "loss": 0.4389, + "step": 3097 + }, + { + "epoch": 0.24543473955238662, + "grad_norm": 1.854276648134284, + "learning_rate": 1.766319275288828e-05, + "loss": 0.4581, + "step": 3098 + }, + { + "epoch": 0.24551396316102198, + "grad_norm": 1.7112353364898552, + "learning_rate": 1.7661543928031802e-05, + "loss": 0.325, + "step": 3099 + }, + { + "epoch": 0.24559318676965736, + "grad_norm": 1.9011164612139102, + "learning_rate": 1.7659894598699527e-05, + "loss": 0.367, + "step": 3100 + }, + { + "epoch": 0.24567241037829274, + "grad_norm": 1.6314720605941708, + "learning_rate": 1.765824476500005e-05, + "loss": 0.3954, + "step": 3101 + }, + { + "epoch": 0.2457516339869281, + "grad_norm": 1.6024620008153847, + "learning_rate": 1.7656594427041997e-05, + "loss": 0.4288, + "step": 3102 + }, + { + "epoch": 0.24583085759556347, + "grad_norm": 1.8448318751583703, + "learning_rate": 1.765494358493405e-05, + "loss": 0.3834, + "step": 3103 + }, + { + "epoch": 0.24591008120419885, + "grad_norm": 1.5315277339381745, + "learning_rate": 1.7653292238784897e-05, + "loss": 0.3936, + "step": 3104 + }, + { + "epoch": 0.24598930481283424, + "grad_norm": 1.6824810285535965, + "learning_rate": 1.7651640388703275e-05, + "loss": 0.4767, + "step": 3105 + }, + { + "epoch": 0.2460685284214696, + "grad_norm": 1.618077403185247, + "learning_rate": 1.7649988034797952e-05, + "loss": 0.401, + "step": 3106 + }, + { + "epoch": 0.24614775203010497, + "grad_norm": 1.736405319429256, + "learning_rate": 1.7648335177177725e-05, + "loss": 0.2936, + "step": 3107 + }, + { + "epoch": 0.24622697563874035, + "grad_norm": 1.5671705213711955, + "learning_rate": 1.764668181595143e-05, + "loss": 0.3629, + "step": 3108 + }, + { + "epoch": 0.2463061992473757, + "grad_norm": 1.8258693894173534, + "learning_rate": 1.764502795122793e-05, + "loss": 0.6055, + "step": 3109 + }, + { + "epoch": 0.2463854228560111, + "grad_norm": 1.9598912574689136, + "learning_rate": 1.7643373583116123e-05, + "loss": 0.4291, + "step": 3110 + }, + { + "epoch": 0.24646464646464647, + "grad_norm": 1.7132876626298001, + "learning_rate": 1.7641718711724947e-05, + "loss": 0.3735, + "step": 3111 + }, + { + "epoch": 0.24654387007328185, + "grad_norm": 1.345583144252865, + "learning_rate": 1.764006333716336e-05, + "loss": 0.2853, + "step": 3112 + }, + { + "epoch": 0.2466230936819172, + "grad_norm": 1.7463840669564408, + "learning_rate": 1.7638407459540364e-05, + "loss": 0.4549, + "step": 3113 + }, + { + "epoch": 0.2467023172905526, + "grad_norm": 1.9929654102104863, + "learning_rate": 1.7636751078964995e-05, + "loss": 0.4265, + "step": 3114 + }, + { + "epoch": 0.24678154089918797, + "grad_norm": 1.7252340845541883, + "learning_rate": 1.763509419554631e-05, + "loss": 0.3636, + "step": 3115 + }, + { + "epoch": 0.24686076450782332, + "grad_norm": 1.537441125450611, + "learning_rate": 1.763343680939341e-05, + "loss": 0.3439, + "step": 3116 + }, + { + "epoch": 0.2469399881164587, + "grad_norm": 1.8478322196130217, + "learning_rate": 1.7631778920615427e-05, + "loss": 0.3978, + "step": 3117 + }, + { + "epoch": 0.24701921172509408, + "grad_norm": 1.9885081331537757, + "learning_rate": 1.7630120529321518e-05, + "loss": 0.4383, + "step": 3118 + }, + { + "epoch": 0.24709843533372944, + "grad_norm": 1.7683438747751496, + "learning_rate": 1.7628461635620895e-05, + "loss": 0.3372, + "step": 3119 + }, + { + "epoch": 0.24717765894236482, + "grad_norm": 1.6878995620613646, + "learning_rate": 1.7626802239622772e-05, + "loss": 0.367, + "step": 3120 + }, + { + "epoch": 0.2472568825510002, + "grad_norm": 1.6331811307900859, + "learning_rate": 1.7625142341436423e-05, + "loss": 0.3765, + "step": 3121 + }, + { + "epoch": 0.24733610615963558, + "grad_norm": 1.5326916621265476, + "learning_rate": 1.762348194117114e-05, + "loss": 0.2831, + "step": 3122 + }, + { + "epoch": 0.24741532976827094, + "grad_norm": 1.6542354847422005, + "learning_rate": 1.7621821038936257e-05, + "loss": 0.41, + "step": 3123 + }, + { + "epoch": 0.24749455337690632, + "grad_norm": 1.6825881558525462, + "learning_rate": 1.7620159634841127e-05, + "loss": 0.4372, + "step": 3124 + }, + { + "epoch": 0.2475737769855417, + "grad_norm": 1.8215056536469243, + "learning_rate": 1.761849772899515e-05, + "loss": 0.4171, + "step": 3125 + }, + { + "epoch": 0.24765300059417705, + "grad_norm": 1.6850118382127774, + "learning_rate": 1.7616835321507757e-05, + "loss": 0.3661, + "step": 3126 + }, + { + "epoch": 0.24773222420281243, + "grad_norm": 1.3341692323234686, + "learning_rate": 1.761517241248841e-05, + "loss": 0.2281, + "step": 3127 + }, + { + "epoch": 0.24781144781144782, + "grad_norm": 1.6464597112168649, + "learning_rate": 1.76135090020466e-05, + "loss": 0.3422, + "step": 3128 + }, + { + "epoch": 0.2478906714200832, + "grad_norm": 1.8514499850295798, + "learning_rate": 1.7611845090291858e-05, + "loss": 0.4611, + "step": 3129 + }, + { + "epoch": 0.24796989502871855, + "grad_norm": 1.8596657964559526, + "learning_rate": 1.761018067733374e-05, + "loss": 0.4425, + "step": 3130 + }, + { + "epoch": 0.24804911863735393, + "grad_norm": 1.5980531539496576, + "learning_rate": 1.7608515763281843e-05, + "loss": 0.413, + "step": 3131 + }, + { + "epoch": 0.24812834224598931, + "grad_norm": 1.7036543217940978, + "learning_rate": 1.760685034824579e-05, + "loss": 0.4034, + "step": 3132 + }, + { + "epoch": 0.24820756585462467, + "grad_norm": 1.6625956746508452, + "learning_rate": 1.760518443233525e-05, + "loss": 0.3812, + "step": 3133 + }, + { + "epoch": 0.24828678946326005, + "grad_norm": 1.823827034449052, + "learning_rate": 1.7603518015659905e-05, + "loss": 0.4981, + "step": 3134 + }, + { + "epoch": 0.24836601307189543, + "grad_norm": 1.5951347269817877, + "learning_rate": 1.7601851098329484e-05, + "loss": 0.4152, + "step": 3135 + }, + { + "epoch": 0.24844523668053078, + "grad_norm": 1.6333279705640327, + "learning_rate": 1.7600183680453745e-05, + "loss": 0.3817, + "step": 3136 + }, + { + "epoch": 0.24852446028916617, + "grad_norm": 1.612506329116598, + "learning_rate": 1.7598515762142484e-05, + "loss": 0.3262, + "step": 3137 + }, + { + "epoch": 0.24860368389780155, + "grad_norm": 1.667174698200025, + "learning_rate": 1.759684734350552e-05, + "loss": 0.482, + "step": 3138 + }, + { + "epoch": 0.24868290750643693, + "grad_norm": 1.5860244743119838, + "learning_rate": 1.759517842465271e-05, + "loss": 0.3543, + "step": 3139 + }, + { + "epoch": 0.24876213111507228, + "grad_norm": 1.6657968318462428, + "learning_rate": 1.759350900569395e-05, + "loss": 0.4006, + "step": 3140 + }, + { + "epoch": 0.24884135472370766, + "grad_norm": 1.4512463582173374, + "learning_rate": 1.759183908673916e-05, + "loss": 0.3589, + "step": 3141 + }, + { + "epoch": 0.24892057833234305, + "grad_norm": 1.9940519905085468, + "learning_rate": 1.759016866789829e-05, + "loss": 0.3304, + "step": 3142 + }, + { + "epoch": 0.2489998019409784, + "grad_norm": 2.0032225011465505, + "learning_rate": 1.7588497749281338e-05, + "loss": 0.3591, + "step": 3143 + }, + { + "epoch": 0.24907902554961378, + "grad_norm": 1.711719266045336, + "learning_rate": 1.7586826330998324e-05, + "loss": 0.3738, + "step": 3144 + }, + { + "epoch": 0.24915824915824916, + "grad_norm": 2.13916379198605, + "learning_rate": 1.7585154413159304e-05, + "loss": 0.3603, + "step": 3145 + }, + { + "epoch": 0.24923747276688454, + "grad_norm": 1.7765353635847283, + "learning_rate": 1.758348199587436e-05, + "loss": 0.3535, + "step": 3146 + }, + { + "epoch": 0.2493166963755199, + "grad_norm": 1.2951752704028645, + "learning_rate": 1.7581809079253616e-05, + "loss": 0.3375, + "step": 3147 + }, + { + "epoch": 0.24939591998415528, + "grad_norm": 1.7800642657783186, + "learning_rate": 1.7580135663407226e-05, + "loss": 0.4327, + "step": 3148 + }, + { + "epoch": 0.24947514359279066, + "grad_norm": 1.991001877459909, + "learning_rate": 1.7578461748445374e-05, + "loss": 0.4391, + "step": 3149 + }, + { + "epoch": 0.24955436720142601, + "grad_norm": 1.8319689521742195, + "learning_rate": 1.7576787334478283e-05, + "loss": 0.3643, + "step": 3150 + }, + { + "epoch": 0.2496335908100614, + "grad_norm": 1.532594908192309, + "learning_rate": 1.7575112421616203e-05, + "loss": 0.3371, + "step": 3151 + }, + { + "epoch": 0.24971281441869678, + "grad_norm": 1.9404567341999421, + "learning_rate": 1.757343700996942e-05, + "loss": 0.3831, + "step": 3152 + }, + { + "epoch": 0.24979203802733216, + "grad_norm": 1.6412692424730824, + "learning_rate": 1.757176109964825e-05, + "loss": 0.4111, + "step": 3153 + }, + { + "epoch": 0.2498712616359675, + "grad_norm": 1.8372979184013194, + "learning_rate": 1.7570084690763042e-05, + "loss": 0.4445, + "step": 3154 + }, + { + "epoch": 0.2499504852446029, + "grad_norm": 1.5492598635557118, + "learning_rate": 1.7568407783424187e-05, + "loss": 0.3488, + "step": 3155 + }, + { + "epoch": 0.2500297088532383, + "grad_norm": 1.6773650227703059, + "learning_rate": 1.7566730377742093e-05, + "loss": 0.3525, + "step": 3156 + }, + { + "epoch": 0.25010893246187366, + "grad_norm": 1.3484960434550548, + "learning_rate": 1.7565052473827213e-05, + "loss": 0.3274, + "step": 3157 + }, + { + "epoch": 0.25018815607050904, + "grad_norm": 1.7040834307595112, + "learning_rate": 1.7563374071790028e-05, + "loss": 0.3084, + "step": 3158 + }, + { + "epoch": 0.25026737967914436, + "grad_norm": 1.7158746015627382, + "learning_rate": 1.7561695171741054e-05, + "loss": 0.4214, + "step": 3159 + }, + { + "epoch": 0.25034660328777975, + "grad_norm": 1.5382135792599925, + "learning_rate": 1.7560015773790837e-05, + "loss": 0.3863, + "step": 3160 + }, + { + "epoch": 0.2504258268964151, + "grad_norm": 1.6423749980612374, + "learning_rate": 1.7558335878049955e-05, + "loss": 0.4629, + "step": 3161 + }, + { + "epoch": 0.2505050505050505, + "grad_norm": 1.6094871513361309, + "learning_rate": 1.7556655484629028e-05, + "loss": 0.4016, + "step": 3162 + }, + { + "epoch": 0.2505842741136859, + "grad_norm": 1.9636668367552694, + "learning_rate": 1.7554974593638697e-05, + "loss": 0.3705, + "step": 3163 + }, + { + "epoch": 0.25066349772232127, + "grad_norm": 1.4463471760821986, + "learning_rate": 1.755329320518964e-05, + "loss": 0.355, + "step": 3164 + }, + { + "epoch": 0.25074272133095665, + "grad_norm": 2.218429902654406, + "learning_rate": 1.7551611319392573e-05, + "loss": 0.4233, + "step": 3165 + }, + { + "epoch": 0.250821944939592, + "grad_norm": 1.692074924221293, + "learning_rate": 1.7549928936358232e-05, + "loss": 0.256, + "step": 3166 + }, + { + "epoch": 0.25090116854822736, + "grad_norm": 1.9555349113087643, + "learning_rate": 1.75482460561974e-05, + "loss": 0.4198, + "step": 3167 + }, + { + "epoch": 0.25098039215686274, + "grad_norm": 1.8304323777677494, + "learning_rate": 1.7546562679020884e-05, + "loss": 0.3067, + "step": 3168 + }, + { + "epoch": 0.2510596157654981, + "grad_norm": 1.6974395122185204, + "learning_rate": 1.7544878804939528e-05, + "loss": 0.3313, + "step": 3169 + }, + { + "epoch": 0.2511388393741335, + "grad_norm": 1.5269362241735296, + "learning_rate": 1.7543194434064208e-05, + "loss": 0.3688, + "step": 3170 + }, + { + "epoch": 0.2512180629827689, + "grad_norm": 1.4531822613412166, + "learning_rate": 1.754150956650583e-05, + "loss": 0.3558, + "step": 3171 + }, + { + "epoch": 0.2512972865914042, + "grad_norm": 2.3714676792356273, + "learning_rate": 1.753982420237533e-05, + "loss": 0.399, + "step": 3172 + }, + { + "epoch": 0.2513765102000396, + "grad_norm": 1.77194609758977, + "learning_rate": 1.753813834178369e-05, + "loss": 0.4464, + "step": 3173 + }, + { + "epoch": 0.251455733808675, + "grad_norm": 1.5697093494779089, + "learning_rate": 1.753645198484191e-05, + "loss": 0.2932, + "step": 3174 + }, + { + "epoch": 0.25153495741731036, + "grad_norm": 1.8609773962843452, + "learning_rate": 1.753476513166103e-05, + "loss": 0.3814, + "step": 3175 + }, + { + "epoch": 0.25161418102594574, + "grad_norm": 1.5335317957817038, + "learning_rate": 1.7533077782352123e-05, + "loss": 0.3822, + "step": 3176 + }, + { + "epoch": 0.2516934046345811, + "grad_norm": 1.4065319394040334, + "learning_rate": 1.753138993702629e-05, + "loss": 0.3144, + "step": 3177 + }, + { + "epoch": 0.2517726282432165, + "grad_norm": 1.8547513542068992, + "learning_rate": 1.752970159579467e-05, + "loss": 0.382, + "step": 3178 + }, + { + "epoch": 0.2518518518518518, + "grad_norm": 1.5701521302473442, + "learning_rate": 1.7528012758768426e-05, + "loss": 0.3788, + "step": 3179 + }, + { + "epoch": 0.2519310754604872, + "grad_norm": 1.9113128812111697, + "learning_rate": 1.7526323426058767e-05, + "loss": 0.4386, + "step": 3180 + }, + { + "epoch": 0.2520102990691226, + "grad_norm": 1.928975548651707, + "learning_rate": 1.7524633597776923e-05, + "loss": 0.4944, + "step": 3181 + }, + { + "epoch": 0.25208952267775797, + "grad_norm": 1.7284713786438117, + "learning_rate": 1.7522943274034165e-05, + "loss": 0.3802, + "step": 3182 + }, + { + "epoch": 0.25216874628639335, + "grad_norm": 1.5495622369767326, + "learning_rate": 1.752125245494179e-05, + "loss": 0.371, + "step": 3183 + }, + { + "epoch": 0.25224796989502873, + "grad_norm": 1.540973369500184, + "learning_rate": 1.751956114061113e-05, + "loss": 0.4582, + "step": 3184 + }, + { + "epoch": 0.2523271935036641, + "grad_norm": 2.0746743217922674, + "learning_rate": 1.751786933115355e-05, + "loss": 0.4601, + "step": 3185 + }, + { + "epoch": 0.25240641711229944, + "grad_norm": 1.448268277596241, + "learning_rate": 1.751617702668045e-05, + "loss": 0.2938, + "step": 3186 + }, + { + "epoch": 0.2524856407209348, + "grad_norm": 1.9235119463678907, + "learning_rate": 1.751448422730326e-05, + "loss": 0.406, + "step": 3187 + }, + { + "epoch": 0.2525648643295702, + "grad_norm": 1.8716330620152517, + "learning_rate": 1.7512790933133435e-05, + "loss": 0.4377, + "step": 3188 + }, + { + "epoch": 0.2526440879382056, + "grad_norm": 1.7226498304368687, + "learning_rate": 1.7511097144282482e-05, + "loss": 0.417, + "step": 3189 + }, + { + "epoch": 0.25272331154684097, + "grad_norm": 1.7647831656485702, + "learning_rate": 1.7509402860861923e-05, + "loss": 0.4161, + "step": 3190 + }, + { + "epoch": 0.25280253515547635, + "grad_norm": 2.128200677494788, + "learning_rate": 1.7507708082983313e-05, + "loss": 0.3842, + "step": 3191 + }, + { + "epoch": 0.25288175876411173, + "grad_norm": 1.7179748091492708, + "learning_rate": 1.7506012810758254e-05, + "loss": 0.4917, + "step": 3192 + }, + { + "epoch": 0.25296098237274706, + "grad_norm": 1.7216071210366086, + "learning_rate": 1.750431704429837e-05, + "loss": 0.4393, + "step": 3193 + }, + { + "epoch": 0.25304020598138244, + "grad_norm": 1.906337990626111, + "learning_rate": 1.7502620783715316e-05, + "loss": 0.3848, + "step": 3194 + }, + { + "epoch": 0.2531194295900178, + "grad_norm": 1.9206035963370554, + "learning_rate": 1.7500924029120782e-05, + "loss": 0.3593, + "step": 3195 + }, + { + "epoch": 0.2531986531986532, + "grad_norm": 1.7121886205047347, + "learning_rate": 1.7499226780626494e-05, + "loss": 0.375, + "step": 3196 + }, + { + "epoch": 0.2532778768072886, + "grad_norm": 1.6569079521209802, + "learning_rate": 1.7497529038344208e-05, + "loss": 0.3634, + "step": 3197 + }, + { + "epoch": 0.25335710041592396, + "grad_norm": 1.499445551334616, + "learning_rate": 1.7495830802385707e-05, + "loss": 0.3811, + "step": 3198 + }, + { + "epoch": 0.25343632402455935, + "grad_norm": 1.672430062163695, + "learning_rate": 1.7494132072862818e-05, + "loss": 0.3561, + "step": 3199 + }, + { + "epoch": 0.25351554763319467, + "grad_norm": 1.684210748301114, + "learning_rate": 1.7492432849887387e-05, + "loss": 0.343, + "step": 3200 + }, + { + "epoch": 0.25359477124183005, + "grad_norm": 2.0828978127171087, + "learning_rate": 1.749073313357131e-05, + "loss": 0.4565, + "step": 3201 + }, + { + "epoch": 0.25367399485046543, + "grad_norm": 1.3221965180283575, + "learning_rate": 1.7489032924026496e-05, + "loss": 0.2588, + "step": 3202 + }, + { + "epoch": 0.2537532184591008, + "grad_norm": 1.4379988377589241, + "learning_rate": 1.74873322213649e-05, + "loss": 0.3154, + "step": 3203 + }, + { + "epoch": 0.2538324420677362, + "grad_norm": 1.6805473159948106, + "learning_rate": 1.7485631025698504e-05, + "loss": 0.4549, + "step": 3204 + }, + { + "epoch": 0.2539116656763716, + "grad_norm": 1.669544371262534, + "learning_rate": 1.7483929337139326e-05, + "loss": 0.4012, + "step": 3205 + }, + { + "epoch": 0.2539908892850069, + "grad_norm": 1.4702193400041064, + "learning_rate": 1.748222715579941e-05, + "loss": 0.2759, + "step": 3206 + }, + { + "epoch": 0.2540701128936423, + "grad_norm": 1.7922742957576494, + "learning_rate": 1.7480524481790835e-05, + "loss": 0.4298, + "step": 3207 + }, + { + "epoch": 0.25414933650227767, + "grad_norm": 1.8066914837276697, + "learning_rate": 1.7478821315225717e-05, + "loss": 0.2921, + "step": 3208 + }, + { + "epoch": 0.25422856011091305, + "grad_norm": 2.02909534486103, + "learning_rate": 1.7477117656216206e-05, + "loss": 0.5853, + "step": 3209 + }, + { + "epoch": 0.25430778371954843, + "grad_norm": 1.6108195464393773, + "learning_rate": 1.7475413504874474e-05, + "loss": 0.3208, + "step": 3210 + }, + { + "epoch": 0.2543870073281838, + "grad_norm": 1.7398891844169309, + "learning_rate": 1.7473708861312727e-05, + "loss": 0.3715, + "step": 3211 + }, + { + "epoch": 0.2544662309368192, + "grad_norm": 1.7312257525084316, + "learning_rate": 1.7472003725643215e-05, + "loss": 0.3793, + "step": 3212 + }, + { + "epoch": 0.2545454545454545, + "grad_norm": 1.7429281544251238, + "learning_rate": 1.747029809797821e-05, + "loss": 0.3524, + "step": 3213 + }, + { + "epoch": 0.2546246781540899, + "grad_norm": 1.6307735119147664, + "learning_rate": 1.7468591978430024e-05, + "loss": 0.3206, + "step": 3214 + }, + { + "epoch": 0.2547039017627253, + "grad_norm": 1.6272110635613188, + "learning_rate": 1.746688536711099e-05, + "loss": 0.3269, + "step": 3215 + }, + { + "epoch": 0.25478312537136066, + "grad_norm": 1.9045383812607315, + "learning_rate": 1.7465178264133482e-05, + "loss": 0.3967, + "step": 3216 + }, + { + "epoch": 0.25486234897999605, + "grad_norm": 1.7515333861950393, + "learning_rate": 1.7463470669609907e-05, + "loss": 0.3921, + "step": 3217 + }, + { + "epoch": 0.2549415725886314, + "grad_norm": 1.5532013247525969, + "learning_rate": 1.74617625836527e-05, + "loss": 0.3645, + "step": 3218 + }, + { + "epoch": 0.2550207961972668, + "grad_norm": 1.5985882784825727, + "learning_rate": 1.746005400637433e-05, + "loss": 0.3814, + "step": 3219 + }, + { + "epoch": 0.25510001980590213, + "grad_norm": 1.615170264841815, + "learning_rate": 1.74583449378873e-05, + "loss": 0.3451, + "step": 3220 + }, + { + "epoch": 0.2551792434145375, + "grad_norm": 1.5959340879303858, + "learning_rate": 1.7456635378304143e-05, + "loss": 0.3722, + "step": 3221 + }, + { + "epoch": 0.2552584670231729, + "grad_norm": 1.8074639912496229, + "learning_rate": 1.7454925327737426e-05, + "loss": 0.4414, + "step": 3222 + }, + { + "epoch": 0.2553376906318083, + "grad_norm": 1.665254380750593, + "learning_rate": 1.7453214786299746e-05, + "loss": 0.3456, + "step": 3223 + }, + { + "epoch": 0.25541691424044366, + "grad_norm": 1.7293852982090319, + "learning_rate": 1.7451503754103735e-05, + "loss": 0.375, + "step": 3224 + }, + { + "epoch": 0.25549613784907904, + "grad_norm": 1.6662525049862082, + "learning_rate": 1.7449792231262056e-05, + "loss": 0.3708, + "step": 3225 + }, + { + "epoch": 0.2555753614577144, + "grad_norm": 1.7728653922123545, + "learning_rate": 1.7448080217887403e-05, + "loss": 0.4221, + "step": 3226 + }, + { + "epoch": 0.25565458506634975, + "grad_norm": 1.7349799473578216, + "learning_rate": 1.7446367714092508e-05, + "loss": 0.3862, + "step": 3227 + }, + { + "epoch": 0.25573380867498513, + "grad_norm": 1.5609924658347427, + "learning_rate": 1.7444654719990128e-05, + "loss": 0.2764, + "step": 3228 + }, + { + "epoch": 0.2558130322836205, + "grad_norm": 1.6115811337131152, + "learning_rate": 1.7442941235693058e-05, + "loss": 0.4664, + "step": 3229 + }, + { + "epoch": 0.2558922558922559, + "grad_norm": 1.7156195058998613, + "learning_rate": 1.744122726131412e-05, + "loss": 0.4192, + "step": 3230 + }, + { + "epoch": 0.2559714795008913, + "grad_norm": 1.9625487708419151, + "learning_rate": 1.7439512796966165e-05, + "loss": 0.4409, + "step": 3231 + }, + { + "epoch": 0.25605070310952666, + "grad_norm": 1.4591860207755285, + "learning_rate": 1.7437797842762098e-05, + "loss": 0.3282, + "step": 3232 + }, + { + "epoch": 0.25612992671816204, + "grad_norm": 1.77300957976758, + "learning_rate": 1.743608239881483e-05, + "loss": 0.3466, + "step": 3233 + }, + { + "epoch": 0.25620915032679736, + "grad_norm": 1.3599882542683719, + "learning_rate": 1.7434366465237312e-05, + "loss": 0.2755, + "step": 3234 + }, + { + "epoch": 0.25628837393543275, + "grad_norm": 1.7019073937903981, + "learning_rate": 1.7432650042142535e-05, + "loss": 0.4418, + "step": 3235 + }, + { + "epoch": 0.2563675975440681, + "grad_norm": 1.8245809277019283, + "learning_rate": 1.743093312964352e-05, + "loss": 0.453, + "step": 3236 + }, + { + "epoch": 0.2564468211527035, + "grad_norm": 1.709948353151467, + "learning_rate": 1.742921572785331e-05, + "loss": 0.49, + "step": 3237 + }, + { + "epoch": 0.2565260447613389, + "grad_norm": 1.6703575384654685, + "learning_rate": 1.7427497836884995e-05, + "loss": 0.3408, + "step": 3238 + }, + { + "epoch": 0.25660526836997427, + "grad_norm": 1.5532255982849867, + "learning_rate": 1.7425779456851683e-05, + "loss": 0.3392, + "step": 3239 + }, + { + "epoch": 0.25668449197860965, + "grad_norm": 1.7879256118800355, + "learning_rate": 1.7424060587866526e-05, + "loss": 0.5296, + "step": 3240 + }, + { + "epoch": 0.256763715587245, + "grad_norm": 2.206171587689333, + "learning_rate": 1.74223412300427e-05, + "loss": 0.4342, + "step": 3241 + }, + { + "epoch": 0.25684293919588036, + "grad_norm": 1.4817479413885015, + "learning_rate": 1.7420621383493423e-05, + "loss": 0.271, + "step": 3242 + }, + { + "epoch": 0.25692216280451574, + "grad_norm": 1.7800792075635854, + "learning_rate": 1.7418901048331927e-05, + "loss": 0.4097, + "step": 3243 + }, + { + "epoch": 0.2570013864131511, + "grad_norm": 1.6663641434176393, + "learning_rate": 1.7417180224671497e-05, + "loss": 0.4495, + "step": 3244 + }, + { + "epoch": 0.2570806100217865, + "grad_norm": 1.6227235508242879, + "learning_rate": 1.741545891262544e-05, + "loss": 0.3635, + "step": 3245 + }, + { + "epoch": 0.2571598336304219, + "grad_norm": 1.8956225105118398, + "learning_rate": 1.7413737112307092e-05, + "loss": 0.4964, + "step": 3246 + }, + { + "epoch": 0.2572390572390572, + "grad_norm": 1.4715692775762617, + "learning_rate": 1.741201482382983e-05, + "loss": 0.3367, + "step": 3247 + }, + { + "epoch": 0.2573182808476926, + "grad_norm": 1.5415910700017137, + "learning_rate": 1.7410292047307054e-05, + "loss": 0.3601, + "step": 3248 + }, + { + "epoch": 0.257397504456328, + "grad_norm": 1.3956682106832476, + "learning_rate": 1.7408568782852204e-05, + "loss": 0.3177, + "step": 3249 + }, + { + "epoch": 0.25747672806496336, + "grad_norm": 1.6911254880516622, + "learning_rate": 1.7406845030578747e-05, + "loss": 0.4653, + "step": 3250 + }, + { + "epoch": 0.25755595167359874, + "grad_norm": 1.5339807389303075, + "learning_rate": 1.7405120790600185e-05, + "loss": 0.3131, + "step": 3251 + }, + { + "epoch": 0.2576351752822341, + "grad_norm": 1.6271599584279126, + "learning_rate": 1.740339606303005e-05, + "loss": 0.3545, + "step": 3252 + }, + { + "epoch": 0.2577143988908695, + "grad_norm": 1.1668484645949957, + "learning_rate": 1.7401670847981906e-05, + "loss": 0.3353, + "step": 3253 + }, + { + "epoch": 0.2577936224995048, + "grad_norm": 1.634604353439678, + "learning_rate": 1.7399945145569353e-05, + "loss": 0.359, + "step": 3254 + }, + { + "epoch": 0.2578728461081402, + "grad_norm": 1.4617874407314366, + "learning_rate": 1.7398218955906017e-05, + "loss": 0.3457, + "step": 3255 + }, + { + "epoch": 0.2579520697167756, + "grad_norm": 1.8205880583034841, + "learning_rate": 1.7396492279105562e-05, + "loss": 0.4108, + "step": 3256 + }, + { + "epoch": 0.25803129332541097, + "grad_norm": 2.040877014179288, + "learning_rate": 1.7394765115281678e-05, + "loss": 0.3658, + "step": 3257 + }, + { + "epoch": 0.25811051693404635, + "grad_norm": 1.6206207362456535, + "learning_rate": 1.7393037464548094e-05, + "loss": 0.3229, + "step": 3258 + }, + { + "epoch": 0.25818974054268173, + "grad_norm": 1.6121485523473766, + "learning_rate": 1.7391309327018566e-05, + "loss": 0.3437, + "step": 3259 + }, + { + "epoch": 0.2582689641513171, + "grad_norm": 1.4122326007324562, + "learning_rate": 1.7389580702806884e-05, + "loss": 0.3247, + "step": 3260 + }, + { + "epoch": 0.25834818775995244, + "grad_norm": 2.0386769280130657, + "learning_rate": 1.7387851592026868e-05, + "loss": 0.4364, + "step": 3261 + }, + { + "epoch": 0.2584274113685878, + "grad_norm": 1.7136696001880456, + "learning_rate": 1.738612199479237e-05, + "loss": 0.4282, + "step": 3262 + }, + { + "epoch": 0.2585066349772232, + "grad_norm": 1.8339871927007187, + "learning_rate": 1.7384391911217283e-05, + "loss": 0.4445, + "step": 3263 + }, + { + "epoch": 0.2585858585858586, + "grad_norm": 1.5219299345390735, + "learning_rate": 1.738266134141552e-05, + "loss": 0.4529, + "step": 3264 + }, + { + "epoch": 0.25866508219449397, + "grad_norm": 1.9120484964893236, + "learning_rate": 1.738093028550103e-05, + "loss": 0.4504, + "step": 3265 + }, + { + "epoch": 0.25874430580312935, + "grad_norm": 1.753636552923268, + "learning_rate": 1.7379198743587794e-05, + "loss": 0.3837, + "step": 3266 + }, + { + "epoch": 0.25882352941176473, + "grad_norm": 1.3001908948057963, + "learning_rate": 1.7377466715789828e-05, + "loss": 0.3077, + "step": 3267 + }, + { + "epoch": 0.25890275302040006, + "grad_norm": 1.5332872581989534, + "learning_rate": 1.7375734202221174e-05, + "loss": 0.3004, + "step": 3268 + }, + { + "epoch": 0.25898197662903544, + "grad_norm": 1.3683319559779654, + "learning_rate": 1.7374001202995918e-05, + "loss": 0.273, + "step": 3269 + }, + { + "epoch": 0.2590612002376708, + "grad_norm": 1.3785631799196048, + "learning_rate": 1.7372267718228163e-05, + "loss": 0.2715, + "step": 3270 + }, + { + "epoch": 0.2591404238463062, + "grad_norm": 1.6523402383212709, + "learning_rate": 1.7370533748032047e-05, + "loss": 0.3255, + "step": 3271 + }, + { + "epoch": 0.2592196474549416, + "grad_norm": 1.7100681060049254, + "learning_rate": 1.7368799292521754e-05, + "loss": 0.367, + "step": 3272 + }, + { + "epoch": 0.25929887106357696, + "grad_norm": 1.8381154197824574, + "learning_rate": 1.736706435181148e-05, + "loss": 0.4882, + "step": 3273 + }, + { + "epoch": 0.25937809467221234, + "grad_norm": 1.7769185714791518, + "learning_rate": 1.736532892601547e-05, + "loss": 0.3668, + "step": 3274 + }, + { + "epoch": 0.25945731828084767, + "grad_norm": 1.614739880032071, + "learning_rate": 1.7363593015247987e-05, + "loss": 0.4068, + "step": 3275 + }, + { + "epoch": 0.25953654188948305, + "grad_norm": 1.6686526054706903, + "learning_rate": 1.7361856619623338e-05, + "loss": 0.3517, + "step": 3276 + }, + { + "epoch": 0.25961576549811843, + "grad_norm": 1.3845379465465386, + "learning_rate": 1.736011973925585e-05, + "loss": 0.2664, + "step": 3277 + }, + { + "epoch": 0.2596949891067538, + "grad_norm": 1.8041947719154796, + "learning_rate": 1.7358382374259895e-05, + "loss": 0.4285, + "step": 3278 + }, + { + "epoch": 0.2597742127153892, + "grad_norm": 2.101592644959429, + "learning_rate": 1.7356644524749867e-05, + "loss": 0.4763, + "step": 3279 + }, + { + "epoch": 0.2598534363240246, + "grad_norm": 1.7127252719806318, + "learning_rate": 1.7354906190840194e-05, + "loss": 0.3894, + "step": 3280 + }, + { + "epoch": 0.25993265993265996, + "grad_norm": 1.9155344662945042, + "learning_rate": 1.7353167372645337e-05, + "loss": 0.4061, + "step": 3281 + }, + { + "epoch": 0.2600118835412953, + "grad_norm": 1.537353996431871, + "learning_rate": 1.735142807027979e-05, + "loss": 0.3862, + "step": 3282 + }, + { + "epoch": 0.26009110714993067, + "grad_norm": 1.8639382077306432, + "learning_rate": 1.734968828385808e-05, + "loss": 0.4072, + "step": 3283 + }, + { + "epoch": 0.26017033075856605, + "grad_norm": 1.5517712988859211, + "learning_rate": 1.7347948013494758e-05, + "loss": 0.332, + "step": 3284 + }, + { + "epoch": 0.26024955436720143, + "grad_norm": 1.5701971241474542, + "learning_rate": 1.7346207259304415e-05, + "loss": 0.4073, + "step": 3285 + }, + { + "epoch": 0.2603287779758368, + "grad_norm": 1.619479916136069, + "learning_rate": 1.7344466021401673e-05, + "loss": 0.429, + "step": 3286 + }, + { + "epoch": 0.2604080015844722, + "grad_norm": 1.3688675002118789, + "learning_rate": 1.734272429990118e-05, + "loss": 0.3021, + "step": 3287 + }, + { + "epoch": 0.2604872251931075, + "grad_norm": 1.7996011501403537, + "learning_rate": 1.7340982094917627e-05, + "loss": 0.4407, + "step": 3288 + }, + { + "epoch": 0.2605664488017429, + "grad_norm": 1.5205398410631559, + "learning_rate": 1.7339239406565723e-05, + "loss": 0.2782, + "step": 3289 + }, + { + "epoch": 0.2606456724103783, + "grad_norm": 1.5345973807606839, + "learning_rate": 1.733749623496022e-05, + "loss": 0.3287, + "step": 3290 + }, + { + "epoch": 0.26072489601901366, + "grad_norm": 1.4237933502086417, + "learning_rate": 1.7335752580215898e-05, + "loss": 0.2837, + "step": 3291 + }, + { + "epoch": 0.26080411962764904, + "grad_norm": 1.5194990844878373, + "learning_rate": 1.733400844244756e-05, + "loss": 0.3551, + "step": 3292 + }, + { + "epoch": 0.2608833432362844, + "grad_norm": 1.4201031104343034, + "learning_rate": 1.733226382177006e-05, + "loss": 0.3439, + "step": 3293 + }, + { + "epoch": 0.2609625668449198, + "grad_norm": 1.9310239268394354, + "learning_rate": 1.7330518718298263e-05, + "loss": 0.3837, + "step": 3294 + }, + { + "epoch": 0.26104179045355513, + "grad_norm": 1.8131192379395416, + "learning_rate": 1.7328773132147086e-05, + "loss": 0.3797, + "step": 3295 + }, + { + "epoch": 0.2611210140621905, + "grad_norm": 1.8352342399026462, + "learning_rate": 1.732702706343146e-05, + "loss": 0.4536, + "step": 3296 + }, + { + "epoch": 0.2612002376708259, + "grad_norm": 1.7530039791797933, + "learning_rate": 1.7325280512266357e-05, + "loss": 0.4423, + "step": 3297 + }, + { + "epoch": 0.2612794612794613, + "grad_norm": 1.5180542285600627, + "learning_rate": 1.7323533478766777e-05, + "loss": 0.3463, + "step": 3298 + }, + { + "epoch": 0.26135868488809666, + "grad_norm": 1.5185056587650425, + "learning_rate": 1.732178596304776e-05, + "loss": 0.3702, + "step": 3299 + }, + { + "epoch": 0.26143790849673204, + "grad_norm": 1.4751522151610112, + "learning_rate": 1.7320037965224365e-05, + "loss": 0.2908, + "step": 3300 + }, + { + "epoch": 0.2615171321053674, + "grad_norm": 1.6631568912451127, + "learning_rate": 1.731828948541169e-05, + "loss": 0.3441, + "step": 3301 + }, + { + "epoch": 0.26159635571400275, + "grad_norm": 1.9358517600779925, + "learning_rate": 1.731654052372487e-05, + "loss": 0.3505, + "step": 3302 + }, + { + "epoch": 0.26167557932263813, + "grad_norm": 1.7128564417616743, + "learning_rate": 1.731479108027906e-05, + "loss": 0.4425, + "step": 3303 + }, + { + "epoch": 0.2617548029312735, + "grad_norm": 1.9449670266261203, + "learning_rate": 1.7313041155189454e-05, + "loss": 0.5081, + "step": 3304 + }, + { + "epoch": 0.2618340265399089, + "grad_norm": 1.7524071242720565, + "learning_rate": 1.7311290748571273e-05, + "loss": 0.3776, + "step": 3305 + }, + { + "epoch": 0.2619132501485443, + "grad_norm": 1.6417450037095627, + "learning_rate": 1.7309539860539783e-05, + "loss": 0.3596, + "step": 3306 + }, + { + "epoch": 0.26199247375717966, + "grad_norm": 1.5566731603845965, + "learning_rate": 1.7307788491210257e-05, + "loss": 0.3483, + "step": 3307 + }, + { + "epoch": 0.26207169736581504, + "grad_norm": 1.7211189297295166, + "learning_rate": 1.7306036640698024e-05, + "loss": 0.3337, + "step": 3308 + }, + { + "epoch": 0.26215092097445036, + "grad_norm": 1.8411600333616034, + "learning_rate": 1.7304284309118436e-05, + "loss": 0.533, + "step": 3309 + }, + { + "epoch": 0.26223014458308574, + "grad_norm": 1.5722581157641682, + "learning_rate": 1.7302531496586866e-05, + "loss": 0.4684, + "step": 3310 + }, + { + "epoch": 0.2623093681917211, + "grad_norm": 1.5763170390840733, + "learning_rate": 1.730077820321874e-05, + "loss": 0.3724, + "step": 3311 + }, + { + "epoch": 0.2623885918003565, + "grad_norm": 1.5864109089882439, + "learning_rate": 1.7299024429129497e-05, + "loss": 0.3388, + "step": 3312 + }, + { + "epoch": 0.2624678154089919, + "grad_norm": 1.423259744318166, + "learning_rate": 1.7297270174434613e-05, + "loss": 0.36, + "step": 3313 + }, + { + "epoch": 0.26254703901762727, + "grad_norm": 1.61098808275931, + "learning_rate": 1.7295515439249608e-05, + "loss": 0.3721, + "step": 3314 + }, + { + "epoch": 0.26262626262626265, + "grad_norm": 1.6476951663832888, + "learning_rate": 1.7293760223690008e-05, + "loss": 0.5019, + "step": 3315 + }, + { + "epoch": 0.262705486234898, + "grad_norm": 1.7424649832854857, + "learning_rate": 1.729200452787139e-05, + "loss": 0.3878, + "step": 3316 + }, + { + "epoch": 0.26278470984353336, + "grad_norm": 1.4927138998244718, + "learning_rate": 1.729024835190937e-05, + "loss": 0.3379, + "step": 3317 + }, + { + "epoch": 0.26286393345216874, + "grad_norm": 1.3498204992430085, + "learning_rate": 1.7288491695919567e-05, + "loss": 0.2768, + "step": 3318 + }, + { + "epoch": 0.2629431570608041, + "grad_norm": 1.792856069684127, + "learning_rate": 1.728673456001766e-05, + "loss": 0.3524, + "step": 3319 + }, + { + "epoch": 0.2630223806694395, + "grad_norm": 1.4390018950690866, + "learning_rate": 1.728497694431934e-05, + "loss": 0.3075, + "step": 3320 + }, + { + "epoch": 0.2631016042780749, + "grad_norm": 1.9079963667902557, + "learning_rate": 1.7283218848940344e-05, + "loss": 0.4084, + "step": 3321 + }, + { + "epoch": 0.26318082788671027, + "grad_norm": 1.986725560597437, + "learning_rate": 1.728146027399643e-05, + "loss": 0.3944, + "step": 3322 + }, + { + "epoch": 0.2632600514953456, + "grad_norm": 1.7253385507472883, + "learning_rate": 1.7279701219603394e-05, + "loss": 0.4065, + "step": 3323 + }, + { + "epoch": 0.263339275103981, + "grad_norm": 1.5853531029605916, + "learning_rate": 1.727794168587706e-05, + "loss": 0.3435, + "step": 3324 + }, + { + "epoch": 0.26341849871261636, + "grad_norm": 1.5907427790014843, + "learning_rate": 1.7276181672933287e-05, + "loss": 0.4163, + "step": 3325 + }, + { + "epoch": 0.26349772232125174, + "grad_norm": 1.5624424177004532, + "learning_rate": 1.7274421180887958e-05, + "loss": 0.3467, + "step": 3326 + }, + { + "epoch": 0.2635769459298871, + "grad_norm": 1.583032504542314, + "learning_rate": 1.7272660209857e-05, + "loss": 0.3606, + "step": 3327 + }, + { + "epoch": 0.2636561695385225, + "grad_norm": 1.7219062864971715, + "learning_rate": 1.727089875995636e-05, + "loss": 0.3901, + "step": 3328 + }, + { + "epoch": 0.2637353931471578, + "grad_norm": 2.0512689513697424, + "learning_rate": 1.726913683130202e-05, + "loss": 0.4817, + "step": 3329 + }, + { + "epoch": 0.2638146167557932, + "grad_norm": 1.9662867938735469, + "learning_rate": 1.7267374424009998e-05, + "loss": 0.3973, + "step": 3330 + }, + { + "epoch": 0.2638938403644286, + "grad_norm": 1.7380230192425585, + "learning_rate": 1.726561153819634e-05, + "loss": 0.4458, + "step": 3331 + }, + { + "epoch": 0.26397306397306397, + "grad_norm": 1.4692365484664618, + "learning_rate": 1.7263848173977122e-05, + "loss": 0.301, + "step": 3332 + }, + { + "epoch": 0.26405228758169935, + "grad_norm": 1.407761262354165, + "learning_rate": 1.726208433146845e-05, + "loss": 0.3355, + "step": 3333 + }, + { + "epoch": 0.26413151119033473, + "grad_norm": 1.561809652869918, + "learning_rate": 1.726032001078647e-05, + "loss": 0.3287, + "step": 3334 + }, + { + "epoch": 0.2642107347989701, + "grad_norm": 1.527523248887085, + "learning_rate": 1.725855521204735e-05, + "loss": 0.3405, + "step": 3335 + }, + { + "epoch": 0.26428995840760544, + "grad_norm": 1.45554849608881, + "learning_rate": 1.7256789935367296e-05, + "loss": 0.3084, + "step": 3336 + }, + { + "epoch": 0.2643691820162408, + "grad_norm": 1.3737044393835163, + "learning_rate": 1.7255024180862546e-05, + "loss": 0.2936, + "step": 3337 + }, + { + "epoch": 0.2644484056248762, + "grad_norm": 1.7634863946887358, + "learning_rate": 1.7253257948649357e-05, + "loss": 0.3462, + "step": 3338 + }, + { + "epoch": 0.2645276292335116, + "grad_norm": 1.4938558323530604, + "learning_rate": 1.7251491238844038e-05, + "loss": 0.318, + "step": 3339 + }, + { + "epoch": 0.26460685284214697, + "grad_norm": 1.5452151131431804, + "learning_rate": 1.7249724051562905e-05, + "loss": 0.2942, + "step": 3340 + }, + { + "epoch": 0.26468607645078235, + "grad_norm": 1.660362710985787, + "learning_rate": 1.7247956386922334e-05, + "loss": 0.3156, + "step": 3341 + }, + { + "epoch": 0.26476530005941773, + "grad_norm": 1.9626762286954302, + "learning_rate": 1.7246188245038705e-05, + "loss": 0.414, + "step": 3342 + }, + { + "epoch": 0.26484452366805306, + "grad_norm": 2.1202140891212604, + "learning_rate": 1.7244419626028454e-05, + "loss": 0.5136, + "step": 3343 + }, + { + "epoch": 0.26492374727668844, + "grad_norm": 1.3501153366686796, + "learning_rate": 1.724265053000802e-05, + "loss": 0.2859, + "step": 3344 + }, + { + "epoch": 0.2650029708853238, + "grad_norm": 1.6756306919223005, + "learning_rate": 1.7240880957093903e-05, + "loss": 0.3199, + "step": 3345 + }, + { + "epoch": 0.2650821944939592, + "grad_norm": 1.6808831385906877, + "learning_rate": 1.7239110907402615e-05, + "loss": 0.3344, + "step": 3346 + }, + { + "epoch": 0.2651614181025946, + "grad_norm": 1.7225330801956422, + "learning_rate": 1.72373403810507e-05, + "loss": 0.3757, + "step": 3347 + }, + { + "epoch": 0.26524064171122996, + "grad_norm": 1.6804858499925897, + "learning_rate": 1.7235569378154752e-05, + "loss": 0.3361, + "step": 3348 + }, + { + "epoch": 0.26531986531986534, + "grad_norm": 1.6250276167029933, + "learning_rate": 1.7233797898831376e-05, + "loss": 0.3741, + "step": 3349 + }, + { + "epoch": 0.26539908892850067, + "grad_norm": 1.6045888552538323, + "learning_rate": 1.7232025943197213e-05, + "loss": 0.3235, + "step": 3350 + }, + { + "epoch": 0.26547831253713605, + "grad_norm": 1.6531223265752097, + "learning_rate": 1.723025351136894e-05, + "loss": 0.2859, + "step": 3351 + }, + { + "epoch": 0.26555753614577143, + "grad_norm": 1.7519217981551096, + "learning_rate": 1.722848060346326e-05, + "loss": 0.3767, + "step": 3352 + }, + { + "epoch": 0.2656367597544068, + "grad_norm": 1.4914775938723737, + "learning_rate": 1.7226707219596918e-05, + "loss": 0.3947, + "step": 3353 + }, + { + "epoch": 0.2657159833630422, + "grad_norm": 2.4259006023562626, + "learning_rate": 1.7224933359886676e-05, + "loss": 0.4762, + "step": 3354 + }, + { + "epoch": 0.2657952069716776, + "grad_norm": 2.3933572506990406, + "learning_rate": 1.7223159024449338e-05, + "loss": 0.5025, + "step": 3355 + }, + { + "epoch": 0.26587443058031296, + "grad_norm": 1.9928635626609925, + "learning_rate": 1.7221384213401732e-05, + "loss": 0.3798, + "step": 3356 + }, + { + "epoch": 0.2659536541889483, + "grad_norm": 1.6264149159310208, + "learning_rate": 1.7219608926860726e-05, + "loss": 0.3026, + "step": 3357 + }, + { + "epoch": 0.26603287779758367, + "grad_norm": 1.505716507760458, + "learning_rate": 1.721783316494321e-05, + "loss": 0.3416, + "step": 3358 + }, + { + "epoch": 0.26611210140621905, + "grad_norm": 1.7892726599381132, + "learning_rate": 1.7216056927766106e-05, + "loss": 0.4391, + "step": 3359 + }, + { + "epoch": 0.26619132501485443, + "grad_norm": 1.8659356473520252, + "learning_rate": 1.721428021544638e-05, + "loss": 0.3934, + "step": 3360 + }, + { + "epoch": 0.2662705486234898, + "grad_norm": 1.78167583266563, + "learning_rate": 1.7212503028101012e-05, + "loss": 0.4821, + "step": 3361 + }, + { + "epoch": 0.2663497722321252, + "grad_norm": 2.01762735741912, + "learning_rate": 1.721072536584702e-05, + "loss": 0.5091, + "step": 3362 + }, + { + "epoch": 0.2664289958407606, + "grad_norm": 1.5625136026564748, + "learning_rate": 1.7208947228801464e-05, + "loss": 0.3349, + "step": 3363 + }, + { + "epoch": 0.2665082194493959, + "grad_norm": 1.553926046566275, + "learning_rate": 1.7207168617081418e-05, + "loss": 0.3269, + "step": 3364 + }, + { + "epoch": 0.2665874430580313, + "grad_norm": 1.4846642854500622, + "learning_rate": 1.7205389530804e-05, + "loss": 0.2674, + "step": 3365 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.495955137748792, + "learning_rate": 1.7203609970086347e-05, + "loss": 0.3027, + "step": 3366 + }, + { + "epoch": 0.26674589027530204, + "grad_norm": 1.5461248926759212, + "learning_rate": 1.720182993504564e-05, + "loss": 0.3788, + "step": 3367 + }, + { + "epoch": 0.2668251138839374, + "grad_norm": 1.6554169100869451, + "learning_rate": 1.7200049425799087e-05, + "loss": 0.3942, + "step": 3368 + }, + { + "epoch": 0.2669043374925728, + "grad_norm": 1.4257443636439064, + "learning_rate": 1.7198268442463923e-05, + "loss": 0.3629, + "step": 3369 + }, + { + "epoch": 0.26698356110120813, + "grad_norm": 1.8431318748270273, + "learning_rate": 1.719648698515742e-05, + "loss": 0.2901, + "step": 3370 + }, + { + "epoch": 0.2670627847098435, + "grad_norm": 1.946288264880617, + "learning_rate": 1.7194705053996873e-05, + "loss": 0.3552, + "step": 3371 + }, + { + "epoch": 0.2671420083184789, + "grad_norm": 1.6385894155512677, + "learning_rate": 1.719292264909962e-05, + "loss": 0.4054, + "step": 3372 + }, + { + "epoch": 0.2672212319271143, + "grad_norm": 1.7888456740127965, + "learning_rate": 1.7191139770583015e-05, + "loss": 0.3932, + "step": 3373 + }, + { + "epoch": 0.26730045553574966, + "grad_norm": 1.491248320105741, + "learning_rate": 1.7189356418564463e-05, + "loss": 0.3565, + "step": 3374 + }, + { + "epoch": 0.26737967914438504, + "grad_norm": 1.5460736581095083, + "learning_rate": 1.7187572593161382e-05, + "loss": 0.2965, + "step": 3375 + }, + { + "epoch": 0.2674589027530204, + "grad_norm": 1.5293542309810222, + "learning_rate": 1.7185788294491232e-05, + "loss": 0.3729, + "step": 3376 + }, + { + "epoch": 0.26753812636165575, + "grad_norm": 1.3706419719168244, + "learning_rate": 1.7184003522671497e-05, + "loss": 0.2951, + "step": 3377 + }, + { + "epoch": 0.26761734997029113, + "grad_norm": 1.608347860554224, + "learning_rate": 1.7182218277819697e-05, + "loss": 0.2526, + "step": 3378 + }, + { + "epoch": 0.2676965735789265, + "grad_norm": 1.7595006193398166, + "learning_rate": 1.718043256005338e-05, + "loss": 0.3859, + "step": 3379 + }, + { + "epoch": 0.2677757971875619, + "grad_norm": 1.7559585724803755, + "learning_rate": 1.717864636949013e-05, + "loss": 0.2931, + "step": 3380 + }, + { + "epoch": 0.2678550207961973, + "grad_norm": 1.310024941510185, + "learning_rate": 1.7176859706247563e-05, + "loss": 0.3288, + "step": 3381 + }, + { + "epoch": 0.26793424440483266, + "grad_norm": 1.513550056577479, + "learning_rate": 1.717507257044331e-05, + "loss": 0.4051, + "step": 3382 + }, + { + "epoch": 0.26801346801346804, + "grad_norm": 1.2878885549535424, + "learning_rate": 1.717328496219506e-05, + "loss": 0.2257, + "step": 3383 + }, + { + "epoch": 0.26809269162210336, + "grad_norm": 2.0902664750080624, + "learning_rate": 1.7171496881620507e-05, + "loss": 0.4951, + "step": 3384 + }, + { + "epoch": 0.26817191523073874, + "grad_norm": 1.5848800451269245, + "learning_rate": 1.716970832883739e-05, + "loss": 0.3506, + "step": 3385 + }, + { + "epoch": 0.2682511388393741, + "grad_norm": 1.610290992074119, + "learning_rate": 1.716791930396348e-05, + "loss": 0.394, + "step": 3386 + }, + { + "epoch": 0.2683303624480095, + "grad_norm": 1.7897502883514513, + "learning_rate": 1.716612980711657e-05, + "loss": 0.4322, + "step": 3387 + }, + { + "epoch": 0.2684095860566449, + "grad_norm": 1.5437628925966387, + "learning_rate": 1.7164339838414496e-05, + "loss": 0.3179, + "step": 3388 + }, + { + "epoch": 0.26848880966528027, + "grad_norm": 1.4805249462135643, + "learning_rate": 1.7162549397975118e-05, + "loss": 0.2903, + "step": 3389 + }, + { + "epoch": 0.26856803327391565, + "grad_norm": 1.4974867630290967, + "learning_rate": 1.7160758485916325e-05, + "loss": 0.2499, + "step": 3390 + }, + { + "epoch": 0.268647256882551, + "grad_norm": 1.9093299717142924, + "learning_rate": 1.715896710235604e-05, + "loss": 0.3616, + "step": 3391 + }, + { + "epoch": 0.26872648049118636, + "grad_norm": 1.7232279379099156, + "learning_rate": 1.715717524741222e-05, + "loss": 0.3729, + "step": 3392 + }, + { + "epoch": 0.26880570409982174, + "grad_norm": 1.8772630322521775, + "learning_rate": 1.7155382921202844e-05, + "loss": 0.489, + "step": 3393 + }, + { + "epoch": 0.2688849277084571, + "grad_norm": 1.8797427093619954, + "learning_rate": 1.7153590123845938e-05, + "loss": 0.4683, + "step": 3394 + }, + { + "epoch": 0.2689641513170925, + "grad_norm": 1.7251500808938338, + "learning_rate": 1.715179685545954e-05, + "loss": 0.4664, + "step": 3395 + }, + { + "epoch": 0.2690433749257279, + "grad_norm": 1.500267294745199, + "learning_rate": 1.7150003116161734e-05, + "loss": 0.3243, + "step": 3396 + }, + { + "epoch": 0.26912259853436327, + "grad_norm": 1.6275369323286235, + "learning_rate": 1.714820890607062e-05, + "loss": 0.3648, + "step": 3397 + }, + { + "epoch": 0.2692018221429986, + "grad_norm": 1.7570445115889985, + "learning_rate": 1.714641422530435e-05, + "loss": 0.5036, + "step": 3398 + }, + { + "epoch": 0.269281045751634, + "grad_norm": 1.3641120873671535, + "learning_rate": 1.7144619073981088e-05, + "loss": 0.3056, + "step": 3399 + }, + { + "epoch": 0.26936026936026936, + "grad_norm": 1.4855812915601856, + "learning_rate": 1.7142823452219036e-05, + "loss": 0.3156, + "step": 3400 + }, + { + "epoch": 0.26943949296890474, + "grad_norm": 1.486186001565436, + "learning_rate": 1.714102736013643e-05, + "loss": 0.3461, + "step": 3401 + }, + { + "epoch": 0.2695187165775401, + "grad_norm": 1.218889634796814, + "learning_rate": 1.7139230797851537e-05, + "loss": 0.3196, + "step": 3402 + }, + { + "epoch": 0.2695979401861755, + "grad_norm": 1.756841247467928, + "learning_rate": 1.7137433765482644e-05, + "loss": 0.3885, + "step": 3403 + }, + { + "epoch": 0.2696771637948109, + "grad_norm": 1.8770515314755276, + "learning_rate": 1.713563626314808e-05, + "loss": 0.5336, + "step": 3404 + }, + { + "epoch": 0.2697563874034462, + "grad_norm": 1.4177512401656744, + "learning_rate": 1.71338382909662e-05, + "loss": 0.2604, + "step": 3405 + }, + { + "epoch": 0.2698356110120816, + "grad_norm": 1.5552309993682953, + "learning_rate": 1.71320398490554e-05, + "loss": 0.3866, + "step": 3406 + }, + { + "epoch": 0.26991483462071697, + "grad_norm": 1.6592529694386382, + "learning_rate": 1.713024093753409e-05, + "loss": 0.3642, + "step": 3407 + }, + { + "epoch": 0.26999405822935235, + "grad_norm": 1.7573719912049914, + "learning_rate": 1.7128441556520723e-05, + "loss": 0.333, + "step": 3408 + }, + { + "epoch": 0.27007328183798773, + "grad_norm": 1.5499733294923757, + "learning_rate": 1.7126641706133782e-05, + "loss": 0.3346, + "step": 3409 + }, + { + "epoch": 0.2701525054466231, + "grad_norm": 1.6953161001930295, + "learning_rate": 1.7124841386491774e-05, + "loss": 0.3896, + "step": 3410 + }, + { + "epoch": 0.27023172905525844, + "grad_norm": 1.3858870935401675, + "learning_rate": 1.7123040597713242e-05, + "loss": 0.2548, + "step": 3411 + }, + { + "epoch": 0.2703109526638938, + "grad_norm": 1.59478076231617, + "learning_rate": 1.7121239339916763e-05, + "loss": 0.3358, + "step": 3412 + }, + { + "epoch": 0.2703901762725292, + "grad_norm": 1.6838183742554305, + "learning_rate": 1.7119437613220936e-05, + "loss": 0.2646, + "step": 3413 + }, + { + "epoch": 0.2704693998811646, + "grad_norm": 1.6209859192175429, + "learning_rate": 1.71176354177444e-05, + "loss": 0.4011, + "step": 3414 + }, + { + "epoch": 0.27054862348979997, + "grad_norm": 1.5075837943245174, + "learning_rate": 1.711583275360582e-05, + "loss": 0.2835, + "step": 3415 + }, + { + "epoch": 0.27062784709843535, + "grad_norm": 1.655221148733233, + "learning_rate": 1.711402962092389e-05, + "loss": 0.3185, + "step": 3416 + }, + { + "epoch": 0.27070707070707073, + "grad_norm": 1.533622757074057, + "learning_rate": 1.7112226019817345e-05, + "loss": 0.2713, + "step": 3417 + }, + { + "epoch": 0.27078629431570606, + "grad_norm": 1.6629319939851415, + "learning_rate": 1.7110421950404935e-05, + "loss": 0.4364, + "step": 3418 + }, + { + "epoch": 0.27086551792434144, + "grad_norm": 1.6914745944315743, + "learning_rate": 1.710861741280545e-05, + "loss": 0.4079, + "step": 3419 + }, + { + "epoch": 0.2709447415329768, + "grad_norm": 1.4721714867478424, + "learning_rate": 1.710681240713772e-05, + "loss": 0.2924, + "step": 3420 + }, + { + "epoch": 0.2710239651416122, + "grad_norm": 1.7452123015392331, + "learning_rate": 1.7105006933520584e-05, + "loss": 0.3267, + "step": 3421 + }, + { + "epoch": 0.2711031887502476, + "grad_norm": 1.5274338558979625, + "learning_rate": 1.710320099207293e-05, + "loss": 0.2803, + "step": 3422 + }, + { + "epoch": 0.27118241235888296, + "grad_norm": 1.4656147474770578, + "learning_rate": 1.7101394582913667e-05, + "loss": 0.3104, + "step": 3423 + }, + { + "epoch": 0.27126163596751834, + "grad_norm": 1.3700754454017707, + "learning_rate": 1.709958770616174e-05, + "loss": 0.2974, + "step": 3424 + }, + { + "epoch": 0.27134085957615367, + "grad_norm": 1.3210020683231187, + "learning_rate": 1.7097780361936128e-05, + "loss": 0.2361, + "step": 3425 + }, + { + "epoch": 0.27142008318478905, + "grad_norm": 1.4043889058183885, + "learning_rate": 1.709597255035583e-05, + "loss": 0.3122, + "step": 3426 + }, + { + "epoch": 0.27149930679342443, + "grad_norm": 1.7570200327027756, + "learning_rate": 1.709416427153988e-05, + "loss": 0.4325, + "step": 3427 + }, + { + "epoch": 0.2715785304020598, + "grad_norm": 1.5317336147697496, + "learning_rate": 1.7092355525607352e-05, + "loss": 0.3623, + "step": 3428 + }, + { + "epoch": 0.2716577540106952, + "grad_norm": 1.6145151680008565, + "learning_rate": 1.7090546312677335e-05, + "loss": 0.333, + "step": 3429 + }, + { + "epoch": 0.2717369776193306, + "grad_norm": 1.6430748318588881, + "learning_rate": 1.7088736632868964e-05, + "loss": 0.3505, + "step": 3430 + }, + { + "epoch": 0.27181620122796596, + "grad_norm": 1.6904084585860188, + "learning_rate": 1.7086926486301393e-05, + "loss": 0.3139, + "step": 3431 + }, + { + "epoch": 0.2718954248366013, + "grad_norm": 1.8389760852783352, + "learning_rate": 1.7085115873093814e-05, + "loss": 0.3519, + "step": 3432 + }, + { + "epoch": 0.27197464844523667, + "grad_norm": 1.4048243211770552, + "learning_rate": 1.7083304793365445e-05, + "loss": 0.3459, + "step": 3433 + }, + { + "epoch": 0.27205387205387205, + "grad_norm": 1.4151686475921654, + "learning_rate": 1.7081493247235537e-05, + "loss": 0.3709, + "step": 3434 + }, + { + "epoch": 0.27213309566250743, + "grad_norm": 1.625735936780301, + "learning_rate": 1.7079681234823374e-05, + "loss": 0.3908, + "step": 3435 + }, + { + "epoch": 0.2722123192711428, + "grad_norm": 1.6669548596155164, + "learning_rate": 1.7077868756248265e-05, + "loss": 0.4416, + "step": 3436 + }, + { + "epoch": 0.2722915428797782, + "grad_norm": 1.6006588096745782, + "learning_rate": 1.7076055811629556e-05, + "loss": 0.4119, + "step": 3437 + }, + { + "epoch": 0.2723707664884136, + "grad_norm": 1.667564668993029, + "learning_rate": 1.7074242401086623e-05, + "loss": 0.3719, + "step": 3438 + }, + { + "epoch": 0.2724499900970489, + "grad_norm": 1.7789692914448836, + "learning_rate": 1.7072428524738865e-05, + "loss": 0.4219, + "step": 3439 + }, + { + "epoch": 0.2725292137056843, + "grad_norm": 1.4521293138855655, + "learning_rate": 1.707061418270572e-05, + "loss": 0.357, + "step": 3440 + }, + { + "epoch": 0.27260843731431966, + "grad_norm": 1.5012763092846912, + "learning_rate": 1.706879937510665e-05, + "loss": 0.409, + "step": 3441 + }, + { + "epoch": 0.27268766092295504, + "grad_norm": 1.6692016419563391, + "learning_rate": 1.7066984102061155e-05, + "loss": 0.379, + "step": 3442 + }, + { + "epoch": 0.2727668845315904, + "grad_norm": 1.7448511882401803, + "learning_rate": 1.706516836368876e-05, + "loss": 0.4669, + "step": 3443 + }, + { + "epoch": 0.2728461081402258, + "grad_norm": 1.3450045257317975, + "learning_rate": 1.7063352160109026e-05, + "loss": 0.3617, + "step": 3444 + }, + { + "epoch": 0.27292533174886113, + "grad_norm": 1.6888020855972348, + "learning_rate": 1.7061535491441538e-05, + "loss": 0.485, + "step": 3445 + }, + { + "epoch": 0.2730045553574965, + "grad_norm": 2.0743685029768355, + "learning_rate": 1.7059718357805915e-05, + "loss": 0.4779, + "step": 3446 + }, + { + "epoch": 0.2730837789661319, + "grad_norm": 1.6506907663741655, + "learning_rate": 1.705790075932181e-05, + "loss": 0.3791, + "step": 3447 + }, + { + "epoch": 0.2731630025747673, + "grad_norm": 2.0570297165198115, + "learning_rate": 1.7056082696108896e-05, + "loss": 0.4215, + "step": 3448 + }, + { + "epoch": 0.27324222618340266, + "grad_norm": 1.7779728335131182, + "learning_rate": 1.7054264168286892e-05, + "loss": 0.3329, + "step": 3449 + }, + { + "epoch": 0.27332144979203804, + "grad_norm": 1.3885423578645373, + "learning_rate": 1.7052445175975533e-05, + "loss": 0.2732, + "step": 3450 + }, + { + "epoch": 0.2734006734006734, + "grad_norm": 1.8481845829444703, + "learning_rate": 1.7050625719294593e-05, + "loss": 0.3973, + "step": 3451 + }, + { + "epoch": 0.27347989700930875, + "grad_norm": 1.5123705819660078, + "learning_rate": 1.7048805798363876e-05, + "loss": 0.2943, + "step": 3452 + }, + { + "epoch": 0.27355912061794413, + "grad_norm": 1.786858506354238, + "learning_rate": 1.7046985413303215e-05, + "loss": 0.4477, + "step": 3453 + }, + { + "epoch": 0.2736383442265795, + "grad_norm": 1.5471780854791461, + "learning_rate": 1.7045164564232474e-05, + "loss": 0.3578, + "step": 3454 + }, + { + "epoch": 0.2737175678352149, + "grad_norm": 1.7805195904238957, + "learning_rate": 1.704334325127154e-05, + "loss": 0.2828, + "step": 3455 + }, + { + "epoch": 0.2737967914438503, + "grad_norm": 1.732430938778358, + "learning_rate": 1.704152147454035e-05, + "loss": 0.2986, + "step": 3456 + }, + { + "epoch": 0.27387601505248566, + "grad_norm": 1.758498757053166, + "learning_rate": 1.7039699234158846e-05, + "loss": 0.423, + "step": 3457 + }, + { + "epoch": 0.27395523866112104, + "grad_norm": 1.6006205311132922, + "learning_rate": 1.7037876530247025e-05, + "loss": 0.3457, + "step": 3458 + }, + { + "epoch": 0.27403446226975636, + "grad_norm": 1.3896396579463255, + "learning_rate": 1.7036053362924896e-05, + "loss": 0.3312, + "step": 3459 + }, + { + "epoch": 0.27411368587839174, + "grad_norm": 1.764344372472803, + "learning_rate": 1.7034229732312512e-05, + "loss": 0.3944, + "step": 3460 + }, + { + "epoch": 0.2741929094870271, + "grad_norm": 1.7081658344683073, + "learning_rate": 1.703240563852994e-05, + "loss": 0.4221, + "step": 3461 + }, + { + "epoch": 0.2742721330956625, + "grad_norm": 1.7209613237632448, + "learning_rate": 1.70305810816973e-05, + "loss": 0.3428, + "step": 3462 + }, + { + "epoch": 0.2743513567042979, + "grad_norm": 1.8368380787938738, + "learning_rate": 1.7028756061934722e-05, + "loss": 0.3526, + "step": 3463 + }, + { + "epoch": 0.27443058031293327, + "grad_norm": 1.5457859204276865, + "learning_rate": 1.702693057936238e-05, + "loss": 0.3908, + "step": 3464 + }, + { + "epoch": 0.27450980392156865, + "grad_norm": 1.756414280743525, + "learning_rate": 1.702510463410047e-05, + "loss": 0.4255, + "step": 3465 + }, + { + "epoch": 0.274589027530204, + "grad_norm": 1.5175551884141112, + "learning_rate": 1.7023278226269222e-05, + "loss": 0.3431, + "step": 3466 + }, + { + "epoch": 0.27466825113883936, + "grad_norm": 1.763552152346543, + "learning_rate": 1.7021451355988895e-05, + "loss": 0.4228, + "step": 3467 + }, + { + "epoch": 0.27474747474747474, + "grad_norm": 1.2026595961010276, + "learning_rate": 1.7019624023379784e-05, + "loss": 0.2514, + "step": 3468 + }, + { + "epoch": 0.2748266983561101, + "grad_norm": 1.691867059457729, + "learning_rate": 1.7017796228562206e-05, + "loss": 0.5292, + "step": 3469 + }, + { + "epoch": 0.2749059219647455, + "grad_norm": 1.560317059000484, + "learning_rate": 1.7015967971656513e-05, + "loss": 0.3913, + "step": 3470 + }, + { + "epoch": 0.2749851455733809, + "grad_norm": 1.4002738026635486, + "learning_rate": 1.7014139252783092e-05, + "loss": 0.3197, + "step": 3471 + }, + { + "epoch": 0.27506436918201627, + "grad_norm": 1.6314215963213896, + "learning_rate": 1.7012310072062348e-05, + "loss": 0.2881, + "step": 3472 + }, + { + "epoch": 0.2751435927906516, + "grad_norm": 1.4151646134979659, + "learning_rate": 1.7010480429614726e-05, + "loss": 0.3346, + "step": 3473 + }, + { + "epoch": 0.275222816399287, + "grad_norm": 1.650463046409657, + "learning_rate": 1.70086503255607e-05, + "loss": 0.3368, + "step": 3474 + }, + { + "epoch": 0.27530204000792236, + "grad_norm": 1.624409202663714, + "learning_rate": 1.7006819760020773e-05, + "loss": 0.4098, + "step": 3475 + }, + { + "epoch": 0.27538126361655774, + "grad_norm": 1.4761181870737787, + "learning_rate": 1.700498873311548e-05, + "loss": 0.3473, + "step": 3476 + }, + { + "epoch": 0.2754604872251931, + "grad_norm": 1.5863268081605195, + "learning_rate": 1.7003157244965387e-05, + "loss": 0.4174, + "step": 3477 + }, + { + "epoch": 0.2755397108338285, + "grad_norm": 1.6749235256213921, + "learning_rate": 1.700132529569109e-05, + "loss": 0.3959, + "step": 3478 + }, + { + "epoch": 0.2756189344424639, + "grad_norm": 1.7993624870938862, + "learning_rate": 1.69994928854132e-05, + "loss": 0.4267, + "step": 3479 + }, + { + "epoch": 0.2756981580510992, + "grad_norm": 1.8977725110457364, + "learning_rate": 1.6997660014252392e-05, + "loss": 0.4097, + "step": 3480 + }, + { + "epoch": 0.2757773816597346, + "grad_norm": 1.6812032829572645, + "learning_rate": 1.699582668232934e-05, + "loss": 0.4293, + "step": 3481 + }, + { + "epoch": 0.27585660526836997, + "grad_norm": 1.4960697291126734, + "learning_rate": 1.6993992889764758e-05, + "loss": 0.3356, + "step": 3482 + }, + { + "epoch": 0.27593582887700535, + "grad_norm": 1.5295950997830625, + "learning_rate": 1.69921586366794e-05, + "loss": 0.3528, + "step": 3483 + }, + { + "epoch": 0.27601505248564073, + "grad_norm": 1.629124127734205, + "learning_rate": 1.6990323923194042e-05, + "loss": 0.4128, + "step": 3484 + }, + { + "epoch": 0.2760942760942761, + "grad_norm": 1.799205153426002, + "learning_rate": 1.698848874942949e-05, + "loss": 0.4616, + "step": 3485 + }, + { + "epoch": 0.27617349970291144, + "grad_norm": 1.4499513586437884, + "learning_rate": 1.698665311550658e-05, + "loss": 0.3323, + "step": 3486 + }, + { + "epoch": 0.2762527233115468, + "grad_norm": 1.6006614593568531, + "learning_rate": 1.6984817021546177e-05, + "loss": 0.3606, + "step": 3487 + }, + { + "epoch": 0.2763319469201822, + "grad_norm": 1.6802327549415106, + "learning_rate": 1.6982980467669183e-05, + "loss": 0.4688, + "step": 3488 + }, + { + "epoch": 0.2764111705288176, + "grad_norm": 1.5479145940067485, + "learning_rate": 1.6981143453996524e-05, + "loss": 0.2289, + "step": 3489 + }, + { + "epoch": 0.27649039413745297, + "grad_norm": 1.506043670783407, + "learning_rate": 1.697930598064916e-05, + "loss": 0.3444, + "step": 3490 + }, + { + "epoch": 0.27656961774608835, + "grad_norm": 1.7625349504827108, + "learning_rate": 1.697746804774808e-05, + "loss": 0.4255, + "step": 3491 + }, + { + "epoch": 0.27664884135472373, + "grad_norm": 1.5368530244966128, + "learning_rate": 1.6975629655414304e-05, + "loss": 0.303, + "step": 3492 + }, + { + "epoch": 0.27672806496335906, + "grad_norm": 1.6029409993459238, + "learning_rate": 1.6973790803768875e-05, + "loss": 0.3902, + "step": 3493 + }, + { + "epoch": 0.27680728857199444, + "grad_norm": 1.5809861960759883, + "learning_rate": 1.6971951492932882e-05, + "loss": 0.2595, + "step": 3494 + }, + { + "epoch": 0.2768865121806298, + "grad_norm": 1.9003975628676633, + "learning_rate": 1.697011172302743e-05, + "loss": 0.4297, + "step": 3495 + }, + { + "epoch": 0.2769657357892652, + "grad_norm": 1.7487107537465267, + "learning_rate": 1.696827149417366e-05, + "loss": 0.4236, + "step": 3496 + }, + { + "epoch": 0.2770449593979006, + "grad_norm": 1.5212680768840097, + "learning_rate": 1.696643080649274e-05, + "loss": 0.3356, + "step": 3497 + }, + { + "epoch": 0.27712418300653596, + "grad_norm": 1.9136349947741031, + "learning_rate": 1.696458966010587e-05, + "loss": 0.4792, + "step": 3498 + }, + { + "epoch": 0.27720340661517134, + "grad_norm": 1.4732937295891226, + "learning_rate": 1.6962748055134283e-05, + "loss": 0.3877, + "step": 3499 + }, + { + "epoch": 0.27728263022380667, + "grad_norm": 1.8798186862878368, + "learning_rate": 1.696090599169924e-05, + "loss": 0.4363, + "step": 3500 + }, + { + "epoch": 0.27736185383244205, + "grad_norm": 1.768072113898181, + "learning_rate": 1.695906346992203e-05, + "loss": 0.405, + "step": 3501 + }, + { + "epoch": 0.27744107744107743, + "grad_norm": 1.5366420685646789, + "learning_rate": 1.6957220489923978e-05, + "loss": 0.334, + "step": 3502 + }, + { + "epoch": 0.2775203010497128, + "grad_norm": 1.467712076375398, + "learning_rate": 1.695537705182643e-05, + "loss": 0.2649, + "step": 3503 + }, + { + "epoch": 0.2775995246583482, + "grad_norm": 1.7573735086257902, + "learning_rate": 1.695353315575077e-05, + "loss": 0.3206, + "step": 3504 + }, + { + "epoch": 0.2776787482669836, + "grad_norm": 1.6932444911950197, + "learning_rate": 1.6951688801818413e-05, + "loss": 0.2739, + "step": 3505 + }, + { + "epoch": 0.27775797187561896, + "grad_norm": 1.8150111543318028, + "learning_rate": 1.6949843990150798e-05, + "loss": 0.4105, + "step": 3506 + }, + { + "epoch": 0.2778371954842543, + "grad_norm": 1.6464606181851476, + "learning_rate": 1.6947998720869394e-05, + "loss": 0.3744, + "step": 3507 + }, + { + "epoch": 0.27791641909288967, + "grad_norm": 1.47741556209451, + "learning_rate": 1.6946152994095705e-05, + "loss": 0.3519, + "step": 3508 + }, + { + "epoch": 0.27799564270152505, + "grad_norm": 1.466738540705516, + "learning_rate": 1.6944306809951264e-05, + "loss": 0.2829, + "step": 3509 + }, + { + "epoch": 0.27807486631016043, + "grad_norm": 1.6131810392094053, + "learning_rate": 1.694246016855764e-05, + "loss": 0.3526, + "step": 3510 + }, + { + "epoch": 0.2781540899187958, + "grad_norm": 1.4707793401323985, + "learning_rate": 1.694061307003641e-05, + "loss": 0.2998, + "step": 3511 + }, + { + "epoch": 0.2782333135274312, + "grad_norm": 1.6707136558403675, + "learning_rate": 1.693876551450921e-05, + "loss": 0.3658, + "step": 3512 + }, + { + "epoch": 0.2783125371360666, + "grad_norm": 1.646277141807071, + "learning_rate": 1.693691750209769e-05, + "loss": 0.3348, + "step": 3513 + }, + { + "epoch": 0.2783917607447019, + "grad_norm": 1.589557915750613, + "learning_rate": 1.6935069032923525e-05, + "loss": 0.4126, + "step": 3514 + }, + { + "epoch": 0.2784709843533373, + "grad_norm": 1.744988982843674, + "learning_rate": 1.6933220107108438e-05, + "loss": 0.5218, + "step": 3515 + }, + { + "epoch": 0.27855020796197266, + "grad_norm": 1.8068087976162832, + "learning_rate": 1.6931370724774166e-05, + "loss": 0.3489, + "step": 3516 + }, + { + "epoch": 0.27862943157060804, + "grad_norm": 1.601528820318888, + "learning_rate": 1.6929520886042486e-05, + "loss": 0.3275, + "step": 3517 + }, + { + "epoch": 0.2787086551792434, + "grad_norm": 1.5622175615918097, + "learning_rate": 1.6927670591035195e-05, + "loss": 0.4182, + "step": 3518 + }, + { + "epoch": 0.2787878787878788, + "grad_norm": 1.8951382408782513, + "learning_rate": 1.692581983987413e-05, + "loss": 0.3882, + "step": 3519 + }, + { + "epoch": 0.2788671023965142, + "grad_norm": 1.6295321212478375, + "learning_rate": 1.6923968632681155e-05, + "loss": 0.3463, + "step": 3520 + }, + { + "epoch": 0.2789463260051495, + "grad_norm": 1.4200048304005606, + "learning_rate": 1.6922116969578163e-05, + "loss": 0.2261, + "step": 3521 + }, + { + "epoch": 0.2790255496137849, + "grad_norm": 1.7766957545518904, + "learning_rate": 1.692026485068707e-05, + "loss": 0.4466, + "step": 3522 + }, + { + "epoch": 0.2791047732224203, + "grad_norm": 1.785747832773107, + "learning_rate": 1.6918412276129837e-05, + "loss": 0.4344, + "step": 3523 + }, + { + "epoch": 0.27918399683105566, + "grad_norm": 1.6077267271447753, + "learning_rate": 1.691655924602845e-05, + "loss": 0.3834, + "step": 3524 + }, + { + "epoch": 0.27926322043969104, + "grad_norm": 1.9594099423058808, + "learning_rate": 1.6914705760504913e-05, + "loss": 0.465, + "step": 3525 + }, + { + "epoch": 0.2793424440483264, + "grad_norm": 1.492125004500428, + "learning_rate": 1.6912851819681272e-05, + "loss": 0.266, + "step": 3526 + }, + { + "epoch": 0.27942166765696175, + "grad_norm": 1.8400968383905338, + "learning_rate": 1.69109974236796e-05, + "loss": 0.4018, + "step": 3527 + }, + { + "epoch": 0.27950089126559713, + "grad_norm": 1.526204915859762, + "learning_rate": 1.6909142572622003e-05, + "loss": 0.3215, + "step": 3528 + }, + { + "epoch": 0.2795801148742325, + "grad_norm": 1.7297234727761364, + "learning_rate": 1.6907287266630614e-05, + "loss": 0.3751, + "step": 3529 + }, + { + "epoch": 0.2796593384828679, + "grad_norm": 1.602004668749506, + "learning_rate": 1.6905431505827595e-05, + "loss": 0.3161, + "step": 3530 + }, + { + "epoch": 0.2797385620915033, + "grad_norm": 1.7382902472599635, + "learning_rate": 1.6903575290335136e-05, + "loss": 0.2935, + "step": 3531 + }, + { + "epoch": 0.27981778570013865, + "grad_norm": 1.4289480413399747, + "learning_rate": 1.690171862027546e-05, + "loss": 0.2746, + "step": 3532 + }, + { + "epoch": 0.27989700930877404, + "grad_norm": 1.7974312161166537, + "learning_rate": 1.6899861495770827e-05, + "loss": 0.4417, + "step": 3533 + }, + { + "epoch": 0.27997623291740936, + "grad_norm": 1.7742212172800194, + "learning_rate": 1.689800391694351e-05, + "loss": 0.2631, + "step": 3534 + }, + { + "epoch": 0.28005545652604474, + "grad_norm": 1.6929891102217385, + "learning_rate": 1.689614588391583e-05, + "loss": 0.4269, + "step": 3535 + }, + { + "epoch": 0.2801346801346801, + "grad_norm": 1.3175051341640325, + "learning_rate": 1.689428739681012e-05, + "loss": 0.3223, + "step": 3536 + }, + { + "epoch": 0.2802139037433155, + "grad_norm": 1.409101625512571, + "learning_rate": 1.6892428455748762e-05, + "loss": 0.2801, + "step": 3537 + }, + { + "epoch": 0.2802931273519509, + "grad_norm": 1.2630127014473755, + "learning_rate": 1.6890569060854156e-05, + "loss": 0.26, + "step": 3538 + }, + { + "epoch": 0.28037235096058627, + "grad_norm": 1.8192073478133002, + "learning_rate": 1.6888709212248728e-05, + "loss": 0.4691, + "step": 3539 + }, + { + "epoch": 0.28045157456922165, + "grad_norm": 1.9492757929953899, + "learning_rate": 1.6886848910054947e-05, + "loss": 0.5208, + "step": 3540 + }, + { + "epoch": 0.280530798177857, + "grad_norm": 1.7001053794245082, + "learning_rate": 1.6884988154395304e-05, + "loss": 0.3743, + "step": 3541 + }, + { + "epoch": 0.28061002178649236, + "grad_norm": 1.555818105861714, + "learning_rate": 1.688312694539232e-05, + "loss": 0.4427, + "step": 3542 + }, + { + "epoch": 0.28068924539512774, + "grad_norm": 1.7617632297817103, + "learning_rate": 1.6881265283168543e-05, + "loss": 0.4516, + "step": 3543 + }, + { + "epoch": 0.2807684690037631, + "grad_norm": 1.7065956849786186, + "learning_rate": 1.6879403167846556e-05, + "loss": 0.3538, + "step": 3544 + }, + { + "epoch": 0.2808476926123985, + "grad_norm": 1.592015712746443, + "learning_rate": 1.6877540599548977e-05, + "loss": 0.4562, + "step": 3545 + }, + { + "epoch": 0.2809269162210339, + "grad_norm": 1.7462708848976471, + "learning_rate": 1.6875677578398442e-05, + "loss": 0.4419, + "step": 3546 + }, + { + "epoch": 0.28100613982966927, + "grad_norm": 2.0023386855341196, + "learning_rate": 1.6873814104517617e-05, + "loss": 0.4469, + "step": 3547 + }, + { + "epoch": 0.2810853634383046, + "grad_norm": 1.6070230401212557, + "learning_rate": 1.6871950178029216e-05, + "loss": 0.3239, + "step": 3548 + }, + { + "epoch": 0.28116458704694, + "grad_norm": 1.5582492370040515, + "learning_rate": 1.6870085799055956e-05, + "loss": 0.3928, + "step": 3549 + }, + { + "epoch": 0.28124381065557535, + "grad_norm": 1.4845910874482904, + "learning_rate": 1.6868220967720604e-05, + "loss": 0.3363, + "step": 3550 + }, + { + "epoch": 0.28132303426421074, + "grad_norm": 1.7708579870049843, + "learning_rate": 1.686635568414595e-05, + "loss": 0.4809, + "step": 3551 + }, + { + "epoch": 0.2814022578728461, + "grad_norm": 1.5147265926372622, + "learning_rate": 1.686448994845481e-05, + "loss": 0.3501, + "step": 3552 + }, + { + "epoch": 0.2814814814814815, + "grad_norm": 1.9661251600762675, + "learning_rate": 1.6862623760770038e-05, + "loss": 0.5826, + "step": 3553 + }, + { + "epoch": 0.2815607050901169, + "grad_norm": 1.5511779143150988, + "learning_rate": 1.6860757121214513e-05, + "loss": 0.229, + "step": 3554 + }, + { + "epoch": 0.2816399286987522, + "grad_norm": 1.6877606595650976, + "learning_rate": 1.685889002991114e-05, + "loss": 0.477, + "step": 3555 + }, + { + "epoch": 0.2817191523073876, + "grad_norm": 1.550230499538908, + "learning_rate": 1.6857022486982865e-05, + "loss": 0.3786, + "step": 3556 + }, + { + "epoch": 0.28179837591602297, + "grad_norm": 1.767368902490723, + "learning_rate": 1.6855154492552656e-05, + "loss": 0.3383, + "step": 3557 + }, + { + "epoch": 0.28187759952465835, + "grad_norm": 1.465005676072625, + "learning_rate": 1.6853286046743505e-05, + "loss": 0.3332, + "step": 3558 + }, + { + "epoch": 0.28195682313329373, + "grad_norm": 1.6038488986910835, + "learning_rate": 1.6851417149678442e-05, + "loss": 0.3883, + "step": 3559 + }, + { + "epoch": 0.2820360467419291, + "grad_norm": 1.636018145618707, + "learning_rate": 1.684954780148053e-05, + "loss": 0.3857, + "step": 3560 + }, + { + "epoch": 0.2821152703505645, + "grad_norm": 1.5417685126993563, + "learning_rate": 1.684767800227285e-05, + "loss": 0.3102, + "step": 3561 + }, + { + "epoch": 0.2821944939591998, + "grad_norm": 1.5205095980407326, + "learning_rate": 1.6845807752178528e-05, + "loss": 0.361, + "step": 3562 + }, + { + "epoch": 0.2822737175678352, + "grad_norm": 1.5356691480212799, + "learning_rate": 1.68439370513207e-05, + "loss": 0.2851, + "step": 3563 + }, + { + "epoch": 0.2823529411764706, + "grad_norm": 1.7413950399991796, + "learning_rate": 1.6842065899822548e-05, + "loss": 0.3764, + "step": 3564 + }, + { + "epoch": 0.28243216478510597, + "grad_norm": 1.9135949586534597, + "learning_rate": 1.6840194297807283e-05, + "loss": 0.4358, + "step": 3565 + }, + { + "epoch": 0.28251138839374135, + "grad_norm": 1.9011800764645568, + "learning_rate": 1.6838322245398135e-05, + "loss": 0.3765, + "step": 3566 + }, + { + "epoch": 0.28259061200237673, + "grad_norm": 1.6927354842088744, + "learning_rate": 1.6836449742718367e-05, + "loss": 0.3494, + "step": 3567 + }, + { + "epoch": 0.28266983561101205, + "grad_norm": 1.7830644087708565, + "learning_rate": 1.6834576789891282e-05, + "loss": 0.4242, + "step": 3568 + }, + { + "epoch": 0.28274905921964744, + "grad_norm": 1.8936036889716845, + "learning_rate": 1.68327033870402e-05, + "loss": 0.4626, + "step": 3569 + }, + { + "epoch": 0.2828282828282828, + "grad_norm": 1.5418801057911977, + "learning_rate": 1.6830829534288475e-05, + "loss": 0.3039, + "step": 3570 + }, + { + "epoch": 0.2829075064369182, + "grad_norm": 1.9611365615816758, + "learning_rate": 1.6828955231759495e-05, + "loss": 0.3371, + "step": 3571 + }, + { + "epoch": 0.2829867300455536, + "grad_norm": 1.54659309175576, + "learning_rate": 1.682708047957667e-05, + "loss": 0.3619, + "step": 3572 + }, + { + "epoch": 0.28306595365418896, + "grad_norm": 1.86565103419617, + "learning_rate": 1.682520527786345e-05, + "loss": 0.4165, + "step": 3573 + }, + { + "epoch": 0.28314517726282434, + "grad_norm": 1.6522161904280512, + "learning_rate": 1.6823329626743298e-05, + "loss": 0.2955, + "step": 3574 + }, + { + "epoch": 0.28322440087145967, + "grad_norm": 1.4667178434231913, + "learning_rate": 1.6821453526339727e-05, + "loss": 0.3278, + "step": 3575 + }, + { + "epoch": 0.28330362448009505, + "grad_norm": 1.838070548155054, + "learning_rate": 1.6819576976776262e-05, + "loss": 0.3991, + "step": 3576 + }, + { + "epoch": 0.28338284808873043, + "grad_norm": 1.8244461812184616, + "learning_rate": 1.6817699978176464e-05, + "loss": 0.4738, + "step": 3577 + }, + { + "epoch": 0.2834620716973658, + "grad_norm": 1.759398180587922, + "learning_rate": 1.681582253066393e-05, + "loss": 0.3788, + "step": 3578 + }, + { + "epoch": 0.2835412953060012, + "grad_norm": 1.4334897523217673, + "learning_rate": 1.681394463436228e-05, + "loss": 0.2866, + "step": 3579 + }, + { + "epoch": 0.2836205189146366, + "grad_norm": 1.7906084844323114, + "learning_rate": 1.6812066289395157e-05, + "loss": 0.5063, + "step": 3580 + }, + { + "epoch": 0.28369974252327196, + "grad_norm": 1.6472053512655485, + "learning_rate": 1.681018749588625e-05, + "loss": 0.3431, + "step": 3581 + }, + { + "epoch": 0.2837789661319073, + "grad_norm": 1.5857126964109334, + "learning_rate": 1.6808308253959263e-05, + "loss": 0.3562, + "step": 3582 + }, + { + "epoch": 0.28385818974054267, + "grad_norm": 1.5514486881035434, + "learning_rate": 1.680642856373794e-05, + "loss": 0.4192, + "step": 3583 + }, + { + "epoch": 0.28393741334917805, + "grad_norm": 1.6444978527983385, + "learning_rate": 1.680454842534604e-05, + "loss": 0.3861, + "step": 3584 + }, + { + "epoch": 0.28401663695781343, + "grad_norm": 1.7347252108863274, + "learning_rate": 1.6802667838907374e-05, + "loss": 0.3744, + "step": 3585 + }, + { + "epoch": 0.2840958605664488, + "grad_norm": 1.7459594993989513, + "learning_rate": 1.680078680454576e-05, + "loss": 0.4487, + "step": 3586 + }, + { + "epoch": 0.2841750841750842, + "grad_norm": 1.5893724205079869, + "learning_rate": 1.6798905322385063e-05, + "loss": 0.3463, + "step": 3587 + }, + { + "epoch": 0.2842543077837196, + "grad_norm": 1.9394135541698432, + "learning_rate": 1.6797023392549157e-05, + "loss": 0.4756, + "step": 3588 + }, + { + "epoch": 0.2843335313923549, + "grad_norm": 1.6215229282396264, + "learning_rate": 1.679514101516197e-05, + "loss": 0.3123, + "step": 3589 + }, + { + "epoch": 0.2844127550009903, + "grad_norm": 1.5051473318343307, + "learning_rate": 1.6793258190347445e-05, + "loss": 0.3647, + "step": 3590 + }, + { + "epoch": 0.28449197860962566, + "grad_norm": 1.6319692724433477, + "learning_rate": 1.679137491822955e-05, + "loss": 0.3785, + "step": 3591 + }, + { + "epoch": 0.28457120221826104, + "grad_norm": 1.2785138619559915, + "learning_rate": 1.6789491198932302e-05, + "loss": 0.3282, + "step": 3592 + }, + { + "epoch": 0.2846504258268964, + "grad_norm": 1.5242877948579932, + "learning_rate": 1.6787607032579724e-05, + "loss": 0.355, + "step": 3593 + }, + { + "epoch": 0.2847296494355318, + "grad_norm": 1.4239024085664382, + "learning_rate": 1.678572241929588e-05, + "loss": 0.3155, + "step": 3594 + }, + { + "epoch": 0.2848088730441672, + "grad_norm": 1.7002248937531939, + "learning_rate": 1.6783837359204868e-05, + "loss": 0.4815, + "step": 3595 + }, + { + "epoch": 0.2848880966528025, + "grad_norm": 1.8248986115549153, + "learning_rate": 1.6781951852430813e-05, + "loss": 0.3612, + "step": 3596 + }, + { + "epoch": 0.2849673202614379, + "grad_norm": 1.3955557988943879, + "learning_rate": 1.6780065899097853e-05, + "loss": 0.2804, + "step": 3597 + }, + { + "epoch": 0.2850465438700733, + "grad_norm": 1.6120218485253899, + "learning_rate": 1.677817949933018e-05, + "loss": 0.3879, + "step": 3598 + }, + { + "epoch": 0.28512576747870866, + "grad_norm": 1.4285686904092474, + "learning_rate": 1.6776292653252e-05, + "loss": 0.2915, + "step": 3599 + }, + { + "epoch": 0.28520499108734404, + "grad_norm": 1.9313834081163612, + "learning_rate": 1.6774405360987556e-05, + "loss": 0.4056, + "step": 3600 + }, + { + "epoch": 0.2852842146959794, + "grad_norm": 1.496380441023455, + "learning_rate": 1.6772517622661115e-05, + "loss": 0.2844, + "step": 3601 + }, + { + "epoch": 0.2853634383046148, + "grad_norm": 1.4792953236678106, + "learning_rate": 1.6770629438396973e-05, + "loss": 0.3116, + "step": 3602 + }, + { + "epoch": 0.28544266191325013, + "grad_norm": 1.5181475927751238, + "learning_rate": 1.676874080831947e-05, + "loss": 0.4181, + "step": 3603 + }, + { + "epoch": 0.2855218855218855, + "grad_norm": 1.4877365513085468, + "learning_rate": 1.676685173255294e-05, + "loss": 0.342, + "step": 3604 + }, + { + "epoch": 0.2856011091305209, + "grad_norm": 1.8036031255484326, + "learning_rate": 1.6764962211221796e-05, + "loss": 0.4385, + "step": 3605 + }, + { + "epoch": 0.2856803327391563, + "grad_norm": 1.486537671144364, + "learning_rate": 1.6763072244450435e-05, + "loss": 0.3813, + "step": 3606 + }, + { + "epoch": 0.28575955634779165, + "grad_norm": 1.4513846095705085, + "learning_rate": 1.676118183236331e-05, + "loss": 0.2682, + "step": 3607 + }, + { + "epoch": 0.28583877995642704, + "grad_norm": 1.550222438443536, + "learning_rate": 1.6759290975084894e-05, + "loss": 0.301, + "step": 3608 + }, + { + "epoch": 0.28591800356506236, + "grad_norm": 1.3022403793706947, + "learning_rate": 1.675739967273969e-05, + "loss": 0.2648, + "step": 3609 + }, + { + "epoch": 0.28599722717369774, + "grad_norm": 1.6172990474791722, + "learning_rate": 1.675550792545223e-05, + "loss": 0.4253, + "step": 3610 + }, + { + "epoch": 0.2860764507823331, + "grad_norm": 1.5438222467284817, + "learning_rate": 1.6753615733347085e-05, + "loss": 0.4286, + "step": 3611 + }, + { + "epoch": 0.2861556743909685, + "grad_norm": 1.8211315822390446, + "learning_rate": 1.6751723096548834e-05, + "loss": 0.4643, + "step": 3612 + }, + { + "epoch": 0.2862348979996039, + "grad_norm": 1.4013422933530293, + "learning_rate": 1.6749830015182106e-05, + "loss": 0.297, + "step": 3613 + }, + { + "epoch": 0.28631412160823927, + "grad_norm": 1.7063383931892007, + "learning_rate": 1.6747936489371552e-05, + "loss": 0.4357, + "step": 3614 + }, + { + "epoch": 0.28639334521687465, + "grad_norm": 1.5081876713250542, + "learning_rate": 1.674604251924185e-05, + "loss": 0.3652, + "step": 3615 + }, + { + "epoch": 0.28647256882551, + "grad_norm": 1.591421911800057, + "learning_rate": 1.6744148104917705e-05, + "loss": 0.4056, + "step": 3616 + }, + { + "epoch": 0.28655179243414536, + "grad_norm": 1.480660887258956, + "learning_rate": 1.6742253246523856e-05, + "loss": 0.3463, + "step": 3617 + }, + { + "epoch": 0.28663101604278074, + "grad_norm": 1.3844072874530815, + "learning_rate": 1.6740357944185074e-05, + "loss": 0.3125, + "step": 3618 + }, + { + "epoch": 0.2867102396514161, + "grad_norm": 1.180234939996569, + "learning_rate": 1.6738462198026154e-05, + "loss": 0.3364, + "step": 3619 + }, + { + "epoch": 0.2867894632600515, + "grad_norm": 1.3599034163326482, + "learning_rate": 1.6736566008171925e-05, + "loss": 0.3164, + "step": 3620 + }, + { + "epoch": 0.2868686868686869, + "grad_norm": 1.8749851622283111, + "learning_rate": 1.6734669374747237e-05, + "loss": 0.4001, + "step": 3621 + }, + { + "epoch": 0.28694791047732227, + "grad_norm": 1.5930901287496209, + "learning_rate": 1.6732772297876975e-05, + "loss": 0.3607, + "step": 3622 + }, + { + "epoch": 0.2870271340859576, + "grad_norm": 1.3820294439640533, + "learning_rate": 1.6730874777686053e-05, + "loss": 0.331, + "step": 3623 + }, + { + "epoch": 0.287106357694593, + "grad_norm": 1.3583426516974253, + "learning_rate": 1.6728976814299413e-05, + "loss": 0.3344, + "step": 3624 + }, + { + "epoch": 0.28718558130322835, + "grad_norm": 1.575334033532164, + "learning_rate": 1.6727078407842028e-05, + "loss": 0.3369, + "step": 3625 + }, + { + "epoch": 0.28726480491186374, + "grad_norm": 1.9255241823776548, + "learning_rate": 1.67251795584389e-05, + "loss": 0.3792, + "step": 3626 + }, + { + "epoch": 0.2873440285204991, + "grad_norm": 1.5721585979541202, + "learning_rate": 1.6723280266215057e-05, + "loss": 0.3417, + "step": 3627 + }, + { + "epoch": 0.2874232521291345, + "grad_norm": 1.3321013125151693, + "learning_rate": 1.672138053129556e-05, + "loss": 0.336, + "step": 3628 + }, + { + "epoch": 0.2875024757377699, + "grad_norm": 1.5977789410414887, + "learning_rate": 1.6719480353805493e-05, + "loss": 0.3966, + "step": 3629 + }, + { + "epoch": 0.2875816993464052, + "grad_norm": 1.7722218042203264, + "learning_rate": 1.671757973386998e-05, + "loss": 0.3594, + "step": 3630 + }, + { + "epoch": 0.2876609229550406, + "grad_norm": 2.4183955376847033, + "learning_rate": 1.6715678671614162e-05, + "loss": 0.4452, + "step": 3631 + }, + { + "epoch": 0.28774014656367597, + "grad_norm": 1.544197375645035, + "learning_rate": 1.6713777167163215e-05, + "loss": 0.3337, + "step": 3632 + }, + { + "epoch": 0.28781937017231135, + "grad_norm": 1.775270778435445, + "learning_rate": 1.6711875220642352e-05, + "loss": 0.4508, + "step": 3633 + }, + { + "epoch": 0.28789859378094673, + "grad_norm": 1.4933435345664674, + "learning_rate": 1.6709972832176797e-05, + "loss": 0.322, + "step": 3634 + }, + { + "epoch": 0.2879778173895821, + "grad_norm": 1.8384884621027973, + "learning_rate": 1.670807000189182e-05, + "loss": 0.3989, + "step": 3635 + }, + { + "epoch": 0.2880570409982175, + "grad_norm": 1.6849671471603531, + "learning_rate": 1.6706166729912712e-05, + "loss": 0.3266, + "step": 3636 + }, + { + "epoch": 0.2881362646068528, + "grad_norm": 1.833308292889901, + "learning_rate": 1.670426301636479e-05, + "loss": 0.3839, + "step": 3637 + }, + { + "epoch": 0.2882154882154882, + "grad_norm": 1.5373957630120187, + "learning_rate": 1.6702358861373408e-05, + "loss": 0.3622, + "step": 3638 + }, + { + "epoch": 0.2882947118241236, + "grad_norm": 1.552858478187155, + "learning_rate": 1.6700454265063943e-05, + "loss": 0.349, + "step": 3639 + }, + { + "epoch": 0.28837393543275897, + "grad_norm": 1.6062859579383988, + "learning_rate": 1.6698549227561805e-05, + "loss": 0.3414, + "step": 3640 + }, + { + "epoch": 0.28845315904139435, + "grad_norm": 1.7176547535061093, + "learning_rate": 1.6696643748992434e-05, + "loss": 0.3501, + "step": 3641 + }, + { + "epoch": 0.28853238265002973, + "grad_norm": 1.5550389555711928, + "learning_rate": 1.6694737829481292e-05, + "loss": 0.3445, + "step": 3642 + }, + { + "epoch": 0.2886116062586651, + "grad_norm": 1.7039094289641743, + "learning_rate": 1.669283146915388e-05, + "loss": 0.3559, + "step": 3643 + }, + { + "epoch": 0.28869082986730044, + "grad_norm": 1.5778669974174928, + "learning_rate": 1.6690924668135718e-05, + "loss": 0.3288, + "step": 3644 + }, + { + "epoch": 0.2887700534759358, + "grad_norm": 1.462204256319542, + "learning_rate": 1.668901742655236e-05, + "loss": 0.3299, + "step": 3645 + }, + { + "epoch": 0.2888492770845712, + "grad_norm": 1.9797578307548085, + "learning_rate": 1.6687109744529394e-05, + "loss": 0.3228, + "step": 3646 + }, + { + "epoch": 0.2889285006932066, + "grad_norm": 1.8619979227611663, + "learning_rate": 1.6685201622192422e-05, + "loss": 0.3885, + "step": 3647 + }, + { + "epoch": 0.28900772430184196, + "grad_norm": 1.7633755501190216, + "learning_rate": 1.6683293059667096e-05, + "loss": 0.3463, + "step": 3648 + }, + { + "epoch": 0.28908694791047734, + "grad_norm": 1.7407588148043072, + "learning_rate": 1.6681384057079076e-05, + "loss": 0.365, + "step": 3649 + }, + { + "epoch": 0.28916617151911267, + "grad_norm": 1.63931320851029, + "learning_rate": 1.6679474614554066e-05, + "loss": 0.3673, + "step": 3650 + }, + { + "epoch": 0.28924539512774805, + "grad_norm": 1.701622710902603, + "learning_rate": 1.667756473221779e-05, + "loss": 0.3408, + "step": 3651 + }, + { + "epoch": 0.28932461873638343, + "grad_norm": 1.7940237286487946, + "learning_rate": 1.667565441019601e-05, + "loss": 0.4036, + "step": 3652 + }, + { + "epoch": 0.2894038423450188, + "grad_norm": 1.3365666457609597, + "learning_rate": 1.6673743648614507e-05, + "loss": 0.2774, + "step": 3653 + }, + { + "epoch": 0.2894830659536542, + "grad_norm": 1.617247333049898, + "learning_rate": 1.66718324475991e-05, + "loss": 0.3726, + "step": 3654 + }, + { + "epoch": 0.2895622895622896, + "grad_norm": 1.801136603704768, + "learning_rate": 1.6669920807275622e-05, + "loss": 0.4702, + "step": 3655 + }, + { + "epoch": 0.28964151317092496, + "grad_norm": 1.6496845076227367, + "learning_rate": 1.666800872776996e-05, + "loss": 0.4065, + "step": 3656 + }, + { + "epoch": 0.2897207367795603, + "grad_norm": 1.6073287286430564, + "learning_rate": 1.6666096209208e-05, + "loss": 0.3445, + "step": 3657 + }, + { + "epoch": 0.28979996038819567, + "grad_norm": 2.0635826256773475, + "learning_rate": 1.6664183251715687e-05, + "loss": 0.4878, + "step": 3658 + }, + { + "epoch": 0.28987918399683105, + "grad_norm": 1.5853944059926224, + "learning_rate": 1.666226985541897e-05, + "loss": 0.3283, + "step": 3659 + }, + { + "epoch": 0.28995840760546643, + "grad_norm": 1.5294033148153279, + "learning_rate": 1.666035602044384e-05, + "loss": 0.2541, + "step": 3660 + }, + { + "epoch": 0.2900376312141018, + "grad_norm": 1.553102308427671, + "learning_rate": 1.665844174691631e-05, + "loss": 0.3476, + "step": 3661 + }, + { + "epoch": 0.2901168548227372, + "grad_norm": 1.599079528408568, + "learning_rate": 1.6656527034962433e-05, + "loss": 0.4552, + "step": 3662 + }, + { + "epoch": 0.2901960784313726, + "grad_norm": 1.3978944067426902, + "learning_rate": 1.665461188470828e-05, + "loss": 0.2751, + "step": 3663 + }, + { + "epoch": 0.2902753020400079, + "grad_norm": 1.7886649165700803, + "learning_rate": 1.6652696296279954e-05, + "loss": 0.3276, + "step": 3664 + }, + { + "epoch": 0.2903545256486433, + "grad_norm": 1.4567574127010015, + "learning_rate": 1.6650780269803587e-05, + "loss": 0.3033, + "step": 3665 + }, + { + "epoch": 0.29043374925727866, + "grad_norm": 1.7960727770474372, + "learning_rate": 1.664886380540534e-05, + "loss": 0.3951, + "step": 3666 + }, + { + "epoch": 0.29051297286591404, + "grad_norm": 1.4746401531627715, + "learning_rate": 1.664694690321141e-05, + "loss": 0.3797, + "step": 3667 + }, + { + "epoch": 0.2905921964745494, + "grad_norm": 1.8651030781180145, + "learning_rate": 1.6645029563348e-05, + "loss": 0.3675, + "step": 3668 + }, + { + "epoch": 0.2906714200831848, + "grad_norm": 1.544899839721725, + "learning_rate": 1.6643111785941374e-05, + "loss": 0.3284, + "step": 3669 + }, + { + "epoch": 0.2907506436918202, + "grad_norm": 1.6870378337498664, + "learning_rate": 1.66411935711178e-05, + "loss": 0.3742, + "step": 3670 + }, + { + "epoch": 0.2908298673004555, + "grad_norm": 1.921389935370315, + "learning_rate": 1.6639274919003582e-05, + "loss": 0.3077, + "step": 3671 + }, + { + "epoch": 0.2909090909090909, + "grad_norm": 1.5554839911492166, + "learning_rate": 1.6637355829725057e-05, + "loss": 0.3984, + "step": 3672 + }, + { + "epoch": 0.2909883145177263, + "grad_norm": 1.462193690870757, + "learning_rate": 1.663543630340859e-05, + "loss": 0.2794, + "step": 3673 + }, + { + "epoch": 0.29106753812636166, + "grad_norm": 1.7819231651448524, + "learning_rate": 1.6633516340180568e-05, + "loss": 0.4415, + "step": 3674 + }, + { + "epoch": 0.29114676173499704, + "grad_norm": 1.477082577320874, + "learning_rate": 1.6631595940167416e-05, + "loss": 0.3798, + "step": 3675 + }, + { + "epoch": 0.2912259853436324, + "grad_norm": 1.7948912984018348, + "learning_rate": 1.662967510349558e-05, + "loss": 0.4033, + "step": 3676 + }, + { + "epoch": 0.2913052089522678, + "grad_norm": 1.622229483769328, + "learning_rate": 1.6627753830291536e-05, + "loss": 0.3754, + "step": 3677 + }, + { + "epoch": 0.29138443256090313, + "grad_norm": 1.475226981239045, + "learning_rate": 1.6625832120681795e-05, + "loss": 0.389, + "step": 3678 + }, + { + "epoch": 0.2914636561695385, + "grad_norm": 1.4448973930137625, + "learning_rate": 1.6623909974792888e-05, + "loss": 0.3615, + "step": 3679 + }, + { + "epoch": 0.2915428797781739, + "grad_norm": 1.6242515508177402, + "learning_rate": 1.6621987392751385e-05, + "loss": 0.37, + "step": 3680 + }, + { + "epoch": 0.2916221033868093, + "grad_norm": 1.4842416611925109, + "learning_rate": 1.6620064374683874e-05, + "loss": 0.2904, + "step": 3681 + }, + { + "epoch": 0.29170132699544465, + "grad_norm": 1.8226877878692371, + "learning_rate": 1.6618140920716976e-05, + "loss": 0.3205, + "step": 3682 + }, + { + "epoch": 0.29178055060408004, + "grad_norm": 1.5582042564604095, + "learning_rate": 1.6616217030977345e-05, + "loss": 0.3537, + "step": 3683 + }, + { + "epoch": 0.29185977421271536, + "grad_norm": 1.8693609962015734, + "learning_rate": 1.6614292705591658e-05, + "loss": 0.3738, + "step": 3684 + }, + { + "epoch": 0.29193899782135074, + "grad_norm": 1.4929929643184192, + "learning_rate": 1.6612367944686617e-05, + "loss": 0.2502, + "step": 3685 + }, + { + "epoch": 0.2920182214299861, + "grad_norm": 1.5231909340028893, + "learning_rate": 1.6610442748388972e-05, + "loss": 0.3516, + "step": 3686 + }, + { + "epoch": 0.2920974450386215, + "grad_norm": 1.807927774894062, + "learning_rate": 1.6608517116825473e-05, + "loss": 0.305, + "step": 3687 + }, + { + "epoch": 0.2921766686472569, + "grad_norm": 1.7156559061161423, + "learning_rate": 1.6606591050122924e-05, + "loss": 0.3707, + "step": 3688 + }, + { + "epoch": 0.29225589225589227, + "grad_norm": 1.7756116117316123, + "learning_rate": 1.660466454840814e-05, + "loss": 0.3398, + "step": 3689 + }, + { + "epoch": 0.29233511586452765, + "grad_norm": 1.4715483555346898, + "learning_rate": 1.6602737611807975e-05, + "loss": 0.2963, + "step": 3690 + }, + { + "epoch": 0.292414339473163, + "grad_norm": 1.3492264844913355, + "learning_rate": 1.660081024044931e-05, + "loss": 0.3042, + "step": 3691 + }, + { + "epoch": 0.29249356308179836, + "grad_norm": 1.5876136880426355, + "learning_rate": 1.659888243445905e-05, + "loss": 0.4111, + "step": 3692 + }, + { + "epoch": 0.29257278669043374, + "grad_norm": 1.6843865515874483, + "learning_rate": 1.6596954193964136e-05, + "loss": 0.347, + "step": 3693 + }, + { + "epoch": 0.2926520102990691, + "grad_norm": 1.6159047178595056, + "learning_rate": 1.659502551909153e-05, + "loss": 0.2566, + "step": 3694 + }, + { + "epoch": 0.2927312339077045, + "grad_norm": 1.6326633787297915, + "learning_rate": 1.6593096409968227e-05, + "loss": 0.4207, + "step": 3695 + }, + { + "epoch": 0.2928104575163399, + "grad_norm": 1.6312351788466415, + "learning_rate": 1.6591166866721247e-05, + "loss": 0.2902, + "step": 3696 + }, + { + "epoch": 0.29288968112497527, + "grad_norm": 1.6743139956406583, + "learning_rate": 1.658923688947765e-05, + "loss": 0.4262, + "step": 3697 + }, + { + "epoch": 0.2929689047336106, + "grad_norm": 1.594665167211578, + "learning_rate": 1.6587306478364502e-05, + "loss": 0.3031, + "step": 3698 + }, + { + "epoch": 0.293048128342246, + "grad_norm": 1.8226401098855778, + "learning_rate": 1.658537563350892e-05, + "loss": 0.3209, + "step": 3699 + }, + { + "epoch": 0.29312735195088135, + "grad_norm": 1.9183583484565196, + "learning_rate": 1.6583444355038042e-05, + "loss": 0.4386, + "step": 3700 + }, + { + "epoch": 0.29320657555951674, + "grad_norm": 1.2217138579713387, + "learning_rate": 1.6581512643079028e-05, + "loss": 0.2286, + "step": 3701 + }, + { + "epoch": 0.2932857991681521, + "grad_norm": 1.7245372312858804, + "learning_rate": 1.657958049775908e-05, + "loss": 0.2874, + "step": 3702 + }, + { + "epoch": 0.2933650227767875, + "grad_norm": 1.5545081582187148, + "learning_rate": 1.6577647919205407e-05, + "loss": 0.2811, + "step": 3703 + }, + { + "epoch": 0.2934442463854229, + "grad_norm": 1.7646696527010508, + "learning_rate": 1.6575714907545272e-05, + "loss": 0.3848, + "step": 3704 + }, + { + "epoch": 0.2935234699940582, + "grad_norm": 1.4763703725637947, + "learning_rate": 1.6573781462905954e-05, + "loss": 0.269, + "step": 3705 + }, + { + "epoch": 0.2936026936026936, + "grad_norm": 1.466020055182049, + "learning_rate": 1.6571847585414754e-05, + "loss": 0.3398, + "step": 3706 + }, + { + "epoch": 0.29368191721132897, + "grad_norm": 1.6648461105635022, + "learning_rate": 1.6569913275199013e-05, + "loss": 0.4153, + "step": 3707 + }, + { + "epoch": 0.29376114081996435, + "grad_norm": 1.6762880173918289, + "learning_rate": 1.6567978532386094e-05, + "loss": 0.3389, + "step": 3708 + }, + { + "epoch": 0.29384036442859973, + "grad_norm": 1.4084573794364987, + "learning_rate": 1.6566043357103393e-05, + "loss": 0.3095, + "step": 3709 + }, + { + "epoch": 0.2939195880372351, + "grad_norm": 1.5264587159337366, + "learning_rate": 1.656410774947833e-05, + "loss": 0.3541, + "step": 3710 + }, + { + "epoch": 0.2939988116458705, + "grad_norm": 1.7712955241935733, + "learning_rate": 1.6562171709638355e-05, + "loss": 0.4035, + "step": 3711 + }, + { + "epoch": 0.2940780352545058, + "grad_norm": 1.650402565242953, + "learning_rate": 1.656023523771095e-05, + "loss": 0.3859, + "step": 3712 + }, + { + "epoch": 0.2941572588631412, + "grad_norm": 1.457668188975639, + "learning_rate": 1.655829833382362e-05, + "loss": 0.3065, + "step": 3713 + }, + { + "epoch": 0.2942364824717766, + "grad_norm": 1.7683403141845395, + "learning_rate": 1.6556360998103903e-05, + "loss": 0.4861, + "step": 3714 + }, + { + "epoch": 0.29431570608041197, + "grad_norm": 1.3387389380927102, + "learning_rate": 1.655442323067936e-05, + "loss": 0.3494, + "step": 3715 + }, + { + "epoch": 0.29439492968904735, + "grad_norm": 1.5460035560143075, + "learning_rate": 1.6552485031677586e-05, + "loss": 0.3607, + "step": 3716 + }, + { + "epoch": 0.29447415329768273, + "grad_norm": 1.5644628630902204, + "learning_rate": 1.65505464012262e-05, + "loss": 0.3323, + "step": 3717 + }, + { + "epoch": 0.2945533769063181, + "grad_norm": 1.5263992181145956, + "learning_rate": 1.6548607339452853e-05, + "loss": 0.2947, + "step": 3718 + }, + { + "epoch": 0.29463260051495344, + "grad_norm": 1.680623589400432, + "learning_rate": 1.6546667846485224e-05, + "loss": 0.3836, + "step": 3719 + }, + { + "epoch": 0.2947118241235888, + "grad_norm": 1.5645426107179712, + "learning_rate": 1.6544727922451014e-05, + "loss": 0.3296, + "step": 3720 + }, + { + "epoch": 0.2947910477322242, + "grad_norm": 1.5798023138428012, + "learning_rate": 1.654278756747796e-05, + "loss": 0.3759, + "step": 3721 + }, + { + "epoch": 0.2948702713408596, + "grad_norm": 1.6448449681708475, + "learning_rate": 1.6540846781693837e-05, + "loss": 0.3258, + "step": 3722 + }, + { + "epoch": 0.29494949494949496, + "grad_norm": 1.692579158191838, + "learning_rate": 1.6538905565226416e-05, + "loss": 0.3276, + "step": 3723 + }, + { + "epoch": 0.29502871855813034, + "grad_norm": 1.157165029471748, + "learning_rate": 1.6536963918203532e-05, + "loss": 0.3096, + "step": 3724 + }, + { + "epoch": 0.29510794216676567, + "grad_norm": 1.5351515561844389, + "learning_rate": 1.6535021840753026e-05, + "loss": 0.3423, + "step": 3725 + }, + { + "epoch": 0.29518716577540105, + "grad_norm": 1.626572147130463, + "learning_rate": 1.6533079333002775e-05, + "loss": 0.45, + "step": 3726 + }, + { + "epoch": 0.29526638938403643, + "grad_norm": 1.4759940209148366, + "learning_rate": 1.6531136395080687e-05, + "loss": 0.3755, + "step": 3727 + }, + { + "epoch": 0.2953456129926718, + "grad_norm": 1.6047735388780962, + "learning_rate": 1.6529193027114692e-05, + "loss": 0.4001, + "step": 3728 + }, + { + "epoch": 0.2954248366013072, + "grad_norm": 1.6936689428996885, + "learning_rate": 1.6527249229232754e-05, + "loss": 0.4437, + "step": 3729 + }, + { + "epoch": 0.2955040602099426, + "grad_norm": 1.5483223620704214, + "learning_rate": 1.652530500156286e-05, + "loss": 0.2854, + "step": 3730 + }, + { + "epoch": 0.29558328381857796, + "grad_norm": 1.9279758485809468, + "learning_rate": 1.652336034423303e-05, + "loss": 0.4761, + "step": 3731 + }, + { + "epoch": 0.2956625074272133, + "grad_norm": 1.3885258750564502, + "learning_rate": 1.6521415257371312e-05, + "loss": 0.2676, + "step": 3732 + }, + { + "epoch": 0.29574173103584867, + "grad_norm": 2.0173393527273866, + "learning_rate": 1.6519469741105777e-05, + "loss": 0.4428, + "step": 3733 + }, + { + "epoch": 0.29582095464448405, + "grad_norm": 1.2769367437849652, + "learning_rate": 1.6517523795564527e-05, + "loss": 0.3903, + "step": 3734 + }, + { + "epoch": 0.29590017825311943, + "grad_norm": 1.4168831246146063, + "learning_rate": 1.6515577420875698e-05, + "loss": 0.2817, + "step": 3735 + }, + { + "epoch": 0.2959794018617548, + "grad_norm": 1.4590632301807038, + "learning_rate": 1.6513630617167446e-05, + "loss": 0.4161, + "step": 3736 + }, + { + "epoch": 0.2960586254703902, + "grad_norm": 1.4957658708651385, + "learning_rate": 1.6511683384567957e-05, + "loss": 0.3314, + "step": 3737 + }, + { + "epoch": 0.2961378490790256, + "grad_norm": 1.6931108922356175, + "learning_rate": 1.6509735723205453e-05, + "loss": 0.3407, + "step": 3738 + }, + { + "epoch": 0.2962170726876609, + "grad_norm": 1.9257459743957412, + "learning_rate": 1.6507787633208173e-05, + "loss": 0.4885, + "step": 3739 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 1.4618563336714918, + "learning_rate": 1.650583911470439e-05, + "loss": 0.2855, + "step": 3740 + }, + { + "epoch": 0.29637551990493166, + "grad_norm": 1.578070839901341, + "learning_rate": 1.6503890167822406e-05, + "loss": 0.4354, + "step": 3741 + }, + { + "epoch": 0.29645474351356704, + "grad_norm": 1.625889368351407, + "learning_rate": 1.6501940792690547e-05, + "loss": 0.3298, + "step": 3742 + }, + { + "epoch": 0.2965339671222024, + "grad_norm": 2.0722239035668673, + "learning_rate": 1.6499990989437177e-05, + "loss": 0.4087, + "step": 3743 + }, + { + "epoch": 0.2966131907308378, + "grad_norm": 1.7203826844700671, + "learning_rate": 1.6498040758190673e-05, + "loss": 0.3445, + "step": 3744 + }, + { + "epoch": 0.2966924143394732, + "grad_norm": 1.5562219947285774, + "learning_rate": 1.6496090099079452e-05, + "loss": 0.3462, + "step": 3745 + }, + { + "epoch": 0.2967716379481085, + "grad_norm": 1.5974700535982191, + "learning_rate": 1.6494139012231954e-05, + "loss": 0.2918, + "step": 3746 + }, + { + "epoch": 0.2968508615567439, + "grad_norm": 1.8451398968677453, + "learning_rate": 1.6492187497776654e-05, + "loss": 0.4844, + "step": 3747 + }, + { + "epoch": 0.2969300851653793, + "grad_norm": 1.3668940341916167, + "learning_rate": 1.6490235555842044e-05, + "loss": 0.286, + "step": 3748 + }, + { + "epoch": 0.29700930877401466, + "grad_norm": 1.6436202528195933, + "learning_rate": 1.6488283186556648e-05, + "loss": 0.3584, + "step": 3749 + }, + { + "epoch": 0.29708853238265004, + "grad_norm": 1.4993676939339717, + "learning_rate": 1.6486330390049027e-05, + "loss": 0.465, + "step": 3750 + }, + { + "epoch": 0.2971677559912854, + "grad_norm": 1.7093675941475246, + "learning_rate": 1.648437716644776e-05, + "loss": 0.3096, + "step": 3751 + }, + { + "epoch": 0.2972469795999208, + "grad_norm": 2.0648495925544057, + "learning_rate": 1.6482423515881455e-05, + "loss": 0.4456, + "step": 3752 + }, + { + "epoch": 0.29732620320855613, + "grad_norm": 1.4552570422854636, + "learning_rate": 1.6480469438478756e-05, + "loss": 0.2939, + "step": 3753 + }, + { + "epoch": 0.2974054268171915, + "grad_norm": 1.6662426434991227, + "learning_rate": 1.6478514934368326e-05, + "loss": 0.3757, + "step": 3754 + }, + { + "epoch": 0.2974846504258269, + "grad_norm": 1.5962851467094807, + "learning_rate": 1.647656000367886e-05, + "loss": 0.3251, + "step": 3755 + }, + { + "epoch": 0.2975638740344623, + "grad_norm": 1.5103269275543294, + "learning_rate": 1.647460464653908e-05, + "loss": 0.3744, + "step": 3756 + }, + { + "epoch": 0.29764309764309765, + "grad_norm": 1.5483971219441284, + "learning_rate": 1.6472648863077737e-05, + "loss": 0.3312, + "step": 3757 + }, + { + "epoch": 0.29772232125173304, + "grad_norm": 1.4794407460995247, + "learning_rate": 1.6470692653423614e-05, + "loss": 0.3563, + "step": 3758 + }, + { + "epoch": 0.2978015448603684, + "grad_norm": 1.5927313826335552, + "learning_rate": 1.6468736017705515e-05, + "loss": 0.4335, + "step": 3759 + }, + { + "epoch": 0.29788076846900374, + "grad_norm": 1.5609038027387239, + "learning_rate": 1.646677895605227e-05, + "loss": 0.3471, + "step": 3760 + }, + { + "epoch": 0.2979599920776391, + "grad_norm": 1.5743584159085167, + "learning_rate": 1.6464821468592748e-05, + "loss": 0.4019, + "step": 3761 + }, + { + "epoch": 0.2980392156862745, + "grad_norm": 1.5769263913700893, + "learning_rate": 1.646286355545584e-05, + "loss": 0.3243, + "step": 3762 + }, + { + "epoch": 0.2981184392949099, + "grad_norm": 1.4067949846430399, + "learning_rate": 1.6460905216770467e-05, + "loss": 0.3049, + "step": 3763 + }, + { + "epoch": 0.29819766290354527, + "grad_norm": 1.6794696717920552, + "learning_rate": 1.6458946452665573e-05, + "loss": 0.4214, + "step": 3764 + }, + { + "epoch": 0.29827688651218065, + "grad_norm": 1.7329479945034956, + "learning_rate": 1.6456987263270132e-05, + "loss": 0.3726, + "step": 3765 + }, + { + "epoch": 0.298356110120816, + "grad_norm": 1.714100066221399, + "learning_rate": 1.645502764871315e-05, + "loss": 0.4985, + "step": 3766 + }, + { + "epoch": 0.29843533372945136, + "grad_norm": 1.6324421847987027, + "learning_rate": 1.6453067609123656e-05, + "loss": 0.4205, + "step": 3767 + }, + { + "epoch": 0.29851455733808674, + "grad_norm": 1.7559885452665687, + "learning_rate": 1.6451107144630708e-05, + "loss": 0.457, + "step": 3768 + }, + { + "epoch": 0.2985937809467221, + "grad_norm": 1.4335785231140599, + "learning_rate": 1.6449146255363395e-05, + "loss": 0.3186, + "step": 3769 + }, + { + "epoch": 0.2986730045553575, + "grad_norm": 1.436299620477261, + "learning_rate": 1.6447184941450833e-05, + "loss": 0.2621, + "step": 3770 + }, + { + "epoch": 0.2987522281639929, + "grad_norm": 1.5070547075831144, + "learning_rate": 1.644522320302217e-05, + "loss": 0.4093, + "step": 3771 + }, + { + "epoch": 0.29883145177262826, + "grad_norm": 1.6426450608368386, + "learning_rate": 1.6443261040206566e-05, + "loss": 0.346, + "step": 3772 + }, + { + "epoch": 0.2989106753812636, + "grad_norm": 1.753915088153408, + "learning_rate": 1.6441298453133224e-05, + "loss": 0.4704, + "step": 3773 + }, + { + "epoch": 0.298989898989899, + "grad_norm": 1.624887100688437, + "learning_rate": 1.6439335441931376e-05, + "loss": 0.3534, + "step": 3774 + }, + { + "epoch": 0.29906912259853435, + "grad_norm": 1.335493481898599, + "learning_rate": 1.6437372006730276e-05, + "loss": 0.2843, + "step": 3775 + }, + { + "epoch": 0.29914834620716974, + "grad_norm": 1.6182677105252745, + "learning_rate": 1.64354081476592e-05, + "loss": 0.4009, + "step": 3776 + }, + { + "epoch": 0.2992275698158051, + "grad_norm": 1.6379374535945792, + "learning_rate": 1.643344386484746e-05, + "loss": 0.4189, + "step": 3777 + }, + { + "epoch": 0.2993067934244405, + "grad_norm": 1.4489817796737543, + "learning_rate": 1.64314791584244e-05, + "loss": 0.3679, + "step": 3778 + }, + { + "epoch": 0.2993860170330759, + "grad_norm": 1.4954248788830213, + "learning_rate": 1.6429514028519383e-05, + "loss": 0.302, + "step": 3779 + }, + { + "epoch": 0.2994652406417112, + "grad_norm": 1.3727648316046877, + "learning_rate": 1.6427548475261807e-05, + "loss": 0.3157, + "step": 3780 + }, + { + "epoch": 0.2995444642503466, + "grad_norm": 1.6889827463718825, + "learning_rate": 1.642558249878109e-05, + "loss": 0.3195, + "step": 3781 + }, + { + "epoch": 0.29962368785898197, + "grad_norm": 2.0113385117015583, + "learning_rate": 1.642361609920668e-05, + "loss": 0.346, + "step": 3782 + }, + { + "epoch": 0.29970291146761735, + "grad_norm": 1.3005111421180062, + "learning_rate": 1.6421649276668065e-05, + "loss": 0.236, + "step": 3783 + }, + { + "epoch": 0.29978213507625273, + "grad_norm": 1.814189959560917, + "learning_rate": 1.641968203129474e-05, + "loss": 0.3674, + "step": 3784 + }, + { + "epoch": 0.2998613586848881, + "grad_norm": 1.3463800810596662, + "learning_rate": 1.641771436321624e-05, + "loss": 0.2878, + "step": 3785 + }, + { + "epoch": 0.2999405822935235, + "grad_norm": 1.5277775792857364, + "learning_rate": 1.6415746272562133e-05, + "loss": 0.3004, + "step": 3786 + }, + { + "epoch": 0.3000198059021588, + "grad_norm": 1.7202398641078458, + "learning_rate": 1.6413777759462005e-05, + "loss": 0.3964, + "step": 3787 + }, + { + "epoch": 0.3000990295107942, + "grad_norm": 1.5780142294976065, + "learning_rate": 1.6411808824045472e-05, + "loss": 0.4101, + "step": 3788 + }, + { + "epoch": 0.3001782531194296, + "grad_norm": 1.8318551528148257, + "learning_rate": 1.640983946644218e-05, + "loss": 0.4305, + "step": 3789 + }, + { + "epoch": 0.30025747672806496, + "grad_norm": 1.5673051254852584, + "learning_rate": 1.64078696867818e-05, + "loss": 0.2752, + "step": 3790 + }, + { + "epoch": 0.30033670033670035, + "grad_norm": 1.514300134962165, + "learning_rate": 1.6405899485194034e-05, + "loss": 0.2852, + "step": 3791 + }, + { + "epoch": 0.3004159239453357, + "grad_norm": 1.7127679297603502, + "learning_rate": 1.640392886180861e-05, + "loss": 0.413, + "step": 3792 + }, + { + "epoch": 0.3004951475539711, + "grad_norm": 1.608088617741294, + "learning_rate": 1.6401957816755286e-05, + "loss": 0.3283, + "step": 3793 + }, + { + "epoch": 0.30057437116260644, + "grad_norm": 1.519048287661634, + "learning_rate": 1.6399986350163844e-05, + "loss": 0.357, + "step": 3794 + }, + { + "epoch": 0.3006535947712418, + "grad_norm": 1.7846920775902686, + "learning_rate": 1.6398014462164093e-05, + "loss": 0.3559, + "step": 3795 + }, + { + "epoch": 0.3007328183798772, + "grad_norm": 1.515741033757868, + "learning_rate": 1.6396042152885874e-05, + "loss": 0.303, + "step": 3796 + }, + { + "epoch": 0.3008120419885126, + "grad_norm": 1.5575963897544893, + "learning_rate": 1.639406942245906e-05, + "loss": 0.361, + "step": 3797 + }, + { + "epoch": 0.30089126559714796, + "grad_norm": 1.5934799767005252, + "learning_rate": 1.639209627101354e-05, + "loss": 0.4425, + "step": 3798 + }, + { + "epoch": 0.30097048920578334, + "grad_norm": 1.9555295092338374, + "learning_rate": 1.6390122698679234e-05, + "loss": 0.31, + "step": 3799 + }, + { + "epoch": 0.3010497128144187, + "grad_norm": 1.4861878242769602, + "learning_rate": 1.6388148705586097e-05, + "loss": 0.3422, + "step": 3800 + }, + { + "epoch": 0.30112893642305405, + "grad_norm": 1.6141246381221648, + "learning_rate": 1.6386174291864106e-05, + "loss": 0.3316, + "step": 3801 + }, + { + "epoch": 0.30120816003168943, + "grad_norm": 1.6918582588963165, + "learning_rate": 1.6384199457643264e-05, + "loss": 0.3889, + "step": 3802 + }, + { + "epoch": 0.3012873836403248, + "grad_norm": 1.961692310085184, + "learning_rate": 1.6382224203053607e-05, + "loss": 0.32, + "step": 3803 + }, + { + "epoch": 0.3013666072489602, + "grad_norm": 1.3305776959827942, + "learning_rate": 1.6380248528225197e-05, + "loss": 0.2861, + "step": 3804 + }, + { + "epoch": 0.3014458308575956, + "grad_norm": 1.487523496772002, + "learning_rate": 1.6378272433288122e-05, + "loss": 0.3223, + "step": 3805 + }, + { + "epoch": 0.30152505446623096, + "grad_norm": 1.8278446964295654, + "learning_rate": 1.6376295918372495e-05, + "loss": 0.4469, + "step": 3806 + }, + { + "epoch": 0.3016042780748663, + "grad_norm": 1.6868377234102636, + "learning_rate": 1.6374318983608464e-05, + "loss": 0.5389, + "step": 3807 + }, + { + "epoch": 0.30168350168350166, + "grad_norm": 2.084304915031796, + "learning_rate": 1.63723416291262e-05, + "loss": 0.392, + "step": 3808 + }, + { + "epoch": 0.30176272529213705, + "grad_norm": 1.7967013467244641, + "learning_rate": 1.63703638550559e-05, + "loss": 0.3478, + "step": 3809 + }, + { + "epoch": 0.3018419489007724, + "grad_norm": 1.8273655052356867, + "learning_rate": 1.6368385661527795e-05, + "loss": 0.3582, + "step": 3810 + }, + { + "epoch": 0.3019211725094078, + "grad_norm": 1.6828164947296698, + "learning_rate": 1.6366407048672135e-05, + "loss": 0.2921, + "step": 3811 + }, + { + "epoch": 0.3020003961180432, + "grad_norm": 1.5472148067232565, + "learning_rate": 1.6364428016619202e-05, + "loss": 0.3374, + "step": 3812 + }, + { + "epoch": 0.30207961972667857, + "grad_norm": 1.5730655180342485, + "learning_rate": 1.636244856549931e-05, + "loss": 0.3068, + "step": 3813 + }, + { + "epoch": 0.3021588433353139, + "grad_norm": 1.4756547347443372, + "learning_rate": 1.6360468695442797e-05, + "loss": 0.359, + "step": 3814 + }, + { + "epoch": 0.3022380669439493, + "grad_norm": 1.7130972831106233, + "learning_rate": 1.6358488406580023e-05, + "loss": 0.4333, + "step": 3815 + }, + { + "epoch": 0.30231729055258466, + "grad_norm": 1.6734596236717552, + "learning_rate": 1.635650769904138e-05, + "loss": 0.4587, + "step": 3816 + }, + { + "epoch": 0.30239651416122004, + "grad_norm": 1.729405056866127, + "learning_rate": 1.6354526572957292e-05, + "loss": 0.4987, + "step": 3817 + }, + { + "epoch": 0.3024757377698554, + "grad_norm": 1.6373355510413856, + "learning_rate": 1.6352545028458206e-05, + "loss": 0.4399, + "step": 3818 + }, + { + "epoch": 0.3025549613784908, + "grad_norm": 1.4211971196948625, + "learning_rate": 1.6350563065674596e-05, + "loss": 0.2919, + "step": 3819 + }, + { + "epoch": 0.3026341849871262, + "grad_norm": 1.6523017112193026, + "learning_rate": 1.6348580684736962e-05, + "loss": 0.4078, + "step": 3820 + }, + { + "epoch": 0.3027134085957615, + "grad_norm": 1.3075951775593628, + "learning_rate": 1.6346597885775843e-05, + "loss": 0.2245, + "step": 3821 + }, + { + "epoch": 0.3027926322043969, + "grad_norm": 1.6879622665776723, + "learning_rate": 1.6344614668921787e-05, + "loss": 0.4017, + "step": 3822 + }, + { + "epoch": 0.3028718558130323, + "grad_norm": 1.6495169799144414, + "learning_rate": 1.6342631034305386e-05, + "loss": 0.4104, + "step": 3823 + }, + { + "epoch": 0.30295107942166766, + "grad_norm": 1.55359658127163, + "learning_rate": 1.634064698205725e-05, + "loss": 0.3575, + "step": 3824 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 1.3333725357229693, + "learning_rate": 1.6338662512308013e-05, + "loss": 0.3173, + "step": 3825 + }, + { + "epoch": 0.3031095266389384, + "grad_norm": 1.4457097277733202, + "learning_rate": 1.6336677625188357e-05, + "loss": 0.3363, + "step": 3826 + }, + { + "epoch": 0.3031887502475738, + "grad_norm": 1.555676829281651, + "learning_rate": 1.6334692320828968e-05, + "loss": 0.3219, + "step": 3827 + }, + { + "epoch": 0.3032679738562091, + "grad_norm": 1.3530281421596255, + "learning_rate": 1.6332706599360568e-05, + "loss": 0.306, + "step": 3828 + }, + { + "epoch": 0.3033471974648445, + "grad_norm": 1.6454117978423115, + "learning_rate": 1.633072046091391e-05, + "loss": 0.3432, + "step": 3829 + }, + { + "epoch": 0.3034264210734799, + "grad_norm": 1.3930626696608348, + "learning_rate": 1.6328733905619775e-05, + "loss": 0.289, + "step": 3830 + }, + { + "epoch": 0.30350564468211527, + "grad_norm": 1.7497257245471574, + "learning_rate": 1.632674693360896e-05, + "loss": 0.3326, + "step": 3831 + }, + { + "epoch": 0.30358486829075065, + "grad_norm": 1.6349530365308782, + "learning_rate": 1.6324759545012306e-05, + "loss": 0.3893, + "step": 3832 + }, + { + "epoch": 0.30366409189938603, + "grad_norm": 1.4708960442420949, + "learning_rate": 1.6322771739960664e-05, + "loss": 0.3058, + "step": 3833 + }, + { + "epoch": 0.3037433155080214, + "grad_norm": 1.4778862415043623, + "learning_rate": 1.6320783518584926e-05, + "loss": 0.3498, + "step": 3834 + }, + { + "epoch": 0.30382253911665674, + "grad_norm": 1.5604297876082858, + "learning_rate": 1.631879488101601e-05, + "loss": 0.4417, + "step": 3835 + }, + { + "epoch": 0.3039017627252921, + "grad_norm": 1.4588366078489061, + "learning_rate": 1.6316805827384856e-05, + "loss": 0.3357, + "step": 3836 + }, + { + "epoch": 0.3039809863339275, + "grad_norm": 1.8178459538402403, + "learning_rate": 1.631481635782243e-05, + "loss": 0.4265, + "step": 3837 + }, + { + "epoch": 0.3040602099425629, + "grad_norm": 1.5781182190565286, + "learning_rate": 1.631282647245973e-05, + "loss": 0.352, + "step": 3838 + }, + { + "epoch": 0.30413943355119827, + "grad_norm": 1.6039945513928018, + "learning_rate": 1.6310836171427788e-05, + "loss": 0.3298, + "step": 3839 + }, + { + "epoch": 0.30421865715983365, + "grad_norm": 1.846700518571357, + "learning_rate": 1.6308845454857647e-05, + "loss": 0.387, + "step": 3840 + }, + { + "epoch": 0.30429788076846903, + "grad_norm": 1.2115120647603959, + "learning_rate": 1.6306854322880386e-05, + "loss": 0.2871, + "step": 3841 + }, + { + "epoch": 0.30437710437710436, + "grad_norm": 3.2410836256660036, + "learning_rate": 1.630486277562712e-05, + "loss": 0.4369, + "step": 3842 + }, + { + "epoch": 0.30445632798573974, + "grad_norm": 1.671704239532948, + "learning_rate": 1.6302870813228974e-05, + "loss": 0.4362, + "step": 3843 + }, + { + "epoch": 0.3045355515943751, + "grad_norm": 1.6085542460584332, + "learning_rate": 1.6300878435817115e-05, + "loss": 0.3678, + "step": 3844 + }, + { + "epoch": 0.3046147752030105, + "grad_norm": 1.6325921389417042, + "learning_rate": 1.6298885643522724e-05, + "loss": 0.3531, + "step": 3845 + }, + { + "epoch": 0.3046939988116459, + "grad_norm": 1.3342545808697894, + "learning_rate": 1.6296892436477024e-05, + "loss": 0.2439, + "step": 3846 + }, + { + "epoch": 0.30477322242028126, + "grad_norm": 1.7166473609594897, + "learning_rate": 1.6294898814811258e-05, + "loss": 0.3329, + "step": 3847 + }, + { + "epoch": 0.3048524460289166, + "grad_norm": 1.873728421446356, + "learning_rate": 1.629290477865669e-05, + "loss": 0.3875, + "step": 3848 + }, + { + "epoch": 0.30493166963755197, + "grad_norm": 1.8667987810816882, + "learning_rate": 1.6290910328144627e-05, + "loss": 0.3824, + "step": 3849 + }, + { + "epoch": 0.30501089324618735, + "grad_norm": 1.6637950329647817, + "learning_rate": 1.6288915463406386e-05, + "loss": 0.3281, + "step": 3850 + }, + { + "epoch": 0.30509011685482273, + "grad_norm": 1.8548886263458098, + "learning_rate": 1.6286920184573324e-05, + "loss": 0.4248, + "step": 3851 + }, + { + "epoch": 0.3051693404634581, + "grad_norm": 1.489059185016581, + "learning_rate": 1.6284924491776815e-05, + "loss": 0.3011, + "step": 3852 + }, + { + "epoch": 0.3052485640720935, + "grad_norm": 1.8782566040331756, + "learning_rate": 1.6282928385148273e-05, + "loss": 0.4654, + "step": 3853 + }, + { + "epoch": 0.3053277876807289, + "grad_norm": 1.6943009082616145, + "learning_rate": 1.6280931864819125e-05, + "loss": 0.3491, + "step": 3854 + }, + { + "epoch": 0.3054070112893642, + "grad_norm": 1.8388427081702123, + "learning_rate": 1.6278934930920834e-05, + "loss": 0.535, + "step": 3855 + }, + { + "epoch": 0.3054862348979996, + "grad_norm": 1.570905121726635, + "learning_rate": 1.6276937583584895e-05, + "loss": 0.3378, + "step": 3856 + }, + { + "epoch": 0.30556545850663497, + "grad_norm": 1.624265042521842, + "learning_rate": 1.6274939822942818e-05, + "loss": 0.3788, + "step": 3857 + }, + { + "epoch": 0.30564468211527035, + "grad_norm": 1.6533235915875253, + "learning_rate": 1.6272941649126146e-05, + "loss": 0.3481, + "step": 3858 + }, + { + "epoch": 0.30572390572390573, + "grad_norm": 1.8394126429612057, + "learning_rate": 1.627094306226645e-05, + "loss": 0.4744, + "step": 3859 + }, + { + "epoch": 0.3058031293325411, + "grad_norm": 1.6354535733095188, + "learning_rate": 1.6268944062495324e-05, + "loss": 0.4803, + "step": 3860 + }, + { + "epoch": 0.3058823529411765, + "grad_norm": 1.800699104887716, + "learning_rate": 1.62669446499444e-05, + "loss": 0.3522, + "step": 3861 + }, + { + "epoch": 0.3059615765498118, + "grad_norm": 1.4575471666967872, + "learning_rate": 1.6264944824745326e-05, + "loss": 0.3918, + "step": 3862 + }, + { + "epoch": 0.3060408001584472, + "grad_norm": 1.518255624908039, + "learning_rate": 1.6262944587029777e-05, + "loss": 0.3738, + "step": 3863 + }, + { + "epoch": 0.3061200237670826, + "grad_norm": 1.6706320996404647, + "learning_rate": 1.6260943936929462e-05, + "loss": 0.3605, + "step": 3864 + }, + { + "epoch": 0.30619924737571796, + "grad_norm": 1.9328751141669256, + "learning_rate": 1.6258942874576117e-05, + "loss": 0.405, + "step": 3865 + }, + { + "epoch": 0.30627847098435335, + "grad_norm": 1.9661923984908691, + "learning_rate": 1.62569414001015e-05, + "loss": 0.438, + "step": 3866 + }, + { + "epoch": 0.3063576945929887, + "grad_norm": 1.4170629236575045, + "learning_rate": 1.6254939513637397e-05, + "loss": 0.2903, + "step": 3867 + }, + { + "epoch": 0.3064369182016241, + "grad_norm": 1.159523129000082, + "learning_rate": 1.6252937215315622e-05, + "loss": 0.2969, + "step": 3868 + }, + { + "epoch": 0.30651614181025943, + "grad_norm": 1.8377983854490623, + "learning_rate": 1.6250934505268025e-05, + "loss": 0.3265, + "step": 3869 + }, + { + "epoch": 0.3065953654188948, + "grad_norm": 1.4498893510292903, + "learning_rate": 1.6248931383626464e-05, + "loss": 0.3737, + "step": 3870 + }, + { + "epoch": 0.3066745890275302, + "grad_norm": 1.7024558999697288, + "learning_rate": 1.6246927850522837e-05, + "loss": 0.3777, + "step": 3871 + }, + { + "epoch": 0.3067538126361656, + "grad_norm": 2.3036091533260517, + "learning_rate": 1.624492390608907e-05, + "loss": 0.3056, + "step": 3872 + }, + { + "epoch": 0.30683303624480096, + "grad_norm": 1.5073818451059444, + "learning_rate": 1.6242919550457116e-05, + "loss": 0.3244, + "step": 3873 + }, + { + "epoch": 0.30691225985343634, + "grad_norm": 1.4083047702812181, + "learning_rate": 1.6240914783758946e-05, + "loss": 0.31, + "step": 3874 + }, + { + "epoch": 0.3069914834620717, + "grad_norm": 2.0002940997862138, + "learning_rate": 1.6238909606126568e-05, + "loss": 0.4826, + "step": 3875 + }, + { + "epoch": 0.30707070707070705, + "grad_norm": 1.3655340967693788, + "learning_rate": 1.6236904017692016e-05, + "loss": 0.3025, + "step": 3876 + }, + { + "epoch": 0.30714993067934243, + "grad_norm": 1.307926462504353, + "learning_rate": 1.6234898018587336e-05, + "loss": 0.2097, + "step": 3877 + }, + { + "epoch": 0.3072291542879778, + "grad_norm": 1.597160466642504, + "learning_rate": 1.6232891608944627e-05, + "loss": 0.5153, + "step": 3878 + }, + { + "epoch": 0.3073083778966132, + "grad_norm": 1.6280797024562519, + "learning_rate": 1.6230884788895998e-05, + "loss": 0.3942, + "step": 3879 + }, + { + "epoch": 0.3073876015052486, + "grad_norm": 1.8013277125103866, + "learning_rate": 1.622887755857358e-05, + "loss": 0.4869, + "step": 3880 + }, + { + "epoch": 0.30746682511388396, + "grad_norm": 1.8721027773073764, + "learning_rate": 1.6226869918109553e-05, + "loss": 0.4217, + "step": 3881 + }, + { + "epoch": 0.30754604872251934, + "grad_norm": 1.4823194796731884, + "learning_rate": 1.62248618676361e-05, + "loss": 0.3684, + "step": 3882 + }, + { + "epoch": 0.30762527233115466, + "grad_norm": 1.7362565226495277, + "learning_rate": 1.6222853407285447e-05, + "loss": 0.3745, + "step": 3883 + }, + { + "epoch": 0.30770449593979005, + "grad_norm": 1.5033862134684643, + "learning_rate": 1.622084453718984e-05, + "loss": 0.3089, + "step": 3884 + }, + { + "epoch": 0.3077837195484254, + "grad_norm": 1.7048923056998178, + "learning_rate": 1.621883525748155e-05, + "loss": 0.4797, + "step": 3885 + }, + { + "epoch": 0.3078629431570608, + "grad_norm": 1.3406782417977685, + "learning_rate": 1.6216825568292885e-05, + "loss": 0.2417, + "step": 3886 + }, + { + "epoch": 0.3079421667656962, + "grad_norm": 2.097270834261255, + "learning_rate": 1.6214815469756165e-05, + "loss": 0.4651, + "step": 3887 + }, + { + "epoch": 0.30802139037433157, + "grad_norm": 1.5855389651706662, + "learning_rate": 1.6212804962003757e-05, + "loss": 0.3393, + "step": 3888 + }, + { + "epoch": 0.3081006139829669, + "grad_norm": 1.7502683201692009, + "learning_rate": 1.6210794045168033e-05, + "loss": 0.4894, + "step": 3889 + }, + { + "epoch": 0.3081798375916023, + "grad_norm": 1.7314268439339218, + "learning_rate": 1.6208782719381403e-05, + "loss": 0.4039, + "step": 3890 + }, + { + "epoch": 0.30825906120023766, + "grad_norm": 1.7203426537643356, + "learning_rate": 1.6206770984776307e-05, + "loss": 0.3743, + "step": 3891 + }, + { + "epoch": 0.30833828480887304, + "grad_norm": 1.7270705470709433, + "learning_rate": 1.620475884148521e-05, + "loss": 0.3745, + "step": 3892 + }, + { + "epoch": 0.3084175084175084, + "grad_norm": 1.7493818191080077, + "learning_rate": 1.6202746289640594e-05, + "loss": 0.3071, + "step": 3893 + }, + { + "epoch": 0.3084967320261438, + "grad_norm": 1.559848287006385, + "learning_rate": 1.620073332937498e-05, + "loss": 0.4243, + "step": 3894 + }, + { + "epoch": 0.3085759556347792, + "grad_norm": 1.7019075087887992, + "learning_rate": 1.6198719960820917e-05, + "loss": 0.2881, + "step": 3895 + }, + { + "epoch": 0.3086551792434145, + "grad_norm": 1.481829633295651, + "learning_rate": 1.619670618411097e-05, + "loss": 0.4019, + "step": 3896 + }, + { + "epoch": 0.3087344028520499, + "grad_norm": 1.5701358815567066, + "learning_rate": 1.6194691999377736e-05, + "loss": 0.3249, + "step": 3897 + }, + { + "epoch": 0.3088136264606853, + "grad_norm": 1.6538956280133923, + "learning_rate": 1.619267740675384e-05, + "loss": 0.3555, + "step": 3898 + }, + { + "epoch": 0.30889285006932066, + "grad_norm": 1.2012242832602773, + "learning_rate": 1.6190662406371937e-05, + "loss": 0.2477, + "step": 3899 + }, + { + "epoch": 0.30897207367795604, + "grad_norm": 1.6532369613693252, + "learning_rate": 1.6188646998364703e-05, + "loss": 0.3867, + "step": 3900 + }, + { + "epoch": 0.3090512972865914, + "grad_norm": 1.5200424322822466, + "learning_rate": 1.6186631182864835e-05, + "loss": 0.3138, + "step": 3901 + }, + { + "epoch": 0.3091305208952268, + "grad_norm": 1.4371225729579056, + "learning_rate": 1.6184614960005078e-05, + "loss": 0.2851, + "step": 3902 + }, + { + "epoch": 0.3092097445038621, + "grad_norm": 1.7941207407615067, + "learning_rate": 1.6182598329918185e-05, + "loss": 0.3511, + "step": 3903 + }, + { + "epoch": 0.3092889681124975, + "grad_norm": 1.4622290904890745, + "learning_rate": 1.6180581292736938e-05, + "loss": 0.2585, + "step": 3904 + }, + { + "epoch": 0.3093681917211329, + "grad_norm": 1.458089183011868, + "learning_rate": 1.617856384859415e-05, + "loss": 0.3436, + "step": 3905 + }, + { + "epoch": 0.30944741532976827, + "grad_norm": 1.4393034561998665, + "learning_rate": 1.6176545997622662e-05, + "loss": 0.2405, + "step": 3906 + }, + { + "epoch": 0.30952663893840365, + "grad_norm": 1.5693974162032396, + "learning_rate": 1.6174527739955345e-05, + "loss": 0.3897, + "step": 3907 + }, + { + "epoch": 0.30960586254703903, + "grad_norm": 1.4801643184762672, + "learning_rate": 1.6172509075725084e-05, + "loss": 0.3065, + "step": 3908 + }, + { + "epoch": 0.3096850861556744, + "grad_norm": 1.8318505679163875, + "learning_rate": 1.61704900050648e-05, + "loss": 0.3175, + "step": 3909 + }, + { + "epoch": 0.30976430976430974, + "grad_norm": 1.769591033144959, + "learning_rate": 1.616847052810744e-05, + "loss": 0.4635, + "step": 3910 + }, + { + "epoch": 0.3098435333729451, + "grad_norm": 1.5028956758737584, + "learning_rate": 1.6166450644985975e-05, + "loss": 0.3215, + "step": 3911 + }, + { + "epoch": 0.3099227569815805, + "grad_norm": 1.5081839139914694, + "learning_rate": 1.6164430355833407e-05, + "loss": 0.3, + "step": 3912 + }, + { + "epoch": 0.3100019805902159, + "grad_norm": 1.4043088547051357, + "learning_rate": 1.616240966078276e-05, + "loss": 0.3397, + "step": 3913 + }, + { + "epoch": 0.31008120419885127, + "grad_norm": 1.452618257096568, + "learning_rate": 1.616038855996709e-05, + "loss": 0.2589, + "step": 3914 + }, + { + "epoch": 0.31016042780748665, + "grad_norm": 2.615123288208856, + "learning_rate": 1.6158367053519476e-05, + "loss": 0.3309, + "step": 3915 + }, + { + "epoch": 0.31023965141612203, + "grad_norm": 1.697277988130609, + "learning_rate": 1.6156345141573022e-05, + "loss": 0.3964, + "step": 3916 + }, + { + "epoch": 0.31031887502475736, + "grad_norm": 1.8478504351393588, + "learning_rate": 1.6154322824260865e-05, + "loss": 0.4758, + "step": 3917 + }, + { + "epoch": 0.31039809863339274, + "grad_norm": 1.7877943177008, + "learning_rate": 1.615230010171616e-05, + "loss": 0.3514, + "step": 3918 + }, + { + "epoch": 0.3104773222420281, + "grad_norm": 1.5023807468932024, + "learning_rate": 1.61502769740721e-05, + "loss": 0.3596, + "step": 3919 + }, + { + "epoch": 0.3105565458506635, + "grad_norm": 1.429157569810737, + "learning_rate": 1.6148253441461887e-05, + "loss": 0.2953, + "step": 3920 + }, + { + "epoch": 0.3106357694592989, + "grad_norm": 1.550399195990936, + "learning_rate": 1.6146229504018777e-05, + "loss": 0.4026, + "step": 3921 + }, + { + "epoch": 0.31071499306793426, + "grad_norm": 1.77126908240208, + "learning_rate": 1.6144205161876023e-05, + "loss": 0.4375, + "step": 3922 + }, + { + "epoch": 0.3107942166765696, + "grad_norm": 1.4480220999658833, + "learning_rate": 1.6142180415166926e-05, + "loss": 0.2819, + "step": 3923 + }, + { + "epoch": 0.31087344028520497, + "grad_norm": 1.9031412330115325, + "learning_rate": 1.61401552640248e-05, + "loss": 0.4306, + "step": 3924 + }, + { + "epoch": 0.31095266389384035, + "grad_norm": 1.5539700436960762, + "learning_rate": 1.6138129708582996e-05, + "loss": 0.2819, + "step": 3925 + }, + { + "epoch": 0.31103188750247573, + "grad_norm": 1.3850176489443191, + "learning_rate": 1.6136103748974885e-05, + "loss": 0.2509, + "step": 3926 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 1.8014291452920574, + "learning_rate": 1.6134077385333867e-05, + "loss": 0.3507, + "step": 3927 + }, + { + "epoch": 0.3111903347197465, + "grad_norm": 1.5178962006744077, + "learning_rate": 1.613205061779337e-05, + "loss": 0.3414, + "step": 3928 + }, + { + "epoch": 0.3112695583283819, + "grad_norm": 1.434360336941996, + "learning_rate": 1.6130023446486844e-05, + "loss": 0.3486, + "step": 3929 + }, + { + "epoch": 0.3113487819370172, + "grad_norm": 1.8797197326748423, + "learning_rate": 1.612799587154777e-05, + "loss": 0.4426, + "step": 3930 + }, + { + "epoch": 0.3114280055456526, + "grad_norm": 1.4977091745133497, + "learning_rate": 1.6125967893109657e-05, + "loss": 0.2946, + "step": 3931 + }, + { + "epoch": 0.31150722915428797, + "grad_norm": 1.577594247482412, + "learning_rate": 1.6123939511306028e-05, + "loss": 0.3169, + "step": 3932 + }, + { + "epoch": 0.31158645276292335, + "grad_norm": 1.6050750522062, + "learning_rate": 1.6121910726270453e-05, + "loss": 0.3593, + "step": 3933 + }, + { + "epoch": 0.31166567637155873, + "grad_norm": 1.7068248027418338, + "learning_rate": 1.6119881538136514e-05, + "loss": 0.3789, + "step": 3934 + }, + { + "epoch": 0.3117448999801941, + "grad_norm": 1.5085712753038054, + "learning_rate": 1.611785194703782e-05, + "loss": 0.3435, + "step": 3935 + }, + { + "epoch": 0.3118241235888295, + "grad_norm": 1.4848287162898604, + "learning_rate": 1.6115821953108015e-05, + "loss": 0.2968, + "step": 3936 + }, + { + "epoch": 0.3119033471974648, + "grad_norm": 1.7332486526812132, + "learning_rate": 1.611379155648076e-05, + "loss": 0.3817, + "step": 3937 + }, + { + "epoch": 0.3119825708061002, + "grad_norm": 1.4199092339714177, + "learning_rate": 1.611176075728975e-05, + "loss": 0.2963, + "step": 3938 + }, + { + "epoch": 0.3120617944147356, + "grad_norm": 1.5233312762977607, + "learning_rate": 1.61097295556687e-05, + "loss": 0.3474, + "step": 3939 + }, + { + "epoch": 0.31214101802337096, + "grad_norm": 1.6254257284513742, + "learning_rate": 1.610769795175136e-05, + "loss": 0.3002, + "step": 3940 + }, + { + "epoch": 0.31222024163200635, + "grad_norm": 1.3463635393826112, + "learning_rate": 1.6105665945671497e-05, + "loss": 0.2301, + "step": 3941 + }, + { + "epoch": 0.3122994652406417, + "grad_norm": 1.7120530973401842, + "learning_rate": 1.610363353756291e-05, + "loss": 0.374, + "step": 3942 + }, + { + "epoch": 0.3123786888492771, + "grad_norm": 1.2062836633348841, + "learning_rate": 1.6101600727559423e-05, + "loss": 0.2285, + "step": 3943 + }, + { + "epoch": 0.31245791245791243, + "grad_norm": 1.654651560809471, + "learning_rate": 1.6099567515794886e-05, + "loss": 0.3142, + "step": 3944 + }, + { + "epoch": 0.3125371360665478, + "grad_norm": 1.35080593020383, + "learning_rate": 1.609753390240318e-05, + "loss": 0.2227, + "step": 3945 + }, + { + "epoch": 0.3126163596751832, + "grad_norm": 1.5811421030776875, + "learning_rate": 1.6095499887518204e-05, + "loss": 0.2919, + "step": 3946 + }, + { + "epoch": 0.3126955832838186, + "grad_norm": 1.6351038035110637, + "learning_rate": 1.6093465471273894e-05, + "loss": 0.4352, + "step": 3947 + }, + { + "epoch": 0.31277480689245396, + "grad_norm": 1.6594707654159093, + "learning_rate": 1.60914306538042e-05, + "loss": 0.3711, + "step": 3948 + }, + { + "epoch": 0.31285403050108934, + "grad_norm": 1.9952025371852224, + "learning_rate": 1.6089395435243105e-05, + "loss": 0.4503, + "step": 3949 + }, + { + "epoch": 0.3129332541097247, + "grad_norm": 1.5302430177619408, + "learning_rate": 1.6087359815724623e-05, + "loss": 0.3939, + "step": 3950 + }, + { + "epoch": 0.31301247771836005, + "grad_norm": 1.5069625663961839, + "learning_rate": 1.6085323795382785e-05, + "loss": 0.3061, + "step": 3951 + }, + { + "epoch": 0.31309170132699543, + "grad_norm": 1.663772035447464, + "learning_rate": 1.608328737435166e-05, + "loss": 0.3456, + "step": 3952 + }, + { + "epoch": 0.3131709249356308, + "grad_norm": 1.3213608162427135, + "learning_rate": 1.608125055276533e-05, + "loss": 0.2756, + "step": 3953 + }, + { + "epoch": 0.3132501485442662, + "grad_norm": 1.672841048395237, + "learning_rate": 1.607921333075791e-05, + "loss": 0.3514, + "step": 3954 + }, + { + "epoch": 0.3133293721529016, + "grad_norm": 1.8972876988121614, + "learning_rate": 1.607717570846355e-05, + "loss": 0.4897, + "step": 3955 + }, + { + "epoch": 0.31340859576153696, + "grad_norm": 1.6312047293334224, + "learning_rate": 1.6075137686016408e-05, + "loss": 0.3069, + "step": 3956 + }, + { + "epoch": 0.31348781937017234, + "grad_norm": 1.7777307357017949, + "learning_rate": 1.6073099263550677e-05, + "loss": 0.3673, + "step": 3957 + }, + { + "epoch": 0.31356704297880766, + "grad_norm": 1.5043702946374968, + "learning_rate": 1.6071060441200587e-05, + "loss": 0.2619, + "step": 3958 + }, + { + "epoch": 0.31364626658744305, + "grad_norm": 1.6649082876196961, + "learning_rate": 1.6069021219100375e-05, + "loss": 0.4012, + "step": 3959 + }, + { + "epoch": 0.3137254901960784, + "grad_norm": 1.4555198485720622, + "learning_rate": 1.606698159738432e-05, + "loss": 0.2845, + "step": 3960 + }, + { + "epoch": 0.3138047138047138, + "grad_norm": 1.5385885315447563, + "learning_rate": 1.606494157618672e-05, + "loss": 0.3841, + "step": 3961 + }, + { + "epoch": 0.3138839374133492, + "grad_norm": 1.370743562057957, + "learning_rate": 1.60629011556419e-05, + "loss": 0.329, + "step": 3962 + }, + { + "epoch": 0.31396316102198457, + "grad_norm": 1.3083362360324118, + "learning_rate": 1.6060860335884208e-05, + "loss": 0.2658, + "step": 3963 + }, + { + "epoch": 0.3140423846306199, + "grad_norm": 1.6555569799844825, + "learning_rate": 1.605881911704803e-05, + "loss": 0.3806, + "step": 3964 + }, + { + "epoch": 0.3141216082392553, + "grad_norm": 1.922946414992996, + "learning_rate": 1.6056777499267764e-05, + "loss": 0.4366, + "step": 3965 + }, + { + "epoch": 0.31420083184789066, + "grad_norm": 1.5994553184309726, + "learning_rate": 1.6054735482677842e-05, + "loss": 0.3834, + "step": 3966 + }, + { + "epoch": 0.31428005545652604, + "grad_norm": 1.5442959135058916, + "learning_rate": 1.6052693067412724e-05, + "loss": 0.2872, + "step": 3967 + }, + { + "epoch": 0.3143592790651614, + "grad_norm": 1.3506474309480183, + "learning_rate": 1.605065025360689e-05, + "loss": 0.3089, + "step": 3968 + }, + { + "epoch": 0.3144385026737968, + "grad_norm": 1.4269200021768844, + "learning_rate": 1.6048607041394856e-05, + "loss": 0.2912, + "step": 3969 + }, + { + "epoch": 0.3145177262824322, + "grad_norm": 1.499493156096059, + "learning_rate": 1.6046563430911148e-05, + "loss": 0.3126, + "step": 3970 + }, + { + "epoch": 0.3145969498910675, + "grad_norm": 1.5649153072445345, + "learning_rate": 1.6044519422290333e-05, + "loss": 0.3493, + "step": 3971 + }, + { + "epoch": 0.3146761734997029, + "grad_norm": 1.3721638257787552, + "learning_rate": 1.6042475015666995e-05, + "loss": 0.3073, + "step": 3972 + }, + { + "epoch": 0.3147553971083383, + "grad_norm": 1.3633820418866238, + "learning_rate": 1.604043021117575e-05, + "loss": 0.3121, + "step": 3973 + }, + { + "epoch": 0.31483462071697366, + "grad_norm": 1.4707268718651163, + "learning_rate": 1.603838500895125e-05, + "loss": 0.3345, + "step": 3974 + }, + { + "epoch": 0.31491384432560904, + "grad_norm": 1.7142632728060705, + "learning_rate": 1.6036339409128146e-05, + "loss": 0.4247, + "step": 3975 + }, + { + "epoch": 0.3149930679342444, + "grad_norm": 1.461935275688194, + "learning_rate": 1.603429341184114e-05, + "loss": 0.3061, + "step": 3976 + }, + { + "epoch": 0.3150722915428798, + "grad_norm": 1.4903185708386406, + "learning_rate": 1.6032247017224944e-05, + "loss": 0.35, + "step": 3977 + }, + { + "epoch": 0.3151515151515151, + "grad_norm": 1.5914286040921042, + "learning_rate": 1.603020022541431e-05, + "loss": 0.2862, + "step": 3978 + }, + { + "epoch": 0.3152307387601505, + "grad_norm": 1.5253054198857707, + "learning_rate": 1.6028153036544005e-05, + "loss": 0.3879, + "step": 3979 + }, + { + "epoch": 0.3153099623687859, + "grad_norm": 1.6728327456698378, + "learning_rate": 1.6026105450748826e-05, + "loss": 0.4079, + "step": 3980 + }, + { + "epoch": 0.31538918597742127, + "grad_norm": 1.2396194247137853, + "learning_rate": 1.6024057468163604e-05, + "loss": 0.2308, + "step": 3981 + }, + { + "epoch": 0.31546840958605665, + "grad_norm": 1.385846772012705, + "learning_rate": 1.602200908892318e-05, + "loss": 0.3088, + "step": 3982 + }, + { + "epoch": 0.31554763319469203, + "grad_norm": 1.7970845809909852, + "learning_rate": 1.6019960313162436e-05, + "loss": 0.2787, + "step": 3983 + }, + { + "epoch": 0.3156268568033274, + "grad_norm": 1.4107491670450274, + "learning_rate": 1.601791114101627e-05, + "loss": 0.3101, + "step": 3984 + }, + { + "epoch": 0.31570608041196274, + "grad_norm": 1.7846985614927462, + "learning_rate": 1.6015861572619612e-05, + "loss": 0.3924, + "step": 3985 + }, + { + "epoch": 0.3157853040205981, + "grad_norm": 1.5642993595642487, + "learning_rate": 1.6013811608107415e-05, + "loss": 0.3673, + "step": 3986 + }, + { + "epoch": 0.3158645276292335, + "grad_norm": 1.6853474268140274, + "learning_rate": 1.6011761247614664e-05, + "loss": 0.3455, + "step": 3987 + }, + { + "epoch": 0.3159437512378689, + "grad_norm": 1.394722694232811, + "learning_rate": 1.600971049127636e-05, + "loss": 0.2826, + "step": 3988 + }, + { + "epoch": 0.31602297484650427, + "grad_norm": 1.4182497128014389, + "learning_rate": 1.6007659339227534e-05, + "loss": 0.2925, + "step": 3989 + }, + { + "epoch": 0.31610219845513965, + "grad_norm": 1.6343899804812616, + "learning_rate": 1.6005607791603247e-05, + "loss": 0.2502, + "step": 3990 + }, + { + "epoch": 0.31618142206377503, + "grad_norm": 1.679181719415134, + "learning_rate": 1.6003555848538586e-05, + "loss": 0.3994, + "step": 3991 + }, + { + "epoch": 0.31626064567241036, + "grad_norm": 1.6381870325344796, + "learning_rate": 1.600150351016866e-05, + "loss": 0.3061, + "step": 3992 + }, + { + "epoch": 0.31633986928104574, + "grad_norm": 1.7226596229115656, + "learning_rate": 1.5999450776628607e-05, + "loss": 0.4597, + "step": 3993 + }, + { + "epoch": 0.3164190928896811, + "grad_norm": 1.7248008028923216, + "learning_rate": 1.5997397648053587e-05, + "loss": 0.4571, + "step": 3994 + }, + { + "epoch": 0.3164983164983165, + "grad_norm": 1.4863097883307261, + "learning_rate": 1.599534412457879e-05, + "loss": 0.2978, + "step": 3995 + }, + { + "epoch": 0.3165775401069519, + "grad_norm": 1.8978919659293474, + "learning_rate": 1.5993290206339426e-05, + "loss": 0.3633, + "step": 3996 + }, + { + "epoch": 0.31665676371558726, + "grad_norm": 2.0684281114131244, + "learning_rate": 1.5991235893470745e-05, + "loss": 0.4515, + "step": 3997 + }, + { + "epoch": 0.31673598732422265, + "grad_norm": 1.45627374081648, + "learning_rate": 1.5989181186108003e-05, + "loss": 0.3664, + "step": 3998 + }, + { + "epoch": 0.31681521093285797, + "grad_norm": 1.6866353176737428, + "learning_rate": 1.59871260843865e-05, + "loss": 0.3215, + "step": 3999 + }, + { + "epoch": 0.31689443454149335, + "grad_norm": 1.4502791134570638, + "learning_rate": 1.5985070588441556e-05, + "loss": 0.3696, + "step": 4000 + }, + { + "epoch": 0.31697365815012873, + "grad_norm": 1.8553075739887375, + "learning_rate": 1.598301469840851e-05, + "loss": 0.3829, + "step": 4001 + }, + { + "epoch": 0.3170528817587641, + "grad_norm": 1.5343213790540935, + "learning_rate": 1.598095841442273e-05, + "loss": 0.3064, + "step": 4002 + }, + { + "epoch": 0.3171321053673995, + "grad_norm": 1.453456987076398, + "learning_rate": 1.5978901736619624e-05, + "loss": 0.3491, + "step": 4003 + }, + { + "epoch": 0.3172113289760349, + "grad_norm": 1.6749931422482662, + "learning_rate": 1.5976844665134607e-05, + "loss": 0.3952, + "step": 4004 + }, + { + "epoch": 0.3172905525846702, + "grad_norm": 1.5701432097954482, + "learning_rate": 1.5974787200103124e-05, + "loss": 0.3798, + "step": 4005 + }, + { + "epoch": 0.3173697761933056, + "grad_norm": 1.9168291490562859, + "learning_rate": 1.5972729341660653e-05, + "loss": 0.4636, + "step": 4006 + }, + { + "epoch": 0.31744899980194097, + "grad_norm": 1.5108320652242277, + "learning_rate": 1.597067108994269e-05, + "loss": 0.3008, + "step": 4007 + }, + { + "epoch": 0.31752822341057635, + "grad_norm": 1.3485295180730346, + "learning_rate": 1.5968612445084773e-05, + "loss": 0.2682, + "step": 4008 + }, + { + "epoch": 0.31760744701921173, + "grad_norm": 1.9265023874522214, + "learning_rate": 1.596655340722244e-05, + "loss": 0.4162, + "step": 4009 + }, + { + "epoch": 0.3176866706278471, + "grad_norm": 2.026081117900468, + "learning_rate": 1.5964493976491278e-05, + "loss": 0.3426, + "step": 4010 + }, + { + "epoch": 0.3177658942364825, + "grad_norm": 1.5092003847971471, + "learning_rate": 1.5962434153026884e-05, + "loss": 0.3542, + "step": 4011 + }, + { + "epoch": 0.3178451178451178, + "grad_norm": 2.05854167333894, + "learning_rate": 1.596037393696489e-05, + "loss": 0.3611, + "step": 4012 + }, + { + "epoch": 0.3179243414537532, + "grad_norm": 1.873707046514757, + "learning_rate": 1.5958313328440954e-05, + "loss": 0.2826, + "step": 4013 + }, + { + "epoch": 0.3180035650623886, + "grad_norm": 2.1294997035567325, + "learning_rate": 1.595625232759076e-05, + "loss": 0.6559, + "step": 4014 + }, + { + "epoch": 0.31808278867102396, + "grad_norm": 1.4349506552742113, + "learning_rate": 1.595419093455e-05, + "loss": 0.2362, + "step": 4015 + }, + { + "epoch": 0.31816201227965935, + "grad_norm": 1.8846758055636847, + "learning_rate": 1.5952129149454422e-05, + "loss": 0.4909, + "step": 4016 + }, + { + "epoch": 0.3182412358882947, + "grad_norm": 1.5005705266160903, + "learning_rate": 1.595006697243978e-05, + "loss": 0.3242, + "step": 4017 + }, + { + "epoch": 0.3183204594969301, + "grad_norm": 1.4075272590450865, + "learning_rate": 1.5948004403641853e-05, + "loss": 0.35, + "step": 4018 + }, + { + "epoch": 0.31839968310556543, + "grad_norm": 1.6490155039931167, + "learning_rate": 1.594594144319646e-05, + "loss": 0.3933, + "step": 4019 + }, + { + "epoch": 0.3184789067142008, + "grad_norm": 1.8022323247782437, + "learning_rate": 1.594387809123943e-05, + "loss": 0.3335, + "step": 4020 + }, + { + "epoch": 0.3185581303228362, + "grad_norm": 1.5495894082566937, + "learning_rate": 1.594181434790663e-05, + "loss": 0.3128, + "step": 4021 + }, + { + "epoch": 0.3186373539314716, + "grad_norm": 1.5184128628676445, + "learning_rate": 1.5939750213333948e-05, + "loss": 0.3471, + "step": 4022 + }, + { + "epoch": 0.31871657754010696, + "grad_norm": 1.629351105383533, + "learning_rate": 1.593768568765729e-05, + "loss": 0.315, + "step": 4023 + }, + { + "epoch": 0.31879580114874234, + "grad_norm": 1.517339322393902, + "learning_rate": 1.5935620771012603e-05, + "loss": 0.3716, + "step": 4024 + }, + { + "epoch": 0.3188750247573777, + "grad_norm": 1.4881773229861002, + "learning_rate": 1.5933555463535846e-05, + "loss": 0.2914, + "step": 4025 + }, + { + "epoch": 0.31895424836601305, + "grad_norm": 1.5178618159021746, + "learning_rate": 1.5931489765363014e-05, + "loss": 0.2857, + "step": 4026 + }, + { + "epoch": 0.31903347197464843, + "grad_norm": 1.9089184698038, + "learning_rate": 1.592942367663012e-05, + "loss": 0.4619, + "step": 4027 + }, + { + "epoch": 0.3191126955832838, + "grad_norm": 2.111897087423868, + "learning_rate": 1.5927357197473207e-05, + "loss": 0.4302, + "step": 4028 + }, + { + "epoch": 0.3191919191919192, + "grad_norm": 1.9595311271334965, + "learning_rate": 1.5925290328028346e-05, + "loss": 0.2981, + "step": 4029 + }, + { + "epoch": 0.3192711428005546, + "grad_norm": 2.0274748403299516, + "learning_rate": 1.5923223068431626e-05, + "loss": 0.4058, + "step": 4030 + }, + { + "epoch": 0.31935036640918996, + "grad_norm": 1.7810799202678977, + "learning_rate": 1.592115541881917e-05, + "loss": 0.3805, + "step": 4031 + }, + { + "epoch": 0.31942959001782534, + "grad_norm": 1.5939194886532888, + "learning_rate": 1.5919087379327116e-05, + "loss": 0.3815, + "step": 4032 + }, + { + "epoch": 0.31950881362646066, + "grad_norm": 1.5290876885449083, + "learning_rate": 1.5917018950091642e-05, + "loss": 0.3322, + "step": 4033 + }, + { + "epoch": 0.31958803723509605, + "grad_norm": 1.8130967687643362, + "learning_rate": 1.591495013124894e-05, + "loss": 0.3481, + "step": 4034 + }, + { + "epoch": 0.3196672608437314, + "grad_norm": 1.7781086584506063, + "learning_rate": 1.591288092293523e-05, + "loss": 0.4384, + "step": 4035 + }, + { + "epoch": 0.3197464844523668, + "grad_norm": 1.7810641738759658, + "learning_rate": 1.5910811325286768e-05, + "loss": 0.5053, + "step": 4036 + }, + { + "epoch": 0.3198257080610022, + "grad_norm": 1.4811799838435034, + "learning_rate": 1.5908741338439818e-05, + "loss": 0.2877, + "step": 4037 + }, + { + "epoch": 0.31990493166963757, + "grad_norm": 1.75385673902383, + "learning_rate": 1.5906670962530683e-05, + "loss": 0.3939, + "step": 4038 + }, + { + "epoch": 0.31998415527827295, + "grad_norm": 1.4463272747486247, + "learning_rate": 1.5904600197695684e-05, + "loss": 0.3113, + "step": 4039 + }, + { + "epoch": 0.3200633788869083, + "grad_norm": 1.7662807921687593, + "learning_rate": 1.5902529044071173e-05, + "loss": 0.4202, + "step": 4040 + }, + { + "epoch": 0.32014260249554366, + "grad_norm": 1.804553606907039, + "learning_rate": 1.590045750179353e-05, + "loss": 0.44, + "step": 4041 + }, + { + "epoch": 0.32022182610417904, + "grad_norm": 1.2430074461173346, + "learning_rate": 1.5898385570999146e-05, + "loss": 0.2654, + "step": 4042 + }, + { + "epoch": 0.3203010497128144, + "grad_norm": 1.671144080773817, + "learning_rate": 1.589631325182446e-05, + "loss": 0.3594, + "step": 4043 + }, + { + "epoch": 0.3203802733214498, + "grad_norm": 1.534521781173824, + "learning_rate": 1.589424054440591e-05, + "loss": 0.3198, + "step": 4044 + }, + { + "epoch": 0.3204594969300852, + "grad_norm": 1.4216959107307843, + "learning_rate": 1.5892167448879984e-05, + "loss": 0.292, + "step": 4045 + }, + { + "epoch": 0.3205387205387205, + "grad_norm": 1.6928449619548298, + "learning_rate": 1.5890093965383186e-05, + "loss": 0.4057, + "step": 4046 + }, + { + "epoch": 0.3206179441473559, + "grad_norm": 1.6996355327671042, + "learning_rate": 1.588802009405204e-05, + "loss": 0.3946, + "step": 4047 + }, + { + "epoch": 0.3206971677559913, + "grad_norm": 1.41447755486852, + "learning_rate": 1.5885945835023104e-05, + "loss": 0.4286, + "step": 4048 + }, + { + "epoch": 0.32077639136462666, + "grad_norm": 1.750307274779682, + "learning_rate": 1.5883871188432955e-05, + "loss": 0.4238, + "step": 4049 + }, + { + "epoch": 0.32085561497326204, + "grad_norm": 1.4609287919409428, + "learning_rate": 1.5881796154418196e-05, + "loss": 0.2922, + "step": 4050 + }, + { + "epoch": 0.3209348385818974, + "grad_norm": 1.4769480986269932, + "learning_rate": 1.5879720733115464e-05, + "loss": 0.2954, + "step": 4051 + }, + { + "epoch": 0.3210140621905328, + "grad_norm": 1.410933015347825, + "learning_rate": 1.5877644924661412e-05, + "loss": 0.3133, + "step": 4052 + }, + { + "epoch": 0.3210932857991681, + "grad_norm": 1.6978830498046613, + "learning_rate": 1.5875568729192728e-05, + "loss": 0.2862, + "step": 4053 + }, + { + "epoch": 0.3211725094078035, + "grad_norm": 1.246344728350576, + "learning_rate": 1.587349214684611e-05, + "loss": 0.2976, + "step": 4054 + }, + { + "epoch": 0.3212517330164389, + "grad_norm": 1.4697024731348896, + "learning_rate": 1.5871415177758297e-05, + "loss": 0.3432, + "step": 4055 + }, + { + "epoch": 0.32133095662507427, + "grad_norm": 1.6731277566867855, + "learning_rate": 1.5869337822066043e-05, + "loss": 0.3906, + "step": 4056 + }, + { + "epoch": 0.32141018023370965, + "grad_norm": 1.5432068633480194, + "learning_rate": 1.586726007990614e-05, + "loss": 0.3477, + "step": 4057 + }, + { + "epoch": 0.32148940384234503, + "grad_norm": 1.6744279487938063, + "learning_rate": 1.586518195141539e-05, + "loss": 0.2965, + "step": 4058 + }, + { + "epoch": 0.3215686274509804, + "grad_norm": 1.4676313194712571, + "learning_rate": 1.5863103436730627e-05, + "loss": 0.3082, + "step": 4059 + }, + { + "epoch": 0.32164785105961574, + "grad_norm": 1.5485100068176951, + "learning_rate": 1.586102453598872e-05, + "loss": 0.313, + "step": 4060 + }, + { + "epoch": 0.3217270746682511, + "grad_norm": 1.7758709047419037, + "learning_rate": 1.5858945249326545e-05, + "loss": 0.4067, + "step": 4061 + }, + { + "epoch": 0.3218062982768865, + "grad_norm": 2.059127526753576, + "learning_rate": 1.5856865576881016e-05, + "loss": 0.3675, + "step": 4062 + }, + { + "epoch": 0.3218855218855219, + "grad_norm": 1.6719954529409657, + "learning_rate": 1.5854785518789074e-05, + "loss": 0.3794, + "step": 4063 + }, + { + "epoch": 0.32196474549415727, + "grad_norm": 1.4205656804115565, + "learning_rate": 1.5852705075187674e-05, + "loss": 0.3279, + "step": 4064 + }, + { + "epoch": 0.32204396910279265, + "grad_norm": 1.7520572289418674, + "learning_rate": 1.5850624246213805e-05, + "loss": 0.3618, + "step": 4065 + }, + { + "epoch": 0.32212319271142803, + "grad_norm": 1.5448127649430832, + "learning_rate": 1.5848543032004483e-05, + "loss": 0.4441, + "step": 4066 + }, + { + "epoch": 0.32220241632006336, + "grad_norm": 1.327499650582626, + "learning_rate": 1.5846461432696744e-05, + "loss": 0.2979, + "step": 4067 + }, + { + "epoch": 0.32228163992869874, + "grad_norm": 1.870242632491099, + "learning_rate": 1.5844379448427648e-05, + "loss": 0.5472, + "step": 4068 + }, + { + "epoch": 0.3223608635373341, + "grad_norm": 1.1038237678112879, + "learning_rate": 1.5842297079334293e-05, + "loss": 0.207, + "step": 4069 + }, + { + "epoch": 0.3224400871459695, + "grad_norm": 1.4843928080800322, + "learning_rate": 1.5840214325553782e-05, + "loss": 0.3386, + "step": 4070 + }, + { + "epoch": 0.3225193107546049, + "grad_norm": 1.610494537503993, + "learning_rate": 1.583813118722326e-05, + "loss": 0.413, + "step": 4071 + }, + { + "epoch": 0.32259853436324026, + "grad_norm": 1.9873367423345312, + "learning_rate": 1.583604766447989e-05, + "loss": 0.4168, + "step": 4072 + }, + { + "epoch": 0.32267775797187565, + "grad_norm": 1.6020008517869455, + "learning_rate": 1.5833963757460863e-05, + "loss": 0.3319, + "step": 4073 + }, + { + "epoch": 0.32275698158051097, + "grad_norm": 1.7035260271569024, + "learning_rate": 1.5831879466303393e-05, + "loss": 0.3486, + "step": 4074 + }, + { + "epoch": 0.32283620518914635, + "grad_norm": 1.8486188396204775, + "learning_rate": 1.5829794791144723e-05, + "loss": 0.4305, + "step": 4075 + }, + { + "epoch": 0.32291542879778173, + "grad_norm": 1.706390053357653, + "learning_rate": 1.5827709732122115e-05, + "loss": 0.4006, + "step": 4076 + }, + { + "epoch": 0.3229946524064171, + "grad_norm": 1.6359573319287772, + "learning_rate": 1.5825624289372864e-05, + "loss": 0.4537, + "step": 4077 + }, + { + "epoch": 0.3230738760150525, + "grad_norm": 1.8092075318079248, + "learning_rate": 1.5823538463034283e-05, + "loss": 0.2648, + "step": 4078 + }, + { + "epoch": 0.3231530996236879, + "grad_norm": 1.8139323968687953, + "learning_rate": 1.5821452253243718e-05, + "loss": 0.3806, + "step": 4079 + }, + { + "epoch": 0.32323232323232326, + "grad_norm": 1.4462998240253209, + "learning_rate": 1.581936566013853e-05, + "loss": 0.4712, + "step": 4080 + }, + { + "epoch": 0.3233115468409586, + "grad_norm": 1.2640091811161325, + "learning_rate": 1.5817278683856117e-05, + "loss": 0.2678, + "step": 4081 + }, + { + "epoch": 0.32339077044959397, + "grad_norm": 1.8705992247017151, + "learning_rate": 1.5815191324533893e-05, + "loss": 0.3709, + "step": 4082 + }, + { + "epoch": 0.32346999405822935, + "grad_norm": 2.209763043359014, + "learning_rate": 1.58131035823093e-05, + "loss": 0.3603, + "step": 4083 + }, + { + "epoch": 0.32354921766686473, + "grad_norm": 1.945451480736648, + "learning_rate": 1.581101545731981e-05, + "loss": 0.3403, + "step": 4084 + }, + { + "epoch": 0.3236284412755001, + "grad_norm": 1.627318957001608, + "learning_rate": 1.580892694970291e-05, + "loss": 0.3793, + "step": 4085 + }, + { + "epoch": 0.3237076648841355, + "grad_norm": 1.3674463815413995, + "learning_rate": 1.580683805959612e-05, + "loss": 0.2761, + "step": 4086 + }, + { + "epoch": 0.3237868884927708, + "grad_norm": 1.3374556326086156, + "learning_rate": 1.5804748787136987e-05, + "loss": 0.2826, + "step": 4087 + }, + { + "epoch": 0.3238661121014062, + "grad_norm": 1.5059445921408614, + "learning_rate": 1.5802659132463076e-05, + "loss": 0.2897, + "step": 4088 + }, + { + "epoch": 0.3239453357100416, + "grad_norm": 1.7116220057117026, + "learning_rate": 1.5800569095711983e-05, + "loss": 0.247, + "step": 4089 + }, + { + "epoch": 0.32402455931867696, + "grad_norm": 1.7416405539798918, + "learning_rate": 1.5798478677021327e-05, + "loss": 0.4265, + "step": 4090 + }, + { + "epoch": 0.32410378292731234, + "grad_norm": 1.8017931813708092, + "learning_rate": 1.5796387876528746e-05, + "loss": 0.4326, + "step": 4091 + }, + { + "epoch": 0.3241830065359477, + "grad_norm": 1.613586066070733, + "learning_rate": 1.579429669437192e-05, + "loss": 0.3269, + "step": 4092 + }, + { + "epoch": 0.3242622301445831, + "grad_norm": 2.0851014721491206, + "learning_rate": 1.579220513068853e-05, + "loss": 0.4173, + "step": 4093 + }, + { + "epoch": 0.32434145375321843, + "grad_norm": 1.6153451371930847, + "learning_rate": 1.5790113185616305e-05, + "loss": 0.3289, + "step": 4094 + }, + { + "epoch": 0.3244206773618538, + "grad_norm": 1.502787218227778, + "learning_rate": 1.5788020859292987e-05, + "loss": 0.2889, + "step": 4095 + }, + { + "epoch": 0.3244999009704892, + "grad_norm": 1.4847709663047504, + "learning_rate": 1.5785928151856345e-05, + "loss": 0.3386, + "step": 4096 + }, + { + "epoch": 0.3245791245791246, + "grad_norm": 1.2691896968235292, + "learning_rate": 1.5783835063444176e-05, + "loss": 0.2441, + "step": 4097 + }, + { + "epoch": 0.32465834818775996, + "grad_norm": 1.6602498908243948, + "learning_rate": 1.57817415941943e-05, + "loss": 0.3856, + "step": 4098 + }, + { + "epoch": 0.32473757179639534, + "grad_norm": 1.6273179574346803, + "learning_rate": 1.5779647744244556e-05, + "loss": 0.3513, + "step": 4099 + }, + { + "epoch": 0.3248167954050307, + "grad_norm": 1.7727630279084312, + "learning_rate": 1.577755351373282e-05, + "loss": 0.4255, + "step": 4100 + }, + { + "epoch": 0.32489601901366605, + "grad_norm": 1.3928566774257036, + "learning_rate": 1.5775458902796982e-05, + "loss": 0.3182, + "step": 4101 + }, + { + "epoch": 0.32497524262230143, + "grad_norm": 1.4425146167237544, + "learning_rate": 1.577336391157497e-05, + "loss": 0.3976, + "step": 4102 + }, + { + "epoch": 0.3250544662309368, + "grad_norm": 1.6250876670082313, + "learning_rate": 1.5771268540204724e-05, + "loss": 0.3798, + "step": 4103 + }, + { + "epoch": 0.3251336898395722, + "grad_norm": 1.2782787809667149, + "learning_rate": 1.576917278882421e-05, + "loss": 0.2678, + "step": 4104 + }, + { + "epoch": 0.3252129134482076, + "grad_norm": 1.4836702415372003, + "learning_rate": 1.576707665757143e-05, + "loss": 0.3413, + "step": 4105 + }, + { + "epoch": 0.32529213705684296, + "grad_norm": 1.5669319128376393, + "learning_rate": 1.5764980146584402e-05, + "loss": 0.3167, + "step": 4106 + }, + { + "epoch": 0.32537136066547834, + "grad_norm": 1.5963128604568757, + "learning_rate": 1.5762883256001168e-05, + "loss": 0.3122, + "step": 4107 + }, + { + "epoch": 0.32545058427411366, + "grad_norm": 1.6443793513002702, + "learning_rate": 1.57607859859598e-05, + "loss": 0.3464, + "step": 4108 + }, + { + "epoch": 0.32552980788274904, + "grad_norm": 1.6174343628805843, + "learning_rate": 1.5758688336598397e-05, + "loss": 0.345, + "step": 4109 + }, + { + "epoch": 0.3256090314913844, + "grad_norm": 1.7371858495014527, + "learning_rate": 1.5756590308055075e-05, + "loss": 0.3202, + "step": 4110 + }, + { + "epoch": 0.3256882551000198, + "grad_norm": 1.7184145267005262, + "learning_rate": 1.5754491900467982e-05, + "loss": 0.414, + "step": 4111 + }, + { + "epoch": 0.3257674787086552, + "grad_norm": 1.8180579342518757, + "learning_rate": 1.5752393113975282e-05, + "loss": 0.4003, + "step": 4112 + }, + { + "epoch": 0.32584670231729057, + "grad_norm": 1.6564691017210889, + "learning_rate": 1.5750293948715178e-05, + "loss": 0.2914, + "step": 4113 + }, + { + "epoch": 0.32592592592592595, + "grad_norm": 1.553898807504377, + "learning_rate": 1.5748194404825885e-05, + "loss": 0.4081, + "step": 4114 + }, + { + "epoch": 0.3260051495345613, + "grad_norm": 1.529549987459568, + "learning_rate": 1.574609448244565e-05, + "loss": 0.3162, + "step": 4115 + }, + { + "epoch": 0.32608437314319666, + "grad_norm": 1.4274879881115572, + "learning_rate": 1.574399418171274e-05, + "loss": 0.3161, + "step": 4116 + }, + { + "epoch": 0.32616359675183204, + "grad_norm": 2.261364998445019, + "learning_rate": 1.5741893502765452e-05, + "loss": 0.3985, + "step": 4117 + }, + { + "epoch": 0.3262428203604674, + "grad_norm": 1.4180349264104986, + "learning_rate": 1.5739792445742103e-05, + "loss": 0.2931, + "step": 4118 + }, + { + "epoch": 0.3263220439691028, + "grad_norm": 1.7227585494610789, + "learning_rate": 1.573769101078104e-05, + "loss": 0.3643, + "step": 4119 + }, + { + "epoch": 0.3264012675777382, + "grad_norm": 1.6862048978441324, + "learning_rate": 1.573558919802064e-05, + "loss": 0.3093, + "step": 4120 + }, + { + "epoch": 0.32648049118637357, + "grad_norm": 1.4288303772723825, + "learning_rate": 1.573348700759928e-05, + "loss": 0.334, + "step": 4121 + }, + { + "epoch": 0.3265597147950089, + "grad_norm": 2.0285719530841613, + "learning_rate": 1.573138443965539e-05, + "loss": 0.3629, + "step": 4122 + }, + { + "epoch": 0.3266389384036443, + "grad_norm": 1.6836914910003749, + "learning_rate": 1.572928149432741e-05, + "loss": 0.4494, + "step": 4123 + }, + { + "epoch": 0.32671816201227966, + "grad_norm": 1.3770018885568436, + "learning_rate": 1.5727178171753817e-05, + "loss": 0.3225, + "step": 4124 + }, + { + "epoch": 0.32679738562091504, + "grad_norm": 1.401373513311934, + "learning_rate": 1.57250744720731e-05, + "loss": 0.2494, + "step": 4125 + }, + { + "epoch": 0.3268766092295504, + "grad_norm": 1.882570135082064, + "learning_rate": 1.572297039542377e-05, + "loss": 0.4127, + "step": 4126 + }, + { + "epoch": 0.3269558328381858, + "grad_norm": 1.7825013364947273, + "learning_rate": 1.572086594194438e-05, + "loss": 0.3768, + "step": 4127 + }, + { + "epoch": 0.3270350564468211, + "grad_norm": 1.5325148967272597, + "learning_rate": 1.571876111177349e-05, + "loss": 0.3614, + "step": 4128 + }, + { + "epoch": 0.3271142800554565, + "grad_norm": 1.2353497548099759, + "learning_rate": 1.571665590504971e-05, + "loss": 0.2602, + "step": 4129 + }, + { + "epoch": 0.3271935036640919, + "grad_norm": 1.615171667142959, + "learning_rate": 1.5714550321911636e-05, + "loss": 0.3544, + "step": 4130 + }, + { + "epoch": 0.32727272727272727, + "grad_norm": 1.5249794703643902, + "learning_rate": 1.5712444362497917e-05, + "loss": 0.2479, + "step": 4131 + }, + { + "epoch": 0.32735195088136265, + "grad_norm": 1.5709063522900069, + "learning_rate": 1.5710338026947227e-05, + "loss": 0.4325, + "step": 4132 + }, + { + "epoch": 0.32743117448999803, + "grad_norm": 1.5583515025518357, + "learning_rate": 1.5708231315398255e-05, + "loss": 0.3221, + "step": 4133 + }, + { + "epoch": 0.3275103980986334, + "grad_norm": 1.7328213074876821, + "learning_rate": 1.570612422798972e-05, + "loss": 0.3227, + "step": 4134 + }, + { + "epoch": 0.32758962170726874, + "grad_norm": 1.5401999941443987, + "learning_rate": 1.5704016764860358e-05, + "loss": 0.3203, + "step": 4135 + }, + { + "epoch": 0.3276688453159041, + "grad_norm": 1.2254292537735298, + "learning_rate": 1.5701908926148933e-05, + "loss": 0.242, + "step": 4136 + }, + { + "epoch": 0.3277480689245395, + "grad_norm": 1.6097644217399403, + "learning_rate": 1.5699800711994247e-05, + "loss": 0.333, + "step": 4137 + }, + { + "epoch": 0.3278272925331749, + "grad_norm": 1.4990357654879476, + "learning_rate": 1.569769212253511e-05, + "loss": 0.3431, + "step": 4138 + }, + { + "epoch": 0.32790651614181027, + "grad_norm": 1.3240800771279073, + "learning_rate": 1.569558315791036e-05, + "loss": 0.3622, + "step": 4139 + }, + { + "epoch": 0.32798573975044565, + "grad_norm": 1.4019169636326965, + "learning_rate": 1.5693473818258866e-05, + "loss": 0.3479, + "step": 4140 + }, + { + "epoch": 0.32806496335908103, + "grad_norm": 1.7774105201084667, + "learning_rate": 1.5691364103719515e-05, + "loss": 0.3919, + "step": 4141 + }, + { + "epoch": 0.32814418696771636, + "grad_norm": 1.7163373701070068, + "learning_rate": 1.5689254014431225e-05, + "loss": 0.4127, + "step": 4142 + }, + { + "epoch": 0.32822341057635174, + "grad_norm": 1.4924339893483967, + "learning_rate": 1.5687143550532932e-05, + "loss": 0.3459, + "step": 4143 + }, + { + "epoch": 0.3283026341849871, + "grad_norm": 1.6899935775801103, + "learning_rate": 1.56850327121636e-05, + "loss": 0.281, + "step": 4144 + }, + { + "epoch": 0.3283818577936225, + "grad_norm": 1.6642851490573314, + "learning_rate": 1.568292149946222e-05, + "loss": 0.3914, + "step": 4145 + }, + { + "epoch": 0.3284610814022579, + "grad_norm": 1.6254158194715103, + "learning_rate": 1.56808099125678e-05, + "loss": 0.2676, + "step": 4146 + }, + { + "epoch": 0.32854030501089326, + "grad_norm": 1.6626302004344038, + "learning_rate": 1.5678697951619386e-05, + "loss": 0.3349, + "step": 4147 + }, + { + "epoch": 0.32861952861952864, + "grad_norm": 1.8090398065346878, + "learning_rate": 1.5676585616756037e-05, + "loss": 0.4564, + "step": 4148 + }, + { + "epoch": 0.32869875222816397, + "grad_norm": 1.4981315587877324, + "learning_rate": 1.5674472908116834e-05, + "loss": 0.3388, + "step": 4149 + }, + { + "epoch": 0.32877797583679935, + "grad_norm": 1.4669306873360455, + "learning_rate": 1.5672359825840895e-05, + "loss": 0.311, + "step": 4150 + }, + { + "epoch": 0.32885719944543473, + "grad_norm": 1.4272680620902973, + "learning_rate": 1.567024637006736e-05, + "loss": 0.2956, + "step": 4151 + }, + { + "epoch": 0.3289364230540701, + "grad_norm": 1.7122151858693686, + "learning_rate": 1.566813254093538e-05, + "loss": 0.3896, + "step": 4152 + }, + { + "epoch": 0.3290156466627055, + "grad_norm": 1.4796597676585008, + "learning_rate": 1.566601833858415e-05, + "loss": 0.3327, + "step": 4153 + }, + { + "epoch": 0.3290948702713409, + "grad_norm": 1.4905342032042195, + "learning_rate": 1.566390376315287e-05, + "loss": 0.3081, + "step": 4154 + }, + { + "epoch": 0.32917409387997626, + "grad_norm": 1.4919596154341799, + "learning_rate": 1.5661788814780782e-05, + "loss": 0.3682, + "step": 4155 + }, + { + "epoch": 0.3292533174886116, + "grad_norm": 1.2962051191888033, + "learning_rate": 1.5659673493607144e-05, + "loss": 0.2799, + "step": 4156 + }, + { + "epoch": 0.32933254109724697, + "grad_norm": 1.7381654459870541, + "learning_rate": 1.565755779977124e-05, + "loss": 0.3794, + "step": 4157 + }, + { + "epoch": 0.32941176470588235, + "grad_norm": 1.3163164837949308, + "learning_rate": 1.5655441733412376e-05, + "loss": 0.2999, + "step": 4158 + }, + { + "epoch": 0.32949098831451773, + "grad_norm": 1.2673184430786948, + "learning_rate": 1.5653325294669884e-05, + "loss": 0.2392, + "step": 4159 + }, + { + "epoch": 0.3295702119231531, + "grad_norm": 1.6551438060361676, + "learning_rate": 1.565120848368313e-05, + "loss": 0.2603, + "step": 4160 + }, + { + "epoch": 0.3296494355317885, + "grad_norm": 1.5798922580777348, + "learning_rate": 1.5649091300591482e-05, + "loss": 0.2619, + "step": 4161 + }, + { + "epoch": 0.3297286591404238, + "grad_norm": 1.2103007555802698, + "learning_rate": 1.564697374553436e-05, + "loss": 0.2288, + "step": 4162 + }, + { + "epoch": 0.3298078827490592, + "grad_norm": 1.6451624840053714, + "learning_rate": 1.5644855818651184e-05, + "loss": 0.3791, + "step": 4163 + }, + { + "epoch": 0.3298871063576946, + "grad_norm": 1.9896398890381832, + "learning_rate": 1.564273752008141e-05, + "loss": 0.5585, + "step": 4164 + }, + { + "epoch": 0.32996632996632996, + "grad_norm": 1.3460781880914419, + "learning_rate": 1.5640618849964528e-05, + "loss": 0.2559, + "step": 4165 + }, + { + "epoch": 0.33004555357496534, + "grad_norm": 1.7591293010318814, + "learning_rate": 1.5638499808440036e-05, + "loss": 0.3463, + "step": 4166 + }, + { + "epoch": 0.3301247771836007, + "grad_norm": 1.4119593458458475, + "learning_rate": 1.563638039564746e-05, + "loss": 0.2451, + "step": 4167 + }, + { + "epoch": 0.3302040007922361, + "grad_norm": 1.452667841868237, + "learning_rate": 1.5634260611726355e-05, + "loss": 0.2668, + "step": 4168 + }, + { + "epoch": 0.33028322440087143, + "grad_norm": 1.6212523425673047, + "learning_rate": 1.5632140456816302e-05, + "loss": 0.3893, + "step": 4169 + }, + { + "epoch": 0.3303624480095068, + "grad_norm": 1.4613501771283923, + "learning_rate": 1.5630019931056894e-05, + "loss": 0.4171, + "step": 4170 + }, + { + "epoch": 0.3304416716181422, + "grad_norm": 1.4347976063611856, + "learning_rate": 1.5627899034587768e-05, + "loss": 0.2875, + "step": 4171 + }, + { + "epoch": 0.3305208952267776, + "grad_norm": 1.7536311018935635, + "learning_rate": 1.562577776754857e-05, + "loss": 0.3626, + "step": 4172 + }, + { + "epoch": 0.33060011883541296, + "grad_norm": 1.527150246117706, + "learning_rate": 1.5623656130078976e-05, + "loss": 0.3404, + "step": 4173 + }, + { + "epoch": 0.33067934244404834, + "grad_norm": 1.6467553163637536, + "learning_rate": 1.5621534122318682e-05, + "loss": 0.4612, + "step": 4174 + }, + { + "epoch": 0.3307585660526837, + "grad_norm": 1.7998619202822257, + "learning_rate": 1.5619411744407416e-05, + "loss": 0.3646, + "step": 4175 + }, + { + "epoch": 0.33083778966131905, + "grad_norm": 1.7394536808594858, + "learning_rate": 1.561728899648493e-05, + "loss": 0.4653, + "step": 4176 + }, + { + "epoch": 0.33091701326995443, + "grad_norm": 1.5112658734549012, + "learning_rate": 1.561516587869099e-05, + "loss": 0.2866, + "step": 4177 + }, + { + "epoch": 0.3309962368785898, + "grad_norm": 2.0090412743310573, + "learning_rate": 1.5613042391165395e-05, + "loss": 0.5817, + "step": 4178 + }, + { + "epoch": 0.3310754604872252, + "grad_norm": 1.5983017011813054, + "learning_rate": 1.5610918534047964e-05, + "loss": 0.3532, + "step": 4179 + }, + { + "epoch": 0.3311546840958606, + "grad_norm": 1.9844196291748513, + "learning_rate": 1.5608794307478546e-05, + "loss": 0.3935, + "step": 4180 + }, + { + "epoch": 0.33123390770449596, + "grad_norm": 1.6575759981422147, + "learning_rate": 1.5606669711597017e-05, + "loss": 0.3351, + "step": 4181 + }, + { + "epoch": 0.33131313131313134, + "grad_norm": 1.5268078413425163, + "learning_rate": 1.560454474654326e-05, + "loss": 0.3276, + "step": 4182 + }, + { + "epoch": 0.33139235492176666, + "grad_norm": 1.5697133967894934, + "learning_rate": 1.56024194124572e-05, + "loss": 0.3133, + "step": 4183 + }, + { + "epoch": 0.33147157853040204, + "grad_norm": 1.859752771998104, + "learning_rate": 1.5600293709478776e-05, + "loss": 0.3101, + "step": 4184 + }, + { + "epoch": 0.3315508021390374, + "grad_norm": 1.7134807583348657, + "learning_rate": 1.559816763774796e-05, + "loss": 0.3019, + "step": 4185 + }, + { + "epoch": 0.3316300257476728, + "grad_norm": 1.1808077793661966, + "learning_rate": 1.559604119740474e-05, + "loss": 0.1679, + "step": 4186 + }, + { + "epoch": 0.3317092493563082, + "grad_norm": 1.7001383949072224, + "learning_rate": 1.5593914388589136e-05, + "loss": 0.3528, + "step": 4187 + }, + { + "epoch": 0.33178847296494357, + "grad_norm": 1.6909479202040436, + "learning_rate": 1.559178721144119e-05, + "loss": 0.3719, + "step": 4188 + }, + { + "epoch": 0.33186769657357895, + "grad_norm": 2.028373876921339, + "learning_rate": 1.5589659666100952e-05, + "loss": 0.3597, + "step": 4189 + }, + { + "epoch": 0.3319469201822143, + "grad_norm": 1.3368820284378782, + "learning_rate": 1.5587531752708528e-05, + "loss": 0.2873, + "step": 4190 + }, + { + "epoch": 0.33202614379084966, + "grad_norm": 1.6234417288229521, + "learning_rate": 1.558540347140402e-05, + "loss": 0.3933, + "step": 4191 + }, + { + "epoch": 0.33210536739948504, + "grad_norm": 1.287891724882461, + "learning_rate": 1.558327482232757e-05, + "loss": 0.2745, + "step": 4192 + }, + { + "epoch": 0.3321845910081204, + "grad_norm": 1.3739640387928436, + "learning_rate": 1.558114580561934e-05, + "loss": 0.3256, + "step": 4193 + }, + { + "epoch": 0.3322638146167558, + "grad_norm": 1.450618259859973, + "learning_rate": 1.557901642141951e-05, + "loss": 0.3188, + "step": 4194 + }, + { + "epoch": 0.3323430382253912, + "grad_norm": 1.6499599765805935, + "learning_rate": 1.5576886669868297e-05, + "loss": 0.3621, + "step": 4195 + }, + { + "epoch": 0.33242226183402657, + "grad_norm": 1.6677216955280363, + "learning_rate": 1.5574756551105926e-05, + "loss": 0.3915, + "step": 4196 + }, + { + "epoch": 0.3325014854426619, + "grad_norm": 1.5018442221321981, + "learning_rate": 1.5572626065272666e-05, + "loss": 0.2439, + "step": 4197 + }, + { + "epoch": 0.3325807090512973, + "grad_norm": 1.3107643462273924, + "learning_rate": 1.557049521250879e-05, + "loss": 0.2991, + "step": 4198 + }, + { + "epoch": 0.33265993265993266, + "grad_norm": 1.7764472928849313, + "learning_rate": 1.5568363992954607e-05, + "loss": 0.413, + "step": 4199 + }, + { + "epoch": 0.33273915626856804, + "grad_norm": 1.6423729997991683, + "learning_rate": 1.556623240675045e-05, + "loss": 0.2733, + "step": 4200 + }, + { + "epoch": 0.3328183798772034, + "grad_norm": 1.3458073229408152, + "learning_rate": 1.556410045403667e-05, + "loss": 0.3462, + "step": 4201 + }, + { + "epoch": 0.3328976034858388, + "grad_norm": 1.6789851897838282, + "learning_rate": 1.556196813495365e-05, + "loss": 0.3986, + "step": 4202 + }, + { + "epoch": 0.3329768270944741, + "grad_norm": 1.4348840388865662, + "learning_rate": 1.555983544964179e-05, + "loss": 0.3226, + "step": 4203 + }, + { + "epoch": 0.3330560507031095, + "grad_norm": 1.3347616946655005, + "learning_rate": 1.555770239824152e-05, + "loss": 0.2601, + "step": 4204 + }, + { + "epoch": 0.3331352743117449, + "grad_norm": 1.7984596767267915, + "learning_rate": 1.5555568980893284e-05, + "loss": 0.3886, + "step": 4205 + }, + { + "epoch": 0.33321449792038027, + "grad_norm": 1.66669208163045, + "learning_rate": 1.5553435197737566e-05, + "loss": 0.3713, + "step": 4206 + }, + { + "epoch": 0.33329372152901565, + "grad_norm": 1.6769922685466212, + "learning_rate": 1.5551301048914863e-05, + "loss": 0.4031, + "step": 4207 + }, + { + "epoch": 0.33337294513765103, + "grad_norm": 1.978035134151725, + "learning_rate": 1.5549166534565695e-05, + "loss": 0.3462, + "step": 4208 + }, + { + "epoch": 0.3334521687462864, + "grad_norm": 1.4320454369412885, + "learning_rate": 1.554703165483061e-05, + "loss": 0.3039, + "step": 4209 + }, + { + "epoch": 0.33353139235492174, + "grad_norm": 1.3254232489993087, + "learning_rate": 1.5544896409850183e-05, + "loss": 0.2935, + "step": 4210 + }, + { + "epoch": 0.3336106159635571, + "grad_norm": 1.729154708646171, + "learning_rate": 1.554276079976501e-05, + "loss": 0.3526, + "step": 4211 + }, + { + "epoch": 0.3336898395721925, + "grad_norm": 1.6735937491248896, + "learning_rate": 1.5540624824715703e-05, + "loss": 0.3103, + "step": 4212 + }, + { + "epoch": 0.3337690631808279, + "grad_norm": 1.3218157014036311, + "learning_rate": 1.5538488484842914e-05, + "loss": 0.2603, + "step": 4213 + }, + { + "epoch": 0.33384828678946327, + "grad_norm": 1.6083059408034546, + "learning_rate": 1.553635178028731e-05, + "loss": 0.3101, + "step": 4214 + }, + { + "epoch": 0.33392751039809865, + "grad_norm": 1.7063443063241774, + "learning_rate": 1.5534214711189574e-05, + "loss": 0.3679, + "step": 4215 + }, + { + "epoch": 0.33400673400673403, + "grad_norm": 1.8270500236637786, + "learning_rate": 1.5532077277690435e-05, + "loss": 0.3964, + "step": 4216 + }, + { + "epoch": 0.33408595761536936, + "grad_norm": 1.44024601717267, + "learning_rate": 1.552993947993062e-05, + "loss": 0.3317, + "step": 4217 + }, + { + "epoch": 0.33416518122400474, + "grad_norm": 1.4523795416061784, + "learning_rate": 1.5527801318050904e-05, + "loss": 0.276, + "step": 4218 + }, + { + "epoch": 0.3342444048326401, + "grad_norm": 1.3648882674836615, + "learning_rate": 1.5525662792192066e-05, + "loss": 0.2354, + "step": 4219 + }, + { + "epoch": 0.3343236284412755, + "grad_norm": 1.4510706605866899, + "learning_rate": 1.5523523902494927e-05, + "loss": 0.3797, + "step": 4220 + }, + { + "epoch": 0.3344028520499109, + "grad_norm": 1.5174268932484072, + "learning_rate": 1.552138464910031e-05, + "loss": 0.3118, + "step": 4221 + }, + { + "epoch": 0.33448207565854626, + "grad_norm": 1.6627671348472728, + "learning_rate": 1.5519245032149083e-05, + "loss": 0.4178, + "step": 4222 + }, + { + "epoch": 0.33456129926718164, + "grad_norm": 1.5801064944691237, + "learning_rate": 1.5517105051782127e-05, + "loss": 0.2069, + "step": 4223 + }, + { + "epoch": 0.33464052287581697, + "grad_norm": 1.530721118780897, + "learning_rate": 1.551496470814035e-05, + "loss": 0.3517, + "step": 4224 + }, + { + "epoch": 0.33471974648445235, + "grad_norm": 1.3414293890139244, + "learning_rate": 1.5512824001364686e-05, + "loss": 0.2681, + "step": 4225 + }, + { + "epoch": 0.33479897009308773, + "grad_norm": 1.8468493679474305, + "learning_rate": 1.5510682931596083e-05, + "loss": 0.3363, + "step": 4226 + }, + { + "epoch": 0.3348781937017231, + "grad_norm": 1.7365712626099319, + "learning_rate": 1.550854149897553e-05, + "loss": 0.4225, + "step": 4227 + }, + { + "epoch": 0.3349574173103585, + "grad_norm": 1.4519458616213432, + "learning_rate": 1.5506399703644017e-05, + "loss": 0.326, + "step": 4228 + }, + { + "epoch": 0.3350366409189939, + "grad_norm": 1.6297718465849158, + "learning_rate": 1.5504257545742585e-05, + "loss": 0.4093, + "step": 4229 + }, + { + "epoch": 0.33511586452762926, + "grad_norm": 1.5059403987915492, + "learning_rate": 1.5502115025412275e-05, + "loss": 0.3955, + "step": 4230 + }, + { + "epoch": 0.3351950881362646, + "grad_norm": 1.816439505863056, + "learning_rate": 1.5499972142794167e-05, + "loss": 0.3876, + "step": 4231 + }, + { + "epoch": 0.33527431174489997, + "grad_norm": 1.2961933036118685, + "learning_rate": 1.5497828898029358e-05, + "loss": 0.2704, + "step": 4232 + }, + { + "epoch": 0.33535353535353535, + "grad_norm": 1.4132729334249627, + "learning_rate": 1.5495685291258967e-05, + "loss": 0.3239, + "step": 4233 + }, + { + "epoch": 0.33543275896217073, + "grad_norm": 1.3782417523010273, + "learning_rate": 1.5493541322624145e-05, + "loss": 0.2855, + "step": 4234 + }, + { + "epoch": 0.3355119825708061, + "grad_norm": 1.4803242882284275, + "learning_rate": 1.5491396992266065e-05, + "loss": 0.3304, + "step": 4235 + }, + { + "epoch": 0.3355912061794415, + "grad_norm": 1.4570404690494962, + "learning_rate": 1.548925230032591e-05, + "loss": 0.392, + "step": 4236 + }, + { + "epoch": 0.3356704297880769, + "grad_norm": 1.5458789724182995, + "learning_rate": 1.5487107246944902e-05, + "loss": 0.3548, + "step": 4237 + }, + { + "epoch": 0.3357496533967122, + "grad_norm": 1.6131480344783478, + "learning_rate": 1.548496183226429e-05, + "loss": 0.4586, + "step": 4238 + }, + { + "epoch": 0.3358288770053476, + "grad_norm": 1.4893714858939047, + "learning_rate": 1.548281605642533e-05, + "loss": 0.3258, + "step": 4239 + }, + { + "epoch": 0.33590810061398296, + "grad_norm": 1.6938789435238866, + "learning_rate": 1.5480669919569313e-05, + "loss": 0.2813, + "step": 4240 + }, + { + "epoch": 0.33598732422261834, + "grad_norm": 1.6769872447502734, + "learning_rate": 1.5478523421837553e-05, + "loss": 0.338, + "step": 4241 + }, + { + "epoch": 0.3360665478312537, + "grad_norm": 1.6003984862783687, + "learning_rate": 1.5476376563371392e-05, + "loss": 0.2741, + "step": 4242 + }, + { + "epoch": 0.3361457714398891, + "grad_norm": 1.2689194951385718, + "learning_rate": 1.547422934431218e-05, + "loss": 0.2666, + "step": 4243 + }, + { + "epoch": 0.33622499504852443, + "grad_norm": 1.7351447988751467, + "learning_rate": 1.5472081764801307e-05, + "loss": 0.4686, + "step": 4244 + }, + { + "epoch": 0.3363042186571598, + "grad_norm": 1.18197016411766, + "learning_rate": 1.546993382498018e-05, + "loss": 0.2585, + "step": 4245 + }, + { + "epoch": 0.3363834422657952, + "grad_norm": 1.6057280458348795, + "learning_rate": 1.546778552499023e-05, + "loss": 0.3443, + "step": 4246 + }, + { + "epoch": 0.3364626658744306, + "grad_norm": 1.6002838812438793, + "learning_rate": 1.5465636864972914e-05, + "loss": 0.3427, + "step": 4247 + }, + { + "epoch": 0.33654188948306596, + "grad_norm": 1.9739040391493348, + "learning_rate": 1.5463487845069708e-05, + "loss": 0.462, + "step": 4248 + }, + { + "epoch": 0.33662111309170134, + "grad_norm": 1.473318208191478, + "learning_rate": 1.546133846542212e-05, + "loss": 0.4105, + "step": 4249 + }, + { + "epoch": 0.3367003367003367, + "grad_norm": 1.3699943693317282, + "learning_rate": 1.5459188726171666e-05, + "loss": 0.2352, + "step": 4250 + }, + { + "epoch": 0.33677956030897205, + "grad_norm": 1.5106420361915884, + "learning_rate": 1.5457038627459905e-05, + "loss": 0.3859, + "step": 4251 + }, + { + "epoch": 0.33685878391760743, + "grad_norm": 1.2315617954454943, + "learning_rate": 1.545488816942841e-05, + "loss": 0.2315, + "step": 4252 + }, + { + "epoch": 0.3369380075262428, + "grad_norm": 1.753153443022401, + "learning_rate": 1.5452737352218773e-05, + "loss": 0.408, + "step": 4253 + }, + { + "epoch": 0.3370172311348782, + "grad_norm": 1.712294398397039, + "learning_rate": 1.545058617597262e-05, + "loss": 0.4053, + "step": 4254 + }, + { + "epoch": 0.3370964547435136, + "grad_norm": 1.528931331202446, + "learning_rate": 1.544843464083159e-05, + "loss": 0.3351, + "step": 4255 + }, + { + "epoch": 0.33717567835214896, + "grad_norm": 1.5315737332191974, + "learning_rate": 1.544628274693736e-05, + "loss": 0.3406, + "step": 4256 + }, + { + "epoch": 0.33725490196078434, + "grad_norm": 1.4872515032479605, + "learning_rate": 1.5444130494431612e-05, + "loss": 0.2454, + "step": 4257 + }, + { + "epoch": 0.33733412556941966, + "grad_norm": 1.41015186609565, + "learning_rate": 1.544197788345607e-05, + "loss": 0.3242, + "step": 4258 + }, + { + "epoch": 0.33741334917805504, + "grad_norm": 1.6792958175936996, + "learning_rate": 1.543982491415247e-05, + "loss": 0.4017, + "step": 4259 + }, + { + "epoch": 0.3374925727866904, + "grad_norm": 1.4778059162223025, + "learning_rate": 1.5437671586662575e-05, + "loss": 0.3251, + "step": 4260 + }, + { + "epoch": 0.3375717963953258, + "grad_norm": 1.5340082557646046, + "learning_rate": 1.543551790112817e-05, + "loss": 0.3924, + "step": 4261 + }, + { + "epoch": 0.3376510200039612, + "grad_norm": 1.0817474947175367, + "learning_rate": 1.5433363857691067e-05, + "loss": 0.2129, + "step": 4262 + }, + { + "epoch": 0.33773024361259657, + "grad_norm": 1.485660562195963, + "learning_rate": 1.5431209456493093e-05, + "loss": 0.3111, + "step": 4263 + }, + { + "epoch": 0.33780946722123195, + "grad_norm": 1.5150243558972971, + "learning_rate": 1.542905469767611e-05, + "loss": 0.3381, + "step": 4264 + }, + { + "epoch": 0.3378886908298673, + "grad_norm": 1.3946392536404986, + "learning_rate": 1.5426899581382e-05, + "loss": 0.2383, + "step": 4265 + }, + { + "epoch": 0.33796791443850266, + "grad_norm": 1.7879841887093437, + "learning_rate": 1.5424744107752666e-05, + "loss": 0.3504, + "step": 4266 + }, + { + "epoch": 0.33804713804713804, + "grad_norm": 1.6185442471378342, + "learning_rate": 1.542258827693003e-05, + "loss": 0.3185, + "step": 4267 + }, + { + "epoch": 0.3381263616557734, + "grad_norm": 1.2925602464150183, + "learning_rate": 1.542043208905605e-05, + "loss": 0.2116, + "step": 4268 + }, + { + "epoch": 0.3382055852644088, + "grad_norm": 1.4679548988166635, + "learning_rate": 1.5418275544272702e-05, + "loss": 0.2978, + "step": 4269 + }, + { + "epoch": 0.3382848088730442, + "grad_norm": 1.8014859356018105, + "learning_rate": 1.541611864272198e-05, + "loss": 0.3802, + "step": 4270 + }, + { + "epoch": 0.33836403248167957, + "grad_norm": 1.5208028695603413, + "learning_rate": 1.5413961384545902e-05, + "loss": 0.3596, + "step": 4271 + }, + { + "epoch": 0.3384432560903149, + "grad_norm": 1.7531835715555473, + "learning_rate": 1.541180376988652e-05, + "loss": 0.3586, + "step": 4272 + }, + { + "epoch": 0.3385224796989503, + "grad_norm": 1.7763373404347726, + "learning_rate": 1.54096457988859e-05, + "loss": 0.4851, + "step": 4273 + }, + { + "epoch": 0.33860170330758566, + "grad_norm": 1.3927237931969634, + "learning_rate": 1.540748747168613e-05, + "loss": 0.3053, + "step": 4274 + }, + { + "epoch": 0.33868092691622104, + "grad_norm": 1.4228184773640276, + "learning_rate": 1.5405328788429333e-05, + "loss": 0.2541, + "step": 4275 + }, + { + "epoch": 0.3387601505248564, + "grad_norm": 1.791523874147248, + "learning_rate": 1.5403169749257644e-05, + "loss": 0.4753, + "step": 4276 + }, + { + "epoch": 0.3388393741334918, + "grad_norm": 1.6019892733694858, + "learning_rate": 1.5401010354313222e-05, + "loss": 0.3366, + "step": 4277 + }, + { + "epoch": 0.3389185977421272, + "grad_norm": 1.6801031722330884, + "learning_rate": 1.539885060373826e-05, + "loss": 0.4062, + "step": 4278 + }, + { + "epoch": 0.3389978213507625, + "grad_norm": 1.3315412388592751, + "learning_rate": 1.539669049767496e-05, + "loss": 0.3292, + "step": 4279 + }, + { + "epoch": 0.3390770449593979, + "grad_norm": 1.7270116239306605, + "learning_rate": 1.539453003626556e-05, + "loss": 0.3136, + "step": 4280 + }, + { + "epoch": 0.33915626856803327, + "grad_norm": 1.3809647603967554, + "learning_rate": 1.5392369219652313e-05, + "loss": 0.3639, + "step": 4281 + }, + { + "epoch": 0.33923549217666865, + "grad_norm": 1.2838504386342768, + "learning_rate": 1.53902080479775e-05, + "loss": 0.2924, + "step": 4282 + }, + { + "epoch": 0.33931471578530403, + "grad_norm": 1.6397241444664687, + "learning_rate": 1.5388046521383424e-05, + "loss": 0.364, + "step": 4283 + }, + { + "epoch": 0.3393939393939394, + "grad_norm": 1.327571724092451, + "learning_rate": 1.538588464001241e-05, + "loss": 0.2866, + "step": 4284 + }, + { + "epoch": 0.33947316300257474, + "grad_norm": 1.9759979050593923, + "learning_rate": 1.5383722404006808e-05, + "loss": 0.4839, + "step": 4285 + }, + { + "epoch": 0.3395523866112101, + "grad_norm": 1.9619536335273284, + "learning_rate": 1.5381559813508986e-05, + "loss": 0.5092, + "step": 4286 + }, + { + "epoch": 0.3396316102198455, + "grad_norm": 1.2248605518738174, + "learning_rate": 1.537939686866135e-05, + "loss": 0.2458, + "step": 4287 + }, + { + "epoch": 0.3397108338284809, + "grad_norm": 1.491407375995511, + "learning_rate": 1.5377233569606312e-05, + "loss": 0.3168, + "step": 4288 + }, + { + "epoch": 0.33979005743711627, + "grad_norm": 1.301030974814455, + "learning_rate": 1.5375069916486318e-05, + "loss": 0.3654, + "step": 4289 + }, + { + "epoch": 0.33986928104575165, + "grad_norm": 1.3992107496371073, + "learning_rate": 1.5372905909443833e-05, + "loss": 0.2918, + "step": 4290 + }, + { + "epoch": 0.33994850465438703, + "grad_norm": 1.3277011072489449, + "learning_rate": 1.5370741548621343e-05, + "loss": 0.3824, + "step": 4291 + }, + { + "epoch": 0.34002772826302236, + "grad_norm": 1.8291009541297036, + "learning_rate": 1.5368576834161372e-05, + "loss": 0.3127, + "step": 4292 + }, + { + "epoch": 0.34010695187165774, + "grad_norm": 1.5514373837331348, + "learning_rate": 1.536641176620644e-05, + "loss": 0.4226, + "step": 4293 + }, + { + "epoch": 0.3401861754802931, + "grad_norm": 1.6067565336160639, + "learning_rate": 1.536424634489912e-05, + "loss": 0.3232, + "step": 4294 + }, + { + "epoch": 0.3402653990889285, + "grad_norm": 1.787977032883429, + "learning_rate": 1.536208057038199e-05, + "loss": 0.4794, + "step": 4295 + }, + { + "epoch": 0.3403446226975639, + "grad_norm": 1.3505269470739947, + "learning_rate": 1.535991444279765e-05, + "loss": 0.2428, + "step": 4296 + }, + { + "epoch": 0.34042384630619926, + "grad_norm": 1.2430003736676773, + "learning_rate": 1.535774796228874e-05, + "loss": 0.2724, + "step": 4297 + }, + { + "epoch": 0.34050306991483464, + "grad_norm": 1.6069657055059665, + "learning_rate": 1.5355581128997904e-05, + "loss": 0.3173, + "step": 4298 + }, + { + "epoch": 0.34058229352346997, + "grad_norm": 1.7780656520457074, + "learning_rate": 1.5353413943067818e-05, + "loss": 0.3433, + "step": 4299 + }, + { + "epoch": 0.34066151713210535, + "grad_norm": 1.452929952719961, + "learning_rate": 1.5351246404641183e-05, + "loss": 0.209, + "step": 4300 + }, + { + "epoch": 0.34074074074074073, + "grad_norm": 1.3784527823708006, + "learning_rate": 1.5349078513860728e-05, + "loss": 0.2718, + "step": 4301 + }, + { + "epoch": 0.3408199643493761, + "grad_norm": 1.6734288257932508, + "learning_rate": 1.534691027086918e-05, + "loss": 0.4111, + "step": 4302 + }, + { + "epoch": 0.3408991879580115, + "grad_norm": 1.6486669550154163, + "learning_rate": 1.5344741675809328e-05, + "loss": 0.3261, + "step": 4303 + }, + { + "epoch": 0.3409784115666469, + "grad_norm": 1.4206449357019255, + "learning_rate": 1.534257272882395e-05, + "loss": 0.2882, + "step": 4304 + }, + { + "epoch": 0.34105763517528226, + "grad_norm": 1.3727628650653578, + "learning_rate": 1.5340403430055864e-05, + "loss": 0.2751, + "step": 4305 + }, + { + "epoch": 0.3411368587839176, + "grad_norm": 1.3140316381191146, + "learning_rate": 1.533823377964791e-05, + "loss": 0.3591, + "step": 4306 + }, + { + "epoch": 0.34121608239255297, + "grad_norm": 1.578538650184226, + "learning_rate": 1.5336063777742944e-05, + "loss": 0.2873, + "step": 4307 + }, + { + "epoch": 0.34129530600118835, + "grad_norm": 1.6610921135103072, + "learning_rate": 1.5333893424483856e-05, + "loss": 0.3532, + "step": 4308 + }, + { + "epoch": 0.34137452960982373, + "grad_norm": 1.3148128237179069, + "learning_rate": 1.5331722720013555e-05, + "loss": 0.3027, + "step": 4309 + }, + { + "epoch": 0.3414537532184591, + "grad_norm": 1.4313257866471976, + "learning_rate": 1.532955166447496e-05, + "loss": 0.3684, + "step": 4310 + }, + { + "epoch": 0.3415329768270945, + "grad_norm": 1.7925574512419842, + "learning_rate": 1.5327380258011037e-05, + "loss": 0.4109, + "step": 4311 + }, + { + "epoch": 0.3416122004357299, + "grad_norm": 1.3467899204058864, + "learning_rate": 1.5325208500764756e-05, + "loss": 0.2851, + "step": 4312 + }, + { + "epoch": 0.3416914240443652, + "grad_norm": 1.7847515151425684, + "learning_rate": 1.532303639287912e-05, + "loss": 0.3556, + "step": 4313 + }, + { + "epoch": 0.3417706476530006, + "grad_norm": 1.535328652407067, + "learning_rate": 1.532086393449715e-05, + "loss": 0.4066, + "step": 4314 + }, + { + "epoch": 0.34184987126163596, + "grad_norm": 1.4220884972750913, + "learning_rate": 1.531869112576189e-05, + "loss": 0.3079, + "step": 4315 + }, + { + "epoch": 0.34192909487027134, + "grad_norm": 1.639160206200713, + "learning_rate": 1.5316517966816414e-05, + "loss": 0.3443, + "step": 4316 + }, + { + "epoch": 0.3420083184789067, + "grad_norm": 1.3840596508342897, + "learning_rate": 1.5314344457803812e-05, + "loss": 0.265, + "step": 4317 + }, + { + "epoch": 0.3420875420875421, + "grad_norm": 1.4090757924369377, + "learning_rate": 1.5312170598867195e-05, + "loss": 0.2959, + "step": 4318 + }, + { + "epoch": 0.3421667656961775, + "grad_norm": 1.2592572989778839, + "learning_rate": 1.5309996390149708e-05, + "loss": 0.2769, + "step": 4319 + }, + { + "epoch": 0.3422459893048128, + "grad_norm": 1.7499644082596897, + "learning_rate": 1.5307821831794506e-05, + "loss": 0.4392, + "step": 4320 + }, + { + "epoch": 0.3423252129134482, + "grad_norm": 1.628349571539565, + "learning_rate": 1.5305646923944776e-05, + "loss": 0.3193, + "step": 4321 + }, + { + "epoch": 0.3424044365220836, + "grad_norm": 1.6307441971665753, + "learning_rate": 1.5303471666743727e-05, + "loss": 0.3005, + "step": 4322 + }, + { + "epoch": 0.34248366013071896, + "grad_norm": 1.28757335105128, + "learning_rate": 1.5301296060334588e-05, + "loss": 0.2759, + "step": 4323 + }, + { + "epoch": 0.34256288373935434, + "grad_norm": 1.825664837017531, + "learning_rate": 1.529912010486061e-05, + "loss": 0.3556, + "step": 4324 + }, + { + "epoch": 0.3426421073479897, + "grad_norm": 1.6498042243213094, + "learning_rate": 1.5296943800465068e-05, + "loss": 0.3651, + "step": 4325 + }, + { + "epoch": 0.34272133095662505, + "grad_norm": 1.7490629847383423, + "learning_rate": 1.529476714729127e-05, + "loss": 0.3812, + "step": 4326 + }, + { + "epoch": 0.34280055456526043, + "grad_norm": 1.5570358947811893, + "learning_rate": 1.529259014548253e-05, + "loss": 0.3066, + "step": 4327 + }, + { + "epoch": 0.3428797781738958, + "grad_norm": 1.311637351378174, + "learning_rate": 1.5290412795182193e-05, + "loss": 0.2954, + "step": 4328 + }, + { + "epoch": 0.3429590017825312, + "grad_norm": 1.6279309235631882, + "learning_rate": 1.528823509653363e-05, + "loss": 0.5063, + "step": 4329 + }, + { + "epoch": 0.3430382253911666, + "grad_norm": 1.4566379968697696, + "learning_rate": 1.5286057049680236e-05, + "loss": 0.3533, + "step": 4330 + }, + { + "epoch": 0.34311744899980196, + "grad_norm": 1.578603220100308, + "learning_rate": 1.5283878654765414e-05, + "loss": 0.2953, + "step": 4331 + }, + { + "epoch": 0.34319667260843734, + "grad_norm": 1.2967332993666802, + "learning_rate": 1.5281699911932612e-05, + "loss": 0.2228, + "step": 4332 + }, + { + "epoch": 0.34327589621707266, + "grad_norm": 1.4011110065202876, + "learning_rate": 1.527952082132528e-05, + "loss": 0.4005, + "step": 4333 + }, + { + "epoch": 0.34335511982570804, + "grad_norm": 1.491828389161139, + "learning_rate": 1.5277341383086906e-05, + "loss": 0.3721, + "step": 4334 + }, + { + "epoch": 0.3434343434343434, + "grad_norm": 1.33962654281021, + "learning_rate": 1.5275161597360996e-05, + "loss": 0.3272, + "step": 4335 + }, + { + "epoch": 0.3435135670429788, + "grad_norm": 1.982517730652105, + "learning_rate": 1.5272981464291077e-05, + "loss": 0.3913, + "step": 4336 + }, + { + "epoch": 0.3435927906516142, + "grad_norm": 1.6422612412762339, + "learning_rate": 1.5270800984020705e-05, + "loss": 0.332, + "step": 4337 + }, + { + "epoch": 0.34367201426024957, + "grad_norm": 1.504029812139617, + "learning_rate": 1.5268620156693444e-05, + "loss": 0.3541, + "step": 4338 + }, + { + "epoch": 0.34375123786888495, + "grad_norm": 1.4455796329437427, + "learning_rate": 1.52664389824529e-05, + "loss": 0.3472, + "step": 4339 + }, + { + "epoch": 0.3438304614775203, + "grad_norm": 1.1758245169354014, + "learning_rate": 1.5264257461442687e-05, + "loss": 0.2044, + "step": 4340 + }, + { + "epoch": 0.34390968508615566, + "grad_norm": 1.7121470670997576, + "learning_rate": 1.526207559380645e-05, + "loss": 0.4498, + "step": 4341 + }, + { + "epoch": 0.34398890869479104, + "grad_norm": 1.765751378306275, + "learning_rate": 1.5259893379687855e-05, + "loss": 0.3202, + "step": 4342 + }, + { + "epoch": 0.3440681323034264, + "grad_norm": 1.706753772383736, + "learning_rate": 1.525771081923059e-05, + "loss": 0.4135, + "step": 4343 + }, + { + "epoch": 0.3441473559120618, + "grad_norm": 1.9877885681461496, + "learning_rate": 1.525552791257837e-05, + "loss": 0.5112, + "step": 4344 + }, + { + "epoch": 0.3442265795206972, + "grad_norm": 1.5323652363248574, + "learning_rate": 1.525334465987492e-05, + "loss": 0.2778, + "step": 4345 + }, + { + "epoch": 0.34430580312933257, + "grad_norm": 1.7597796408885797, + "learning_rate": 1.5251161061264003e-05, + "loss": 0.3523, + "step": 4346 + }, + { + "epoch": 0.3443850267379679, + "grad_norm": 1.4685050322338264, + "learning_rate": 1.5248977116889396e-05, + "loss": 0.3431, + "step": 4347 + }, + { + "epoch": 0.3444642503466033, + "grad_norm": 1.519332106378515, + "learning_rate": 1.5246792826894906e-05, + "loss": 0.409, + "step": 4348 + }, + { + "epoch": 0.34454347395523865, + "grad_norm": 1.6500545113632907, + "learning_rate": 1.5244608191424352e-05, + "loss": 0.3717, + "step": 4349 + }, + { + "epoch": 0.34462269756387404, + "grad_norm": 1.2276225784417312, + "learning_rate": 1.5242423210621584e-05, + "loss": 0.2226, + "step": 4350 + }, + { + "epoch": 0.3447019211725094, + "grad_norm": 1.6276868001027185, + "learning_rate": 1.5240237884630471e-05, + "loss": 0.4932, + "step": 4351 + }, + { + "epoch": 0.3447811447811448, + "grad_norm": 1.624805862201677, + "learning_rate": 1.5238052213594912e-05, + "loss": 0.395, + "step": 4352 + }, + { + "epoch": 0.3448603683897802, + "grad_norm": 1.4405626794068653, + "learning_rate": 1.5235866197658812e-05, + "loss": 0.3353, + "step": 4353 + }, + { + "epoch": 0.3449395919984155, + "grad_norm": 1.5529442881464233, + "learning_rate": 1.5233679836966122e-05, + "loss": 0.3625, + "step": 4354 + }, + { + "epoch": 0.3450188156070509, + "grad_norm": 1.5839005912555417, + "learning_rate": 1.5231493131660794e-05, + "loss": 0.3269, + "step": 4355 + }, + { + "epoch": 0.34509803921568627, + "grad_norm": 1.5303994692743137, + "learning_rate": 1.5229306081886818e-05, + "loss": 0.3016, + "step": 4356 + }, + { + "epoch": 0.34517726282432165, + "grad_norm": 1.6262291505700162, + "learning_rate": 1.5227118687788198e-05, + "loss": 0.2979, + "step": 4357 + }, + { + "epoch": 0.34525648643295703, + "grad_norm": 1.3914514338299404, + "learning_rate": 1.5224930949508964e-05, + "loss": 0.2786, + "step": 4358 + }, + { + "epoch": 0.3453357100415924, + "grad_norm": 1.3962291682052854, + "learning_rate": 1.5222742867193167e-05, + "loss": 0.2577, + "step": 4359 + }, + { + "epoch": 0.3454149336502278, + "grad_norm": 1.5197331743671314, + "learning_rate": 1.5220554440984882e-05, + "loss": 0.3455, + "step": 4360 + }, + { + "epoch": 0.3454941572588631, + "grad_norm": 1.5608143396327747, + "learning_rate": 1.5218365671028207e-05, + "loss": 0.3136, + "step": 4361 + }, + { + "epoch": 0.3455733808674985, + "grad_norm": 1.8207944893834378, + "learning_rate": 1.5216176557467265e-05, + "loss": 0.4917, + "step": 4362 + }, + { + "epoch": 0.3456526044761339, + "grad_norm": 1.798454423625633, + "learning_rate": 1.521398710044619e-05, + "loss": 0.333, + "step": 4363 + }, + { + "epoch": 0.34573182808476927, + "grad_norm": 1.396815439910078, + "learning_rate": 1.5211797300109154e-05, + "loss": 0.2323, + "step": 4364 + }, + { + "epoch": 0.34581105169340465, + "grad_norm": 1.7094250651362186, + "learning_rate": 1.5209607156600346e-05, + "loss": 0.3047, + "step": 4365 + }, + { + "epoch": 0.34589027530204003, + "grad_norm": 1.7605442349632685, + "learning_rate": 1.520741667006397e-05, + "loss": 0.348, + "step": 4366 + }, + { + "epoch": 0.34596949891067535, + "grad_norm": 1.3742067234633975, + "learning_rate": 1.5205225840644264e-05, + "loss": 0.274, + "step": 4367 + }, + { + "epoch": 0.34604872251931074, + "grad_norm": 1.2534730114084343, + "learning_rate": 1.5203034668485486e-05, + "loss": 0.2806, + "step": 4368 + }, + { + "epoch": 0.3461279461279461, + "grad_norm": 1.643414874391239, + "learning_rate": 1.5200843153731905e-05, + "loss": 0.3499, + "step": 4369 + }, + { + "epoch": 0.3462071697365815, + "grad_norm": 1.698514264158814, + "learning_rate": 1.519865129652783e-05, + "loss": 0.2825, + "step": 4370 + }, + { + "epoch": 0.3462863933452169, + "grad_norm": 1.5027770164358123, + "learning_rate": 1.5196459097017582e-05, + "loss": 0.3035, + "step": 4371 + }, + { + "epoch": 0.34636561695385226, + "grad_norm": 1.6487790488678877, + "learning_rate": 1.5194266555345505e-05, + "loss": 0.3143, + "step": 4372 + }, + { + "epoch": 0.34644484056248764, + "grad_norm": 1.8461492062324416, + "learning_rate": 1.5192073671655969e-05, + "loss": 0.4029, + "step": 4373 + }, + { + "epoch": 0.34652406417112297, + "grad_norm": 1.4250018414205798, + "learning_rate": 1.5189880446093366e-05, + "loss": 0.3339, + "step": 4374 + }, + { + "epoch": 0.34660328777975835, + "grad_norm": 1.5471037998190196, + "learning_rate": 1.5187686878802108e-05, + "loss": 0.2736, + "step": 4375 + }, + { + "epoch": 0.34668251138839373, + "grad_norm": 1.7239491407254735, + "learning_rate": 1.5185492969926627e-05, + "loss": 0.4099, + "step": 4376 + }, + { + "epoch": 0.3467617349970291, + "grad_norm": 1.4172153097053735, + "learning_rate": 1.5183298719611388e-05, + "loss": 0.4184, + "step": 4377 + }, + { + "epoch": 0.3468409586056645, + "grad_norm": 1.4244286831445356, + "learning_rate": 1.5181104128000868e-05, + "loss": 0.2639, + "step": 4378 + }, + { + "epoch": 0.3469201822142999, + "grad_norm": 1.4903485767289089, + "learning_rate": 1.517890919523957e-05, + "loss": 0.3343, + "step": 4379 + }, + { + "epoch": 0.34699940582293526, + "grad_norm": 1.5706996701714635, + "learning_rate": 1.517671392147202e-05, + "loss": 0.3917, + "step": 4380 + }, + { + "epoch": 0.3470786294315706, + "grad_norm": 2.148426472123439, + "learning_rate": 1.517451830684277e-05, + "loss": 0.4647, + "step": 4381 + }, + { + "epoch": 0.34715785304020597, + "grad_norm": 1.6660329561456046, + "learning_rate": 1.5172322351496385e-05, + "loss": 0.3693, + "step": 4382 + }, + { + "epoch": 0.34723707664884135, + "grad_norm": 1.6528446971522985, + "learning_rate": 1.517012605557746e-05, + "loss": 0.3442, + "step": 4383 + }, + { + "epoch": 0.34731630025747673, + "grad_norm": 1.4524525817125662, + "learning_rate": 1.5167929419230616e-05, + "loss": 0.3673, + "step": 4384 + }, + { + "epoch": 0.3473955238661121, + "grad_norm": 1.4646704157508048, + "learning_rate": 1.516573244260048e-05, + "loss": 0.3315, + "step": 4385 + }, + { + "epoch": 0.3474747474747475, + "grad_norm": 1.4932676472668638, + "learning_rate": 1.5163535125831724e-05, + "loss": 0.3442, + "step": 4386 + }, + { + "epoch": 0.3475539710833829, + "grad_norm": 1.5519892449063468, + "learning_rate": 1.5161337469069024e-05, + "loss": 0.3868, + "step": 4387 + }, + { + "epoch": 0.3476331946920182, + "grad_norm": 1.5261023445669821, + "learning_rate": 1.5159139472457086e-05, + "loss": 0.3886, + "step": 4388 + }, + { + "epoch": 0.3477124183006536, + "grad_norm": 1.3546685315882694, + "learning_rate": 1.5156941136140637e-05, + "loss": 0.3373, + "step": 4389 + }, + { + "epoch": 0.34779164190928896, + "grad_norm": 1.5646257798222758, + "learning_rate": 1.5154742460264426e-05, + "loss": 0.3123, + "step": 4390 + }, + { + "epoch": 0.34787086551792434, + "grad_norm": 1.303390509789042, + "learning_rate": 1.515254344497323e-05, + "loss": 0.2792, + "step": 4391 + }, + { + "epoch": 0.3479500891265597, + "grad_norm": 1.7601413012596392, + "learning_rate": 1.5150344090411841e-05, + "loss": 0.356, + "step": 4392 + }, + { + "epoch": 0.3480293127351951, + "grad_norm": 1.2650959078949433, + "learning_rate": 1.5148144396725072e-05, + "loss": 0.2288, + "step": 4393 + }, + { + "epoch": 0.3481085363438305, + "grad_norm": 1.4466704973799784, + "learning_rate": 1.514594436405777e-05, + "loss": 0.2492, + "step": 4394 + }, + { + "epoch": 0.3481877599524658, + "grad_norm": 2.2227170816834954, + "learning_rate": 1.5143743992554791e-05, + "loss": 0.3858, + "step": 4395 + }, + { + "epoch": 0.3482669835611012, + "grad_norm": 1.309722667859268, + "learning_rate": 1.514154328236102e-05, + "loss": 0.3023, + "step": 4396 + }, + { + "epoch": 0.3483462071697366, + "grad_norm": 1.3576272291406275, + "learning_rate": 1.5139342233621364e-05, + "loss": 0.2487, + "step": 4397 + }, + { + "epoch": 0.34842543077837196, + "grad_norm": 1.3382486838155585, + "learning_rate": 1.5137140846480752e-05, + "loss": 0.2242, + "step": 4398 + }, + { + "epoch": 0.34850465438700734, + "grad_norm": 1.4955289585838192, + "learning_rate": 1.5134939121084129e-05, + "loss": 0.3344, + "step": 4399 + }, + { + "epoch": 0.3485838779956427, + "grad_norm": 1.7993186062169424, + "learning_rate": 1.5132737057576476e-05, + "loss": 0.4859, + "step": 4400 + }, + { + "epoch": 0.34866310160427805, + "grad_norm": 1.520159946231801, + "learning_rate": 1.5130534656102783e-05, + "loss": 0.3211, + "step": 4401 + }, + { + "epoch": 0.34874232521291343, + "grad_norm": 1.444083381085495, + "learning_rate": 1.512833191680807e-05, + "loss": 0.3783, + "step": 4402 + }, + { + "epoch": 0.3488215488215488, + "grad_norm": 1.4996070574512057, + "learning_rate": 1.5126128839837378e-05, + "loss": 0.281, + "step": 4403 + }, + { + "epoch": 0.3489007724301842, + "grad_norm": 1.3668318440224068, + "learning_rate": 1.5123925425335766e-05, + "loss": 0.2849, + "step": 4404 + }, + { + "epoch": 0.3489799960388196, + "grad_norm": 1.3987864995830923, + "learning_rate": 1.5121721673448319e-05, + "loss": 0.3283, + "step": 4405 + }, + { + "epoch": 0.34905921964745495, + "grad_norm": 1.505740064476565, + "learning_rate": 1.5119517584320146e-05, + "loss": 0.3958, + "step": 4406 + }, + { + "epoch": 0.34913844325609034, + "grad_norm": 1.2862179808102558, + "learning_rate": 1.5117313158096371e-05, + "loss": 0.3162, + "step": 4407 + }, + { + "epoch": 0.34921766686472566, + "grad_norm": 1.3674620729882478, + "learning_rate": 1.511510839492215e-05, + "loss": 0.4187, + "step": 4408 + }, + { + "epoch": 0.34929689047336104, + "grad_norm": 1.198812844397246, + "learning_rate": 1.5112903294942651e-05, + "loss": 0.2396, + "step": 4409 + }, + { + "epoch": 0.3493761140819964, + "grad_norm": 1.3356613497560836, + "learning_rate": 1.5110697858303072e-05, + "loss": 0.3435, + "step": 4410 + }, + { + "epoch": 0.3494553376906318, + "grad_norm": 1.6636511396414906, + "learning_rate": 1.5108492085148632e-05, + "loss": 0.3838, + "step": 4411 + }, + { + "epoch": 0.3495345612992672, + "grad_norm": 1.6133531445790348, + "learning_rate": 1.5106285975624568e-05, + "loss": 0.433, + "step": 4412 + }, + { + "epoch": 0.34961378490790257, + "grad_norm": 1.4336497029329514, + "learning_rate": 1.5104079529876143e-05, + "loss": 0.251, + "step": 4413 + }, + { + "epoch": 0.34969300851653795, + "grad_norm": 1.3231363770107274, + "learning_rate": 1.510187274804864e-05, + "loss": 0.3091, + "step": 4414 + }, + { + "epoch": 0.3497722321251733, + "grad_norm": 1.4911702060595606, + "learning_rate": 1.5099665630287365e-05, + "loss": 0.3731, + "step": 4415 + }, + { + "epoch": 0.34985145573380866, + "grad_norm": 1.917045337477769, + "learning_rate": 1.5097458176737647e-05, + "loss": 0.3195, + "step": 4416 + }, + { + "epoch": 0.34993067934244404, + "grad_norm": 1.4294612713605452, + "learning_rate": 1.5095250387544833e-05, + "loss": 0.2989, + "step": 4417 + }, + { + "epoch": 0.3500099029510794, + "grad_norm": 1.3293686826300255, + "learning_rate": 1.5093042262854297e-05, + "loss": 0.252, + "step": 4418 + }, + { + "epoch": 0.3500891265597148, + "grad_norm": 1.5827571695390332, + "learning_rate": 1.509083380281144e-05, + "loss": 0.2892, + "step": 4419 + }, + { + "epoch": 0.3501683501683502, + "grad_norm": 1.386141660887367, + "learning_rate": 1.5088625007561668e-05, + "loss": 0.3653, + "step": 4420 + }, + { + "epoch": 0.35024757377698557, + "grad_norm": 1.8323940532231806, + "learning_rate": 1.5086415877250424e-05, + "loss": 0.405, + "step": 4421 + }, + { + "epoch": 0.3503267973856209, + "grad_norm": 1.3317656299159266, + "learning_rate": 1.5084206412023172e-05, + "loss": 0.3019, + "step": 4422 + }, + { + "epoch": 0.3504060209942563, + "grad_norm": 1.398782207111146, + "learning_rate": 1.5081996612025387e-05, + "loss": 0.3361, + "step": 4423 + }, + { + "epoch": 0.35048524460289165, + "grad_norm": 1.345468328411188, + "learning_rate": 1.5079786477402581e-05, + "loss": 0.306, + "step": 4424 + }, + { + "epoch": 0.35056446821152704, + "grad_norm": 1.5370407033705449, + "learning_rate": 1.5077576008300278e-05, + "loss": 0.4559, + "step": 4425 + }, + { + "epoch": 0.3506436918201624, + "grad_norm": 1.4600757339417962, + "learning_rate": 1.5075365204864025e-05, + "loss": 0.3518, + "step": 4426 + }, + { + "epoch": 0.3507229154287978, + "grad_norm": 1.6341718820482494, + "learning_rate": 1.5073154067239396e-05, + "loss": 0.4857, + "step": 4427 + }, + { + "epoch": 0.3508021390374332, + "grad_norm": 1.826444450110843, + "learning_rate": 1.507094259557198e-05, + "loss": 0.3229, + "step": 4428 + }, + { + "epoch": 0.3508813626460685, + "grad_norm": 1.5491382745889493, + "learning_rate": 1.5068730790007395e-05, + "loss": 0.3467, + "step": 4429 + }, + { + "epoch": 0.3509605862547039, + "grad_norm": 1.4093133322471183, + "learning_rate": 1.5066518650691277e-05, + "loss": 0.3193, + "step": 4430 + }, + { + "epoch": 0.35103980986333927, + "grad_norm": 1.65832062798544, + "learning_rate": 1.5064306177769284e-05, + "loss": 0.4091, + "step": 4431 + }, + { + "epoch": 0.35111903347197465, + "grad_norm": 1.475689115708618, + "learning_rate": 1.5062093371387097e-05, + "loss": 0.4302, + "step": 4432 + }, + { + "epoch": 0.35119825708061003, + "grad_norm": 1.3314872460149323, + "learning_rate": 1.5059880231690418e-05, + "loss": 0.2184, + "step": 4433 + }, + { + "epoch": 0.3512774806892454, + "grad_norm": 1.6020910146437128, + "learning_rate": 1.5057666758824974e-05, + "loss": 0.3545, + "step": 4434 + }, + { + "epoch": 0.3513567042978808, + "grad_norm": 1.837153136178669, + "learning_rate": 1.5055452952936512e-05, + "loss": 0.3466, + "step": 4435 + }, + { + "epoch": 0.3514359279065161, + "grad_norm": 1.6805451390680544, + "learning_rate": 1.5053238814170792e-05, + "loss": 0.3289, + "step": 4436 + }, + { + "epoch": 0.3515151515151515, + "grad_norm": 1.3378784333481688, + "learning_rate": 1.5051024342673614e-05, + "loss": 0.2403, + "step": 4437 + }, + { + "epoch": 0.3515943751237869, + "grad_norm": 2.087666893707564, + "learning_rate": 1.5048809538590789e-05, + "loss": 0.4036, + "step": 4438 + }, + { + "epoch": 0.35167359873242227, + "grad_norm": 1.7825672859552832, + "learning_rate": 1.5046594402068147e-05, + "loss": 0.3707, + "step": 4439 + }, + { + "epoch": 0.35175282234105765, + "grad_norm": 1.791977573660528, + "learning_rate": 1.5044378933251546e-05, + "loss": 0.3653, + "step": 4440 + }, + { + "epoch": 0.35183204594969303, + "grad_norm": 1.5204998704240598, + "learning_rate": 1.5042163132286867e-05, + "loss": 0.3088, + "step": 4441 + }, + { + "epoch": 0.35191126955832835, + "grad_norm": 1.6927276293706366, + "learning_rate": 1.5039946999320004e-05, + "loss": 0.3159, + "step": 4442 + }, + { + "epoch": 0.35199049316696374, + "grad_norm": 1.304804718934339, + "learning_rate": 1.5037730534496882e-05, + "loss": 0.2718, + "step": 4443 + }, + { + "epoch": 0.3520697167755991, + "grad_norm": 1.5480500253590626, + "learning_rate": 1.5035513737963445e-05, + "loss": 0.3296, + "step": 4444 + }, + { + "epoch": 0.3521489403842345, + "grad_norm": 1.4327400935705397, + "learning_rate": 1.5033296609865658e-05, + "loss": 0.2291, + "step": 4445 + }, + { + "epoch": 0.3522281639928699, + "grad_norm": 1.3544891280119487, + "learning_rate": 1.503107915034951e-05, + "loss": 0.2646, + "step": 4446 + }, + { + "epoch": 0.35230738760150526, + "grad_norm": 1.4381103031255773, + "learning_rate": 1.5028861359561005e-05, + "loss": 0.3491, + "step": 4447 + }, + { + "epoch": 0.35238661121014064, + "grad_norm": 1.2842815403805183, + "learning_rate": 1.5026643237646176e-05, + "loss": 0.2931, + "step": 4448 + }, + { + "epoch": 0.35246583481877597, + "grad_norm": 1.556770739822654, + "learning_rate": 1.5024424784751079e-05, + "loss": 0.3883, + "step": 4449 + }, + { + "epoch": 0.35254505842741135, + "grad_norm": 1.6071392135468507, + "learning_rate": 1.5022206001021784e-05, + "loss": 0.2604, + "step": 4450 + }, + { + "epoch": 0.35262428203604673, + "grad_norm": 1.6581467899133189, + "learning_rate": 1.501998688660439e-05, + "loss": 0.3594, + "step": 4451 + }, + { + "epoch": 0.3527035056446821, + "grad_norm": 1.4280202943505607, + "learning_rate": 1.5017767441645015e-05, + "loss": 0.3022, + "step": 4452 + }, + { + "epoch": 0.3527827292533175, + "grad_norm": 1.8696357359056301, + "learning_rate": 1.5015547666289798e-05, + "loss": 0.312, + "step": 4453 + }, + { + "epoch": 0.3528619528619529, + "grad_norm": 2.2254388203939968, + "learning_rate": 1.50133275606849e-05, + "loss": 0.5323, + "step": 4454 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 1.9794528530004738, + "learning_rate": 1.5011107124976505e-05, + "loss": 0.3146, + "step": 4455 + }, + { + "epoch": 0.3530204000792236, + "grad_norm": 1.5637467311851438, + "learning_rate": 1.5008886359310815e-05, + "loss": 0.3533, + "step": 4456 + }, + { + "epoch": 0.35309962368785897, + "grad_norm": 1.5582523654418898, + "learning_rate": 1.5006665263834062e-05, + "loss": 0.238, + "step": 4457 + }, + { + "epoch": 0.35317884729649435, + "grad_norm": 1.4982906644412797, + "learning_rate": 1.5004443838692492e-05, + "loss": 0.4334, + "step": 4458 + }, + { + "epoch": 0.35325807090512973, + "grad_norm": 1.2772645189554344, + "learning_rate": 1.5002222084032374e-05, + "loss": 0.269, + "step": 4459 + }, + { + "epoch": 0.3533372945137651, + "grad_norm": 1.611174493711392, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.3572, + "step": 4460 + }, + { + "epoch": 0.3534165181224005, + "grad_norm": 1.7140872029478584, + "learning_rate": 1.4997777586741689e-05, + "loss": 0.3621, + "step": 4461 + }, + { + "epoch": 0.3534957417310359, + "grad_norm": 1.8482216213669391, + "learning_rate": 1.4995554844403767e-05, + "loss": 0.4988, + "step": 4462 + }, + { + "epoch": 0.3535749653396712, + "grad_norm": 1.5782597846093604, + "learning_rate": 1.4993331773132598e-05, + "loss": 0.3621, + "step": 4463 + }, + { + "epoch": 0.3536541889483066, + "grad_norm": 1.619754703359135, + "learning_rate": 1.4991108373074557e-05, + "loss": 0.3121, + "step": 4464 + }, + { + "epoch": 0.35373341255694196, + "grad_norm": 1.251015099245574, + "learning_rate": 1.4988884644376045e-05, + "loss": 0.2942, + "step": 4465 + }, + { + "epoch": 0.35381263616557734, + "grad_norm": 1.1962443859476162, + "learning_rate": 1.4986660587183485e-05, + "loss": 0.2695, + "step": 4466 + }, + { + "epoch": 0.3538918597742127, + "grad_norm": 1.3050067250239221, + "learning_rate": 1.498443620164332e-05, + "loss": 0.3431, + "step": 4467 + }, + { + "epoch": 0.3539710833828481, + "grad_norm": 1.9169317517790057, + "learning_rate": 1.4982211487902015e-05, + "loss": 0.4503, + "step": 4468 + }, + { + "epoch": 0.3540503069914835, + "grad_norm": 1.5769739490151566, + "learning_rate": 1.4979986446106054e-05, + "loss": 0.3866, + "step": 4469 + }, + { + "epoch": 0.3541295306001188, + "grad_norm": 1.9632989273453332, + "learning_rate": 1.4977761076401949e-05, + "loss": 0.3616, + "step": 4470 + }, + { + "epoch": 0.3542087542087542, + "grad_norm": 1.4735528900290882, + "learning_rate": 1.4975535378936228e-05, + "loss": 0.3158, + "step": 4471 + }, + { + "epoch": 0.3542879778173896, + "grad_norm": 1.334976016328683, + "learning_rate": 1.4973309353855443e-05, + "loss": 0.2648, + "step": 4472 + }, + { + "epoch": 0.35436720142602496, + "grad_norm": 1.46997527587073, + "learning_rate": 1.497108300130617e-05, + "loss": 0.3375, + "step": 4473 + }, + { + "epoch": 0.35444642503466034, + "grad_norm": 1.6751360026779696, + "learning_rate": 1.4968856321434997e-05, + "loss": 0.4085, + "step": 4474 + }, + { + "epoch": 0.3545256486432957, + "grad_norm": 1.3147699829778916, + "learning_rate": 1.4966629314388548e-05, + "loss": 0.1788, + "step": 4475 + }, + { + "epoch": 0.3546048722519311, + "grad_norm": 1.7380815407654258, + "learning_rate": 1.4964401980313452e-05, + "loss": 0.4178, + "step": 4476 + }, + { + "epoch": 0.35468409586056643, + "grad_norm": 1.856948594190862, + "learning_rate": 1.4962174319356372e-05, + "loss": 0.4212, + "step": 4477 + }, + { + "epoch": 0.3547633194692018, + "grad_norm": 1.6701692103438248, + "learning_rate": 1.4959946331663995e-05, + "loss": 0.3934, + "step": 4478 + }, + { + "epoch": 0.3548425430778372, + "grad_norm": 1.715807350019634, + "learning_rate": 1.4957718017383013e-05, + "loss": 0.3259, + "step": 4479 + }, + { + "epoch": 0.3549217666864726, + "grad_norm": 1.6353226514714108, + "learning_rate": 1.4955489376660157e-05, + "loss": 0.3653, + "step": 4480 + }, + { + "epoch": 0.35500099029510795, + "grad_norm": 1.5986840524259371, + "learning_rate": 1.4953260409642172e-05, + "loss": 0.4304, + "step": 4481 + }, + { + "epoch": 0.35508021390374334, + "grad_norm": 1.4980375062464257, + "learning_rate": 1.4951031116475819e-05, + "loss": 0.2783, + "step": 4482 + }, + { + "epoch": 0.35515943751237866, + "grad_norm": 1.3735723890656084, + "learning_rate": 1.4948801497307893e-05, + "loss": 0.2921, + "step": 4483 + }, + { + "epoch": 0.35523866112101404, + "grad_norm": 1.476441144812472, + "learning_rate": 1.4946571552285196e-05, + "loss": 0.3835, + "step": 4484 + }, + { + "epoch": 0.3553178847296494, + "grad_norm": 1.3532321731038421, + "learning_rate": 1.4944341281554566e-05, + "loss": 0.3163, + "step": 4485 + }, + { + "epoch": 0.3553971083382848, + "grad_norm": 1.8412669825428365, + "learning_rate": 1.4942110685262854e-05, + "loss": 0.391, + "step": 4486 + }, + { + "epoch": 0.3554763319469202, + "grad_norm": 1.6406533550846139, + "learning_rate": 1.493987976355693e-05, + "loss": 0.4115, + "step": 4487 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 1.3490218542487145, + "learning_rate": 1.4937648516583696e-05, + "loss": 0.2903, + "step": 4488 + }, + { + "epoch": 0.35563477916419095, + "grad_norm": 1.7250046190266342, + "learning_rate": 1.4935416944490066e-05, + "loss": 0.4626, + "step": 4489 + }, + { + "epoch": 0.3557140027728263, + "grad_norm": 1.484861652232995, + "learning_rate": 1.4933185047422976e-05, + "loss": 0.3283, + "step": 4490 + }, + { + "epoch": 0.35579322638146166, + "grad_norm": 1.8375718812275208, + "learning_rate": 1.493095282552939e-05, + "loss": 0.4697, + "step": 4491 + }, + { + "epoch": 0.35587244999009704, + "grad_norm": 1.6458998248467758, + "learning_rate": 1.4928720278956284e-05, + "loss": 0.3502, + "step": 4492 + }, + { + "epoch": 0.3559516735987324, + "grad_norm": 1.4786427530122839, + "learning_rate": 1.4926487407850667e-05, + "loss": 0.3209, + "step": 4493 + }, + { + "epoch": 0.3560308972073678, + "grad_norm": 1.345736220657658, + "learning_rate": 1.4924254212359557e-05, + "loss": 0.3029, + "step": 4494 + }, + { + "epoch": 0.3561101208160032, + "grad_norm": 1.510996617616948, + "learning_rate": 1.492202069263e-05, + "loss": 0.2476, + "step": 4495 + }, + { + "epoch": 0.35618934442463857, + "grad_norm": 1.3485013445136644, + "learning_rate": 1.4919786848809061e-05, + "loss": 0.3037, + "step": 4496 + }, + { + "epoch": 0.3562685680332739, + "grad_norm": 1.3957058135768283, + "learning_rate": 1.4917552681043837e-05, + "loss": 0.277, + "step": 4497 + }, + { + "epoch": 0.3563477916419093, + "grad_norm": 1.6312078445070226, + "learning_rate": 1.4915318189481425e-05, + "loss": 0.4123, + "step": 4498 + }, + { + "epoch": 0.35642701525054465, + "grad_norm": 1.7028201265352148, + "learning_rate": 1.4913083374268965e-05, + "loss": 0.3283, + "step": 4499 + }, + { + "epoch": 0.35650623885918004, + "grad_norm": 1.5076321211463664, + "learning_rate": 1.4910848235553604e-05, + "loss": 0.366, + "step": 4500 + }, + { + "epoch": 0.3565854624678154, + "grad_norm": 1.288441048246071, + "learning_rate": 1.4908612773482514e-05, + "loss": 0.2596, + "step": 4501 + }, + { + "epoch": 0.3566646860764508, + "grad_norm": 1.3854643136389788, + "learning_rate": 1.4906376988202893e-05, + "loss": 0.304, + "step": 4502 + }, + { + "epoch": 0.3567439096850862, + "grad_norm": 1.5390867295985706, + "learning_rate": 1.4904140879861957e-05, + "loss": 0.3079, + "step": 4503 + }, + { + "epoch": 0.3568231332937215, + "grad_norm": 1.3772107961348354, + "learning_rate": 1.490190444860694e-05, + "loss": 0.247, + "step": 4504 + }, + { + "epoch": 0.3569023569023569, + "grad_norm": 1.4760415313792743, + "learning_rate": 1.48996676945851e-05, + "loss": 0.2896, + "step": 4505 + }, + { + "epoch": 0.35698158051099227, + "grad_norm": 1.7818994425986037, + "learning_rate": 1.4897430617943718e-05, + "loss": 0.4124, + "step": 4506 + }, + { + "epoch": 0.35706080411962765, + "grad_norm": 1.3538385468125493, + "learning_rate": 1.4895193218830098e-05, + "loss": 0.1953, + "step": 4507 + }, + { + "epoch": 0.35714002772826303, + "grad_norm": 1.5564099795895754, + "learning_rate": 1.4892955497391556e-05, + "loss": 0.3041, + "step": 4508 + }, + { + "epoch": 0.3572192513368984, + "grad_norm": 1.6640628194064107, + "learning_rate": 1.4890717453775438e-05, + "loss": 0.3135, + "step": 4509 + }, + { + "epoch": 0.3572984749455338, + "grad_norm": 1.2438496743608995, + "learning_rate": 1.488847908812911e-05, + "loss": 0.196, + "step": 4510 + }, + { + "epoch": 0.3573776985541691, + "grad_norm": 1.4194316506577334, + "learning_rate": 1.4886240400599954e-05, + "loss": 0.388, + "step": 4511 + }, + { + "epoch": 0.3574569221628045, + "grad_norm": 1.4845696393316856, + "learning_rate": 1.488400139133538e-05, + "loss": 0.2699, + "step": 4512 + }, + { + "epoch": 0.3575361457714399, + "grad_norm": 1.5306197110435527, + "learning_rate": 1.4881762060482814e-05, + "loss": 0.37, + "step": 4513 + }, + { + "epoch": 0.35761536938007527, + "grad_norm": 1.6089342845179366, + "learning_rate": 1.4879522408189706e-05, + "loss": 0.2954, + "step": 4514 + }, + { + "epoch": 0.35769459298871065, + "grad_norm": 1.4665943204645229, + "learning_rate": 1.4877282434603527e-05, + "loss": 0.268, + "step": 4515 + }, + { + "epoch": 0.35777381659734603, + "grad_norm": 1.6858722654032046, + "learning_rate": 1.4875042139871768e-05, + "loss": 0.373, + "step": 4516 + }, + { + "epoch": 0.3578530402059814, + "grad_norm": 1.687337844143792, + "learning_rate": 1.487280152414194e-05, + "loss": 0.3711, + "step": 4517 + }, + { + "epoch": 0.35793226381461674, + "grad_norm": 1.4986581913333943, + "learning_rate": 1.4870560587561578e-05, + "loss": 0.2921, + "step": 4518 + }, + { + "epoch": 0.3580114874232521, + "grad_norm": 1.8314682438077046, + "learning_rate": 1.4868319330278236e-05, + "loss": 0.4039, + "step": 4519 + }, + { + "epoch": 0.3580907110318875, + "grad_norm": 1.284968879816148, + "learning_rate": 1.4866077752439495e-05, + "loss": 0.2224, + "step": 4520 + }, + { + "epoch": 0.3581699346405229, + "grad_norm": 1.6388947973357595, + "learning_rate": 1.4863835854192945e-05, + "loss": 0.3199, + "step": 4521 + }, + { + "epoch": 0.35824915824915826, + "grad_norm": 1.3627534689367056, + "learning_rate": 1.4861593635686207e-05, + "loss": 0.3353, + "step": 4522 + }, + { + "epoch": 0.35832838185779364, + "grad_norm": 1.2997421795965136, + "learning_rate": 1.485935109706692e-05, + "loss": 0.333, + "step": 4523 + }, + { + "epoch": 0.35840760546642897, + "grad_norm": 1.7399908236843789, + "learning_rate": 1.4857108238482747e-05, + "loss": 0.3396, + "step": 4524 + }, + { + "epoch": 0.35848682907506435, + "grad_norm": 1.6751163458280312, + "learning_rate": 1.4854865060081367e-05, + "loss": 0.4047, + "step": 4525 + }, + { + "epoch": 0.35856605268369973, + "grad_norm": 1.6910081328032733, + "learning_rate": 1.4852621562010484e-05, + "loss": 0.3618, + "step": 4526 + }, + { + "epoch": 0.3586452762923351, + "grad_norm": 1.452417946872166, + "learning_rate": 1.4850377744417816e-05, + "loss": 0.3222, + "step": 4527 + }, + { + "epoch": 0.3587244999009705, + "grad_norm": 1.465134751993534, + "learning_rate": 1.4848133607451116e-05, + "loss": 0.3361, + "step": 4528 + }, + { + "epoch": 0.3588037235096059, + "grad_norm": 1.2937664713246881, + "learning_rate": 1.4845889151258144e-05, + "loss": 0.3048, + "step": 4529 + }, + { + "epoch": 0.35888294711824126, + "grad_norm": 1.4103158941759597, + "learning_rate": 1.484364437598669e-05, + "loss": 0.3097, + "step": 4530 + }, + { + "epoch": 0.3589621707268766, + "grad_norm": 1.6533614097152505, + "learning_rate": 1.4841399281784558e-05, + "loss": 0.3116, + "step": 4531 + }, + { + "epoch": 0.35904139433551197, + "grad_norm": 1.5836413070257982, + "learning_rate": 1.4839153868799583e-05, + "loss": 0.3021, + "step": 4532 + }, + { + "epoch": 0.35912061794414735, + "grad_norm": 1.5399833806009788, + "learning_rate": 1.4836908137179607e-05, + "loss": 0.3208, + "step": 4533 + }, + { + "epoch": 0.35919984155278273, + "grad_norm": 1.655999273451648, + "learning_rate": 1.4834662087072502e-05, + "loss": 0.2872, + "step": 4534 + }, + { + "epoch": 0.3592790651614181, + "grad_norm": 2.7594771713068877, + "learning_rate": 1.4832415718626166e-05, + "loss": 0.2787, + "step": 4535 + }, + { + "epoch": 0.3593582887700535, + "grad_norm": 1.6256369045786114, + "learning_rate": 1.4830169031988502e-05, + "loss": 0.3501, + "step": 4536 + }, + { + "epoch": 0.3594375123786889, + "grad_norm": 1.5312947221998576, + "learning_rate": 1.482792202730745e-05, + "loss": 0.3442, + "step": 4537 + }, + { + "epoch": 0.3595167359873242, + "grad_norm": 1.5979025065222114, + "learning_rate": 1.4825674704730966e-05, + "loss": 0.34, + "step": 4538 + }, + { + "epoch": 0.3595959595959596, + "grad_norm": 1.7289384151629619, + "learning_rate": 1.4823427064407018e-05, + "loss": 0.2654, + "step": 4539 + }, + { + "epoch": 0.35967518320459496, + "grad_norm": 1.5554422933856926, + "learning_rate": 1.4821179106483609e-05, + "loss": 0.2729, + "step": 4540 + }, + { + "epoch": 0.35975440681323034, + "grad_norm": 1.5807387121266765, + "learning_rate": 1.4818930831108755e-05, + "loss": 0.2966, + "step": 4541 + }, + { + "epoch": 0.3598336304218657, + "grad_norm": 1.8050893381280435, + "learning_rate": 1.481668223843049e-05, + "loss": 0.4225, + "step": 4542 + }, + { + "epoch": 0.3599128540305011, + "grad_norm": 1.6303951038746831, + "learning_rate": 1.481443332859688e-05, + "loss": 0.3015, + "step": 4543 + }, + { + "epoch": 0.3599920776391365, + "grad_norm": 1.6421873815220196, + "learning_rate": 1.4812184101755997e-05, + "loss": 0.4425, + "step": 4544 + }, + { + "epoch": 0.3600713012477718, + "grad_norm": 2.4365109990585463, + "learning_rate": 1.480993455805595e-05, + "loss": 0.4427, + "step": 4545 + }, + { + "epoch": 0.3601505248564072, + "grad_norm": 1.510087439192847, + "learning_rate": 1.480768469764485e-05, + "loss": 0.4234, + "step": 4546 + }, + { + "epoch": 0.3602297484650426, + "grad_norm": 1.7261180209256146, + "learning_rate": 1.480543452067085e-05, + "loss": 0.4369, + "step": 4547 + }, + { + "epoch": 0.36030897207367796, + "grad_norm": 1.4108114173008615, + "learning_rate": 1.480318402728211e-05, + "loss": 0.2853, + "step": 4548 + }, + { + "epoch": 0.36038819568231334, + "grad_norm": 1.868035887298413, + "learning_rate": 1.480093321762681e-05, + "loss": 0.4193, + "step": 4549 + }, + { + "epoch": 0.3604674192909487, + "grad_norm": 1.436531796439108, + "learning_rate": 1.4798682091853161e-05, + "loss": 0.2232, + "step": 4550 + }, + { + "epoch": 0.3605466428995841, + "grad_norm": 1.7665740118174211, + "learning_rate": 1.4796430650109383e-05, + "loss": 0.4041, + "step": 4551 + }, + { + "epoch": 0.36062586650821943, + "grad_norm": 1.3592856372708426, + "learning_rate": 1.4794178892543727e-05, + "loss": 0.3048, + "step": 4552 + }, + { + "epoch": 0.3607050901168548, + "grad_norm": 1.5242254138421896, + "learning_rate": 1.4791926819304462e-05, + "loss": 0.3198, + "step": 4553 + }, + { + "epoch": 0.3607843137254902, + "grad_norm": 1.5391450520791332, + "learning_rate": 1.4789674430539868e-05, + "loss": 0.3438, + "step": 4554 + }, + { + "epoch": 0.3608635373341256, + "grad_norm": 1.5332789880882494, + "learning_rate": 1.4787421726398263e-05, + "loss": 0.3789, + "step": 4555 + }, + { + "epoch": 0.36094276094276095, + "grad_norm": 1.7049827322244504, + "learning_rate": 1.4785168707027972e-05, + "loss": 0.4588, + "step": 4556 + }, + { + "epoch": 0.36102198455139634, + "grad_norm": 1.597298101899049, + "learning_rate": 1.4782915372577347e-05, + "loss": 0.3143, + "step": 4557 + }, + { + "epoch": 0.3611012081600317, + "grad_norm": 1.5429539572518831, + "learning_rate": 1.4780661723194757e-05, + "loss": 0.3254, + "step": 4558 + }, + { + "epoch": 0.36118043176866704, + "grad_norm": 1.608424375766527, + "learning_rate": 1.4778407759028599e-05, + "loss": 0.3012, + "step": 4559 + }, + { + "epoch": 0.3612596553773024, + "grad_norm": 1.8023334335875967, + "learning_rate": 1.4776153480227278e-05, + "loss": 0.3551, + "step": 4560 + }, + { + "epoch": 0.3613388789859378, + "grad_norm": 1.5202885322557214, + "learning_rate": 1.4773898886939235e-05, + "loss": 0.3201, + "step": 4561 + }, + { + "epoch": 0.3614181025945732, + "grad_norm": 1.6416522809960745, + "learning_rate": 1.4771643979312917e-05, + "loss": 0.3522, + "step": 4562 + }, + { + "epoch": 0.36149732620320857, + "grad_norm": 1.3573962324661244, + "learning_rate": 1.4769388757496806e-05, + "loss": 0.2134, + "step": 4563 + }, + { + "epoch": 0.36157654981184395, + "grad_norm": 1.6109318342005996, + "learning_rate": 1.4767133221639394e-05, + "loss": 0.382, + "step": 4564 + }, + { + "epoch": 0.3616557734204793, + "grad_norm": 1.8541905848657845, + "learning_rate": 1.4764877371889194e-05, + "loss": 0.3619, + "step": 4565 + }, + { + "epoch": 0.36173499702911466, + "grad_norm": 1.3441381077715713, + "learning_rate": 1.476262120839475e-05, + "loss": 0.2837, + "step": 4566 + }, + { + "epoch": 0.36181422063775004, + "grad_norm": 1.5064008339755857, + "learning_rate": 1.4760364731304614e-05, + "loss": 0.3249, + "step": 4567 + }, + { + "epoch": 0.3618934442463854, + "grad_norm": 1.487066228871815, + "learning_rate": 1.4758107940767368e-05, + "loss": 0.3311, + "step": 4568 + }, + { + "epoch": 0.3619726678550208, + "grad_norm": 1.4046031145447673, + "learning_rate": 1.4755850836931607e-05, + "loss": 0.3257, + "step": 4569 + }, + { + "epoch": 0.3620518914636562, + "grad_norm": 1.4320568252244823, + "learning_rate": 1.475359341994595e-05, + "loss": 0.3715, + "step": 4570 + }, + { + "epoch": 0.36213111507229157, + "grad_norm": 1.612723330184229, + "learning_rate": 1.4751335689959044e-05, + "loss": 0.309, + "step": 4571 + }, + { + "epoch": 0.3622103386809269, + "grad_norm": 1.6610382710189933, + "learning_rate": 1.4749077647119542e-05, + "loss": 0.2608, + "step": 4572 + }, + { + "epoch": 0.3622895622895623, + "grad_norm": 1.6224368272366407, + "learning_rate": 1.474681929157613e-05, + "loss": 0.4, + "step": 4573 + }, + { + "epoch": 0.36236878589819765, + "grad_norm": 1.5121286737390047, + "learning_rate": 1.4744560623477502e-05, + "loss": 0.3288, + "step": 4574 + }, + { + "epoch": 0.36244800950683304, + "grad_norm": 1.5583457044684619, + "learning_rate": 1.4742301642972392e-05, + "loss": 0.3732, + "step": 4575 + }, + { + "epoch": 0.3625272331154684, + "grad_norm": 1.8003479962902789, + "learning_rate": 1.4740042350209536e-05, + "loss": 0.3285, + "step": 4576 + }, + { + "epoch": 0.3626064567241038, + "grad_norm": 1.830572240757174, + "learning_rate": 1.4737782745337696e-05, + "loss": 0.4015, + "step": 4577 + }, + { + "epoch": 0.3626856803327392, + "grad_norm": 1.7514932272298862, + "learning_rate": 1.4735522828505663e-05, + "loss": 0.3968, + "step": 4578 + }, + { + "epoch": 0.3627649039413745, + "grad_norm": 1.5725393508885004, + "learning_rate": 1.4733262599862234e-05, + "loss": 0.3644, + "step": 4579 + }, + { + "epoch": 0.3628441275500099, + "grad_norm": 1.3220174749720226, + "learning_rate": 1.4731002059556242e-05, + "loss": 0.2512, + "step": 4580 + }, + { + "epoch": 0.36292335115864527, + "grad_norm": 1.4223548280156784, + "learning_rate": 1.4728741207736525e-05, + "loss": 0.4286, + "step": 4581 + }, + { + "epoch": 0.36300257476728065, + "grad_norm": 1.3931753407842224, + "learning_rate": 1.4726480044551953e-05, + "loss": 0.2845, + "step": 4582 + }, + { + "epoch": 0.36308179837591603, + "grad_norm": 1.706869850751769, + "learning_rate": 1.4724218570151415e-05, + "loss": 0.3968, + "step": 4583 + }, + { + "epoch": 0.3631610219845514, + "grad_norm": 1.6241919060888654, + "learning_rate": 1.4721956784683813e-05, + "loss": 0.4633, + "step": 4584 + }, + { + "epoch": 0.3632402455931868, + "grad_norm": 1.7289697936270059, + "learning_rate": 1.4719694688298078e-05, + "loss": 0.3716, + "step": 4585 + }, + { + "epoch": 0.3633194692018221, + "grad_norm": 1.3912620664242132, + "learning_rate": 1.4717432281143161e-05, + "loss": 0.2715, + "step": 4586 + }, + { + "epoch": 0.3633986928104575, + "grad_norm": 1.5715488781666351, + "learning_rate": 1.4715169563368021e-05, + "loss": 0.3328, + "step": 4587 + }, + { + "epoch": 0.3634779164190929, + "grad_norm": 1.204138937434893, + "learning_rate": 1.4712906535121658e-05, + "loss": 0.2294, + "step": 4588 + }, + { + "epoch": 0.36355714002772826, + "grad_norm": 1.6339354137993924, + "learning_rate": 1.4710643196553074e-05, + "loss": 0.4242, + "step": 4589 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 1.451752472936718, + "learning_rate": 1.4708379547811302e-05, + "loss": 0.2134, + "step": 4590 + }, + { + "epoch": 0.36371558724499903, + "grad_norm": 1.755564857595638, + "learning_rate": 1.4706115589045396e-05, + "loss": 0.3582, + "step": 4591 + }, + { + "epoch": 0.3637948108536344, + "grad_norm": 1.9643347303696277, + "learning_rate": 1.4703851320404416e-05, + "loss": 0.4131, + "step": 4592 + }, + { + "epoch": 0.36387403446226974, + "grad_norm": 1.5386793167121273, + "learning_rate": 1.4701586742037464e-05, + "loss": 0.2697, + "step": 4593 + }, + { + "epoch": 0.3639532580709051, + "grad_norm": 1.6750472898086326, + "learning_rate": 1.4699321854093649e-05, + "loss": 0.3799, + "step": 4594 + }, + { + "epoch": 0.3640324816795405, + "grad_norm": 1.5779240540787023, + "learning_rate": 1.46970566567221e-05, + "loss": 0.3196, + "step": 4595 + }, + { + "epoch": 0.3641117052881759, + "grad_norm": 1.6008195093628788, + "learning_rate": 1.469479115007197e-05, + "loss": 0.3869, + "step": 4596 + }, + { + "epoch": 0.36419092889681126, + "grad_norm": 1.8066764655593655, + "learning_rate": 1.4692525334292434e-05, + "loss": 0.237, + "step": 4597 + }, + { + "epoch": 0.36427015250544664, + "grad_norm": 1.538849625563868, + "learning_rate": 1.4690259209532682e-05, + "loss": 0.2831, + "step": 4598 + }, + { + "epoch": 0.364349376114082, + "grad_norm": 1.8674362509436677, + "learning_rate": 1.468799277594193e-05, + "loss": 0.3825, + "step": 4599 + }, + { + "epoch": 0.36442859972271735, + "grad_norm": 1.683190879844929, + "learning_rate": 1.4685726033669412e-05, + "loss": 0.4067, + "step": 4600 + }, + { + "epoch": 0.36450782333135273, + "grad_norm": 1.5257501778760814, + "learning_rate": 1.468345898286438e-05, + "loss": 0.2996, + "step": 4601 + }, + { + "epoch": 0.3645870469399881, + "grad_norm": 1.3137061305129647, + "learning_rate": 1.468119162367611e-05, + "loss": 0.2582, + "step": 4602 + }, + { + "epoch": 0.3646662705486235, + "grad_norm": 1.5493471197544215, + "learning_rate": 1.4678923956253894e-05, + "loss": 0.4196, + "step": 4603 + }, + { + "epoch": 0.3647454941572589, + "grad_norm": 1.7518071942723026, + "learning_rate": 1.4676655980747052e-05, + "loss": 0.3467, + "step": 4604 + }, + { + "epoch": 0.36482471776589426, + "grad_norm": 1.2114731631575513, + "learning_rate": 1.4674387697304914e-05, + "loss": 0.2344, + "step": 4605 + }, + { + "epoch": 0.3649039413745296, + "grad_norm": 1.5387893475895256, + "learning_rate": 1.4672119106076838e-05, + "loss": 0.4891, + "step": 4606 + }, + { + "epoch": 0.36498316498316496, + "grad_norm": 1.8681054215404638, + "learning_rate": 1.4669850207212202e-05, + "loss": 0.4069, + "step": 4607 + }, + { + "epoch": 0.36506238859180035, + "grad_norm": 1.4750912297774346, + "learning_rate": 1.4667581000860395e-05, + "loss": 0.2821, + "step": 4608 + }, + { + "epoch": 0.36514161220043573, + "grad_norm": 2.534997991900976, + "learning_rate": 1.4665311487170844e-05, + "loss": 0.6095, + "step": 4609 + }, + { + "epoch": 0.3652208358090711, + "grad_norm": 1.6675856075933126, + "learning_rate": 1.4663041666292978e-05, + "loss": 0.4356, + "step": 4610 + }, + { + "epoch": 0.3653000594177065, + "grad_norm": 1.6688530198532512, + "learning_rate": 1.4660771538376253e-05, + "loss": 0.3418, + "step": 4611 + }, + { + "epoch": 0.3653792830263419, + "grad_norm": 1.270817909880478, + "learning_rate": 1.4658501103570149e-05, + "loss": 0.2908, + "step": 4612 + }, + { + "epoch": 0.3654585066349772, + "grad_norm": 1.2741529689072915, + "learning_rate": 1.4656230362024166e-05, + "loss": 0.2625, + "step": 4613 + }, + { + "epoch": 0.3655377302436126, + "grad_norm": 1.4525899294237308, + "learning_rate": 1.4653959313887813e-05, + "loss": 0.358, + "step": 4614 + }, + { + "epoch": 0.36561695385224796, + "grad_norm": 1.3306823595862238, + "learning_rate": 1.4651687959310636e-05, + "loss": 0.2651, + "step": 4615 + }, + { + "epoch": 0.36569617746088334, + "grad_norm": 1.5112091541997872, + "learning_rate": 1.4649416298442187e-05, + "loss": 0.3741, + "step": 4616 + }, + { + "epoch": 0.3657754010695187, + "grad_norm": 1.6039927324986167, + "learning_rate": 1.4647144331432049e-05, + "loss": 0.3978, + "step": 4617 + }, + { + "epoch": 0.3658546246781541, + "grad_norm": 1.3158075578605177, + "learning_rate": 1.4644872058429816e-05, + "loss": 0.2901, + "step": 4618 + }, + { + "epoch": 0.3659338482867895, + "grad_norm": 1.5183594736862347, + "learning_rate": 1.4642599479585106e-05, + "loss": 0.3766, + "step": 4619 + }, + { + "epoch": 0.3660130718954248, + "grad_norm": 1.7834692884590755, + "learning_rate": 1.4640326595047561e-05, + "loss": 0.3997, + "step": 4620 + }, + { + "epoch": 0.3660922955040602, + "grad_norm": 1.6458240372043382, + "learning_rate": 1.4638053404966836e-05, + "loss": 0.3972, + "step": 4621 + }, + { + "epoch": 0.3661715191126956, + "grad_norm": 1.5626009523690787, + "learning_rate": 1.4635779909492614e-05, + "loss": 0.3104, + "step": 4622 + }, + { + "epoch": 0.36625074272133096, + "grad_norm": 1.605839133960405, + "learning_rate": 1.4633506108774588e-05, + "loss": 0.3325, + "step": 4623 + }, + { + "epoch": 0.36632996632996634, + "grad_norm": 1.65605612094927, + "learning_rate": 1.4631232002962481e-05, + "loss": 0.3319, + "step": 4624 + }, + { + "epoch": 0.3664091899386017, + "grad_norm": 1.194760596178059, + "learning_rate": 1.462895759220603e-05, + "loss": 0.2863, + "step": 4625 + }, + { + "epoch": 0.3664884135472371, + "grad_norm": 1.4897267524134923, + "learning_rate": 1.4626682876654998e-05, + "loss": 0.3196, + "step": 4626 + }, + { + "epoch": 0.36656763715587243, + "grad_norm": 1.6338069843979595, + "learning_rate": 1.4624407856459154e-05, + "loss": 0.4129, + "step": 4627 + }, + { + "epoch": 0.3666468607645078, + "grad_norm": 1.658035888298133, + "learning_rate": 1.4622132531768309e-05, + "loss": 0.373, + "step": 4628 + }, + { + "epoch": 0.3667260843731432, + "grad_norm": 1.4983164704356673, + "learning_rate": 1.4619856902732279e-05, + "loss": 0.3318, + "step": 4629 + }, + { + "epoch": 0.36680530798177857, + "grad_norm": 1.4205575374264885, + "learning_rate": 1.4617580969500895e-05, + "loss": 0.2856, + "step": 4630 + }, + { + "epoch": 0.36688453159041395, + "grad_norm": 1.39351778991826, + "learning_rate": 1.461530473222403e-05, + "loss": 0.2913, + "step": 4631 + }, + { + "epoch": 0.36696375519904934, + "grad_norm": 1.3460895648737004, + "learning_rate": 1.4613028191051548e-05, + "loss": 0.2886, + "step": 4632 + }, + { + "epoch": 0.3670429788076847, + "grad_norm": 1.743681810869301, + "learning_rate": 1.4610751346133361e-05, + "loss": 0.3681, + "step": 4633 + }, + { + "epoch": 0.36712220241632004, + "grad_norm": 1.5960034110784955, + "learning_rate": 1.4608474197619383e-05, + "loss": 0.3737, + "step": 4634 + }, + { + "epoch": 0.3672014260249554, + "grad_norm": 1.4543851667449015, + "learning_rate": 1.4606196745659551e-05, + "loss": 0.2573, + "step": 4635 + }, + { + "epoch": 0.3672806496335908, + "grad_norm": 1.3399534873314962, + "learning_rate": 1.460391899040383e-05, + "loss": 0.326, + "step": 4636 + }, + { + "epoch": 0.3673598732422262, + "grad_norm": 1.6446976758229053, + "learning_rate": 1.4601640932002194e-05, + "loss": 0.3493, + "step": 4637 + }, + { + "epoch": 0.36743909685086157, + "grad_norm": 1.6301308603854625, + "learning_rate": 1.4599362570604645e-05, + "loss": 0.31, + "step": 4638 + }, + { + "epoch": 0.36751832045949695, + "grad_norm": 1.3638954249354742, + "learning_rate": 1.4597083906361203e-05, + "loss": 0.4205, + "step": 4639 + }, + { + "epoch": 0.3675975440681323, + "grad_norm": 1.7531860483348178, + "learning_rate": 1.4594804939421903e-05, + "loss": 0.3755, + "step": 4640 + }, + { + "epoch": 0.36767676767676766, + "grad_norm": 1.4043519443126165, + "learning_rate": 1.4592525669936808e-05, + "loss": 0.2748, + "step": 4641 + }, + { + "epoch": 0.36775599128540304, + "grad_norm": 1.8413621118434689, + "learning_rate": 1.4590246098055995e-05, + "loss": 0.454, + "step": 4642 + }, + { + "epoch": 0.3678352148940384, + "grad_norm": 1.5235331398057443, + "learning_rate": 1.4587966223929562e-05, + "loss": 0.3505, + "step": 4643 + }, + { + "epoch": 0.3679144385026738, + "grad_norm": 1.4567379310302107, + "learning_rate": 1.458568604770763e-05, + "loss": 0.3569, + "step": 4644 + }, + { + "epoch": 0.3679936621113092, + "grad_norm": 1.3856834684257866, + "learning_rate": 1.458340556954034e-05, + "loss": 0.3214, + "step": 4645 + }, + { + "epoch": 0.36807288571994456, + "grad_norm": 1.5632748588157244, + "learning_rate": 1.4581124789577841e-05, + "loss": 0.3491, + "step": 4646 + }, + { + "epoch": 0.3681521093285799, + "grad_norm": 1.645248312939299, + "learning_rate": 1.4578843707970323e-05, + "loss": 0.341, + "step": 4647 + }, + { + "epoch": 0.36823133293721527, + "grad_norm": 1.6531855869579501, + "learning_rate": 1.4576562324867975e-05, + "loss": 0.3198, + "step": 4648 + }, + { + "epoch": 0.36831055654585065, + "grad_norm": 1.5837364881527218, + "learning_rate": 1.457428064042102e-05, + "loss": 0.3833, + "step": 4649 + }, + { + "epoch": 0.36838978015448604, + "grad_norm": 1.1834622362998237, + "learning_rate": 1.45719986547797e-05, + "loss": 0.1984, + "step": 4650 + }, + { + "epoch": 0.3684690037631214, + "grad_norm": 1.4053115244075074, + "learning_rate": 1.4569716368094262e-05, + "loss": 0.2948, + "step": 4651 + }, + { + "epoch": 0.3685482273717568, + "grad_norm": 1.5462441867586394, + "learning_rate": 1.456743378051499e-05, + "loss": 0.3294, + "step": 4652 + }, + { + "epoch": 0.3686274509803922, + "grad_norm": 1.3494062448999087, + "learning_rate": 1.456515089219218e-05, + "loss": 0.2445, + "step": 4653 + }, + { + "epoch": 0.3687066745890275, + "grad_norm": 1.9853310086543616, + "learning_rate": 1.456286770327615e-05, + "loss": 0.4928, + "step": 4654 + }, + { + "epoch": 0.3687858981976629, + "grad_norm": 1.4320607522285995, + "learning_rate": 1.456058421391724e-05, + "loss": 0.232, + "step": 4655 + }, + { + "epoch": 0.36886512180629827, + "grad_norm": 1.598467427100357, + "learning_rate": 1.45583004242658e-05, + "loss": 0.3815, + "step": 4656 + }, + { + "epoch": 0.36894434541493365, + "grad_norm": 1.6540532138961344, + "learning_rate": 1.4556016334472211e-05, + "loss": 0.3427, + "step": 4657 + }, + { + "epoch": 0.36902356902356903, + "grad_norm": 1.342372202072488, + "learning_rate": 1.455373194468687e-05, + "loss": 0.2886, + "step": 4658 + }, + { + "epoch": 0.3691027926322044, + "grad_norm": 1.5284179077281435, + "learning_rate": 1.4551447255060192e-05, + "loss": 0.3183, + "step": 4659 + }, + { + "epoch": 0.3691820162408398, + "grad_norm": 1.1655615528525876, + "learning_rate": 1.4549162265742608e-05, + "loss": 0.2578, + "step": 4660 + }, + { + "epoch": 0.3692612398494751, + "grad_norm": 1.6941544491756817, + "learning_rate": 1.4546876976884583e-05, + "loss": 0.3439, + "step": 4661 + }, + { + "epoch": 0.3693404634581105, + "grad_norm": 1.4269738452637428, + "learning_rate": 1.4544591388636584e-05, + "loss": 0.3112, + "step": 4662 + }, + { + "epoch": 0.3694196870667459, + "grad_norm": 1.103473132298058, + "learning_rate": 1.454230550114911e-05, + "loss": 0.1941, + "step": 4663 + }, + { + "epoch": 0.36949891067538126, + "grad_norm": 1.6713762821841982, + "learning_rate": 1.4540019314572678e-05, + "loss": 0.3523, + "step": 4664 + }, + { + "epoch": 0.36957813428401665, + "grad_norm": 1.3958377347130868, + "learning_rate": 1.4537732829057816e-05, + "loss": 0.3582, + "step": 4665 + }, + { + "epoch": 0.369657357892652, + "grad_norm": 1.8921948303906428, + "learning_rate": 1.4535446044755082e-05, + "loss": 0.4173, + "step": 4666 + }, + { + "epoch": 0.3697365815012874, + "grad_norm": 1.8616661308957294, + "learning_rate": 1.4533158961815048e-05, + "loss": 0.4406, + "step": 4667 + }, + { + "epoch": 0.36981580510992274, + "grad_norm": 1.217528633217943, + "learning_rate": 1.4530871580388311e-05, + "loss": 0.1821, + "step": 4668 + }, + { + "epoch": 0.3698950287185581, + "grad_norm": 1.5306705153935265, + "learning_rate": 1.4528583900625481e-05, + "loss": 0.3402, + "step": 4669 + }, + { + "epoch": 0.3699742523271935, + "grad_norm": 1.414553172656487, + "learning_rate": 1.4526295922677189e-05, + "loss": 0.4246, + "step": 4670 + }, + { + "epoch": 0.3700534759358289, + "grad_norm": 1.823181112464695, + "learning_rate": 1.4524007646694091e-05, + "loss": 0.4012, + "step": 4671 + }, + { + "epoch": 0.37013269954446426, + "grad_norm": 1.3108502235097963, + "learning_rate": 1.4521719072826858e-05, + "loss": 0.2361, + "step": 4672 + }, + { + "epoch": 0.37021192315309964, + "grad_norm": 1.1105335455527836, + "learning_rate": 1.451943020122618e-05, + "loss": 0.2221, + "step": 4673 + }, + { + "epoch": 0.370291146761735, + "grad_norm": 1.3107517950227892, + "learning_rate": 1.4517141032042773e-05, + "loss": 0.2718, + "step": 4674 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 1.3845994655311256, + "learning_rate": 1.4514851565427362e-05, + "loss": 0.3194, + "step": 4675 + }, + { + "epoch": 0.37044959397900573, + "grad_norm": 1.5857491671616526, + "learning_rate": 1.4512561801530699e-05, + "loss": 0.311, + "step": 4676 + }, + { + "epoch": 0.3705288175876411, + "grad_norm": 2.014009019609301, + "learning_rate": 1.4510271740503555e-05, + "loss": 0.3856, + "step": 4677 + }, + { + "epoch": 0.3706080411962765, + "grad_norm": 1.124160729962944, + "learning_rate": 1.4507981382496716e-05, + "loss": 0.2459, + "step": 4678 + }, + { + "epoch": 0.3706872648049119, + "grad_norm": 2.0185225727502383, + "learning_rate": 1.4505690727660997e-05, + "loss": 0.4145, + "step": 4679 + }, + { + "epoch": 0.37076648841354726, + "grad_norm": 1.5753645357511594, + "learning_rate": 1.4503399776147223e-05, + "loss": 0.3028, + "step": 4680 + }, + { + "epoch": 0.3708457120221826, + "grad_norm": 1.7773431005163407, + "learning_rate": 1.4501108528106243e-05, + "loss": 0.4036, + "step": 4681 + }, + { + "epoch": 0.37092493563081796, + "grad_norm": 1.4956789205301773, + "learning_rate": 1.4498816983688926e-05, + "loss": 0.313, + "step": 4682 + }, + { + "epoch": 0.37100415923945335, + "grad_norm": 1.5415317871335148, + "learning_rate": 1.4496525143046154e-05, + "loss": 0.3778, + "step": 4683 + }, + { + "epoch": 0.3710833828480887, + "grad_norm": 1.4161775432365453, + "learning_rate": 1.4494233006328837e-05, + "loss": 0.3218, + "step": 4684 + }, + { + "epoch": 0.3711626064567241, + "grad_norm": 1.4029137319291618, + "learning_rate": 1.4491940573687906e-05, + "loss": 0.2896, + "step": 4685 + }, + { + "epoch": 0.3712418300653595, + "grad_norm": 1.433596580212024, + "learning_rate": 1.44896478452743e-05, + "loss": 0.2989, + "step": 4686 + }, + { + "epoch": 0.37132105367399487, + "grad_norm": 1.5365685859727027, + "learning_rate": 1.4487354821238983e-05, + "loss": 0.3105, + "step": 4687 + }, + { + "epoch": 0.3714002772826302, + "grad_norm": 1.4626767706974417, + "learning_rate": 1.4485061501732949e-05, + "loss": 0.315, + "step": 4688 + }, + { + "epoch": 0.3714795008912656, + "grad_norm": 1.4747343396470216, + "learning_rate": 1.448276788690719e-05, + "loss": 0.3042, + "step": 4689 + }, + { + "epoch": 0.37155872449990096, + "grad_norm": 1.3942598195028861, + "learning_rate": 1.4480473976912737e-05, + "loss": 0.2464, + "step": 4690 + }, + { + "epoch": 0.37163794810853634, + "grad_norm": 1.5027014180904217, + "learning_rate": 1.4478179771900634e-05, + "loss": 0.3039, + "step": 4691 + }, + { + "epoch": 0.3717171717171717, + "grad_norm": 1.6378230445114412, + "learning_rate": 1.4475885272021936e-05, + "loss": 0.392, + "step": 4692 + }, + { + "epoch": 0.3717963953258071, + "grad_norm": 1.5117124897317267, + "learning_rate": 1.4473590477427735e-05, + "loss": 0.3651, + "step": 4693 + }, + { + "epoch": 0.3718756189344425, + "grad_norm": 1.555778726648578, + "learning_rate": 1.4471295388269121e-05, + "loss": 0.368, + "step": 4694 + }, + { + "epoch": 0.3719548425430778, + "grad_norm": 1.4679560826210025, + "learning_rate": 1.4469000004697224e-05, + "loss": 0.3114, + "step": 4695 + }, + { + "epoch": 0.3720340661517132, + "grad_norm": 1.534323434907577, + "learning_rate": 1.446670432686318e-05, + "loss": 0.3793, + "step": 4696 + }, + { + "epoch": 0.3721132897603486, + "grad_norm": 1.6423730313725133, + "learning_rate": 1.4464408354918145e-05, + "loss": 0.3179, + "step": 4697 + }, + { + "epoch": 0.37219251336898396, + "grad_norm": 1.2273350592842331, + "learning_rate": 1.4462112089013304e-05, + "loss": 0.2809, + "step": 4698 + }, + { + "epoch": 0.37227173697761934, + "grad_norm": 1.5561479541583605, + "learning_rate": 1.4459815529299851e-05, + "loss": 0.2943, + "step": 4699 + }, + { + "epoch": 0.3723509605862547, + "grad_norm": 1.5081278645188412, + "learning_rate": 1.4457518675929008e-05, + "loss": 0.3366, + "step": 4700 + }, + { + "epoch": 0.3724301841948901, + "grad_norm": 1.360163798910226, + "learning_rate": 1.4455221529052006e-05, + "loss": 0.2897, + "step": 4701 + }, + { + "epoch": 0.3725094078035254, + "grad_norm": 1.58515219573682, + "learning_rate": 1.4452924088820101e-05, + "loss": 0.353, + "step": 4702 + }, + { + "epoch": 0.3725886314121608, + "grad_norm": 1.4131259366145987, + "learning_rate": 1.4450626355384573e-05, + "loss": 0.3453, + "step": 4703 + }, + { + "epoch": 0.3726678550207962, + "grad_norm": 1.559073530996899, + "learning_rate": 1.4448328328896717e-05, + "loss": 0.3109, + "step": 4704 + }, + { + "epoch": 0.37274707862943157, + "grad_norm": 1.5864034495512063, + "learning_rate": 1.444603000950784e-05, + "loss": 0.2887, + "step": 4705 + }, + { + "epoch": 0.37282630223806695, + "grad_norm": 1.4532642364597348, + "learning_rate": 1.4443731397369283e-05, + "loss": 0.3134, + "step": 4706 + }, + { + "epoch": 0.37290552584670233, + "grad_norm": 1.6722820926762596, + "learning_rate": 1.4441432492632395e-05, + "loss": 0.4005, + "step": 4707 + }, + { + "epoch": 0.3729847494553377, + "grad_norm": 1.7373699492904362, + "learning_rate": 1.4439133295448547e-05, + "loss": 0.3444, + "step": 4708 + }, + { + "epoch": 0.37306397306397304, + "grad_norm": 1.2511997154092365, + "learning_rate": 1.4436833805969133e-05, + "loss": 0.2703, + "step": 4709 + }, + { + "epoch": 0.3731431966726084, + "grad_norm": 1.146360282693015, + "learning_rate": 1.4434534024345558e-05, + "loss": 0.2558, + "step": 4710 + }, + { + "epoch": 0.3732224202812438, + "grad_norm": 1.4858804313854288, + "learning_rate": 1.4432233950729257e-05, + "loss": 0.2908, + "step": 4711 + }, + { + "epoch": 0.3733016438898792, + "grad_norm": 1.4921898449790627, + "learning_rate": 1.442993358527168e-05, + "loss": 0.3013, + "step": 4712 + }, + { + "epoch": 0.37338086749851457, + "grad_norm": 1.4069100772099172, + "learning_rate": 1.4427632928124288e-05, + "loss": 0.2674, + "step": 4713 + }, + { + "epoch": 0.37346009110714995, + "grad_norm": 1.3734893985782217, + "learning_rate": 1.4425331979438573e-05, + "loss": 0.2721, + "step": 4714 + }, + { + "epoch": 0.37353931471578533, + "grad_norm": 1.1963372111462343, + "learning_rate": 1.4423030739366042e-05, + "loss": 0.2386, + "step": 4715 + }, + { + "epoch": 0.37361853832442066, + "grad_norm": 1.5444197340935044, + "learning_rate": 1.4420729208058217e-05, + "loss": 0.3274, + "step": 4716 + }, + { + "epoch": 0.37369776193305604, + "grad_norm": 1.863440753569575, + "learning_rate": 1.4418427385666647e-05, + "loss": 0.4475, + "step": 4717 + }, + { + "epoch": 0.3737769855416914, + "grad_norm": 1.5226158303161417, + "learning_rate": 1.4416125272342891e-05, + "loss": 0.3624, + "step": 4718 + }, + { + "epoch": 0.3738562091503268, + "grad_norm": 1.465057170296472, + "learning_rate": 1.4413822868238537e-05, + "loss": 0.3059, + "step": 4719 + }, + { + "epoch": 0.3739354327589622, + "grad_norm": 1.7880199929989178, + "learning_rate": 1.4411520173505184e-05, + "loss": 0.3379, + "step": 4720 + }, + { + "epoch": 0.37401465636759756, + "grad_norm": 1.5360629781643058, + "learning_rate": 1.4409217188294456e-05, + "loss": 0.2871, + "step": 4721 + }, + { + "epoch": 0.3740938799762329, + "grad_norm": 1.564431604919678, + "learning_rate": 1.440691391275799e-05, + "loss": 0.3338, + "step": 4722 + }, + { + "epoch": 0.37417310358486827, + "grad_norm": 1.588692736369417, + "learning_rate": 1.440461034704745e-05, + "loss": 0.3241, + "step": 4723 + }, + { + "epoch": 0.37425232719350365, + "grad_norm": 1.700732090206522, + "learning_rate": 1.4402306491314508e-05, + "loss": 0.4238, + "step": 4724 + }, + { + "epoch": 0.37433155080213903, + "grad_norm": 1.332147796700369, + "learning_rate": 1.4400002345710871e-05, + "loss": 0.2356, + "step": 4725 + }, + { + "epoch": 0.3744107744107744, + "grad_norm": 1.3139373866413449, + "learning_rate": 1.4397697910388248e-05, + "loss": 0.2486, + "step": 4726 + }, + { + "epoch": 0.3744899980194098, + "grad_norm": 1.6136158357733488, + "learning_rate": 1.4395393185498381e-05, + "loss": 0.3633, + "step": 4727 + }, + { + "epoch": 0.3745692216280452, + "grad_norm": 1.740817657205945, + "learning_rate": 1.4393088171193021e-05, + "loss": 0.4123, + "step": 4728 + }, + { + "epoch": 0.3746484452366805, + "grad_norm": 1.7611602299747777, + "learning_rate": 1.439078286762394e-05, + "loss": 0.3676, + "step": 4729 + }, + { + "epoch": 0.3747276688453159, + "grad_norm": 1.752622200562547, + "learning_rate": 1.4388477274942936e-05, + "loss": 0.3968, + "step": 4730 + }, + { + "epoch": 0.37480689245395127, + "grad_norm": 1.7450319718219336, + "learning_rate": 1.438617139330182e-05, + "loss": 0.3634, + "step": 4731 + }, + { + "epoch": 0.37488611606258665, + "grad_norm": 1.6150278691106186, + "learning_rate": 1.4383865222852423e-05, + "loss": 0.4263, + "step": 4732 + }, + { + "epoch": 0.37496533967122203, + "grad_norm": 1.4561309174099997, + "learning_rate": 1.4381558763746593e-05, + "loss": 0.3018, + "step": 4733 + }, + { + "epoch": 0.3750445632798574, + "grad_norm": 1.5183279123368738, + "learning_rate": 1.4379252016136203e-05, + "loss": 0.3077, + "step": 4734 + }, + { + "epoch": 0.3751237868884928, + "grad_norm": 1.693966480917254, + "learning_rate": 1.4376944980173138e-05, + "loss": 0.3354, + "step": 4735 + }, + { + "epoch": 0.3752030104971281, + "grad_norm": 1.3470739014671065, + "learning_rate": 1.4374637656009309e-05, + "loss": 0.3069, + "step": 4736 + }, + { + "epoch": 0.3752822341057635, + "grad_norm": 1.4664709557930569, + "learning_rate": 1.4372330043796636e-05, + "loss": 0.292, + "step": 4737 + }, + { + "epoch": 0.3753614577143989, + "grad_norm": 1.6288033815681167, + "learning_rate": 1.437002214368707e-05, + "loss": 0.4213, + "step": 4738 + }, + { + "epoch": 0.37544068132303426, + "grad_norm": 1.6169427484861356, + "learning_rate": 1.4367713955832575e-05, + "loss": 0.3659, + "step": 4739 + }, + { + "epoch": 0.37551990493166965, + "grad_norm": 1.4286273559504765, + "learning_rate": 1.4365405480385129e-05, + "loss": 0.2908, + "step": 4740 + }, + { + "epoch": 0.375599128540305, + "grad_norm": 1.6600317436593115, + "learning_rate": 1.4363096717496738e-05, + "loss": 0.3743, + "step": 4741 + }, + { + "epoch": 0.3756783521489404, + "grad_norm": 1.5638075988765359, + "learning_rate": 1.4360787667319423e-05, + "loss": 0.322, + "step": 4742 + }, + { + "epoch": 0.37575757575757573, + "grad_norm": 1.4730870053945573, + "learning_rate": 1.4358478330005222e-05, + "loss": 0.3008, + "step": 4743 + }, + { + "epoch": 0.3758367993662111, + "grad_norm": 1.6658587344631532, + "learning_rate": 1.4356168705706195e-05, + "loss": 0.4229, + "step": 4744 + }, + { + "epoch": 0.3759160229748465, + "grad_norm": 1.1808276796449346, + "learning_rate": 1.4353858794574418e-05, + "loss": 0.2283, + "step": 4745 + }, + { + "epoch": 0.3759952465834819, + "grad_norm": 1.4503386906435112, + "learning_rate": 1.435154859676199e-05, + "loss": 0.218, + "step": 4746 + }, + { + "epoch": 0.37607447019211726, + "grad_norm": 1.6026729841908476, + "learning_rate": 1.4349238112421025e-05, + "loss": 0.3528, + "step": 4747 + }, + { + "epoch": 0.37615369380075264, + "grad_norm": 1.5594781413395669, + "learning_rate": 1.4346927341703659e-05, + "loss": 0.2479, + "step": 4748 + }, + { + "epoch": 0.376232917409388, + "grad_norm": 1.7522496954621372, + "learning_rate": 1.4344616284762038e-05, + "loss": 0.305, + "step": 4749 + }, + { + "epoch": 0.37631214101802335, + "grad_norm": 1.5081692518287162, + "learning_rate": 1.4342304941748347e-05, + "loss": 0.3058, + "step": 4750 + }, + { + "epoch": 0.37639136462665873, + "grad_norm": 1.3274860445854602, + "learning_rate": 1.4339993312814765e-05, + "loss": 0.2843, + "step": 4751 + }, + { + "epoch": 0.3764705882352941, + "grad_norm": 1.7588223459918972, + "learning_rate": 1.4337681398113508e-05, + "loss": 0.3943, + "step": 4752 + }, + { + "epoch": 0.3765498118439295, + "grad_norm": 1.539578007692588, + "learning_rate": 1.4335369197796803e-05, + "loss": 0.3581, + "step": 4753 + }, + { + "epoch": 0.3766290354525649, + "grad_norm": 1.4499875464029557, + "learning_rate": 1.4333056712016893e-05, + "loss": 0.3362, + "step": 4754 + }, + { + "epoch": 0.37670825906120026, + "grad_norm": 1.506353803068302, + "learning_rate": 1.4330743940926052e-05, + "loss": 0.4008, + "step": 4755 + }, + { + "epoch": 0.37678748266983564, + "grad_norm": 1.5167911728871935, + "learning_rate": 1.4328430884676559e-05, + "loss": 0.4025, + "step": 4756 + }, + { + "epoch": 0.37686670627847096, + "grad_norm": 1.5081114452206938, + "learning_rate": 1.432611754342072e-05, + "loss": 0.2339, + "step": 4757 + }, + { + "epoch": 0.37694592988710635, + "grad_norm": 1.6514876173222937, + "learning_rate": 1.4323803917310857e-05, + "loss": 0.3226, + "step": 4758 + }, + { + "epoch": 0.3770251534957417, + "grad_norm": 1.3272690395731235, + "learning_rate": 1.4321490006499309e-05, + "loss": 0.3652, + "step": 4759 + }, + { + "epoch": 0.3771043771043771, + "grad_norm": 1.728946837203164, + "learning_rate": 1.4319175811138439e-05, + "loss": 0.4434, + "step": 4760 + }, + { + "epoch": 0.3771836007130125, + "grad_norm": 1.2655002648705844, + "learning_rate": 1.4316861331380624e-05, + "loss": 0.2848, + "step": 4761 + }, + { + "epoch": 0.37726282432164787, + "grad_norm": 1.5918721024132505, + "learning_rate": 1.431454656737826e-05, + "loss": 0.297, + "step": 4762 + }, + { + "epoch": 0.3773420479302832, + "grad_norm": 1.6932390692880819, + "learning_rate": 1.4312231519283768e-05, + "loss": 0.4457, + "step": 4763 + }, + { + "epoch": 0.3774212715389186, + "grad_norm": 1.55078416855098, + "learning_rate": 1.4309916187249578e-05, + "loss": 0.3203, + "step": 4764 + }, + { + "epoch": 0.37750049514755396, + "grad_norm": 1.4521369653671037, + "learning_rate": 1.4307600571428143e-05, + "loss": 0.2671, + "step": 4765 + }, + { + "epoch": 0.37757971875618934, + "grad_norm": 1.825500781525087, + "learning_rate": 1.4305284671971943e-05, + "loss": 0.3236, + "step": 4766 + }, + { + "epoch": 0.3776589423648247, + "grad_norm": 1.5303295024891377, + "learning_rate": 1.4302968489033462e-05, + "loss": 0.3466, + "step": 4767 + }, + { + "epoch": 0.3777381659734601, + "grad_norm": 1.2917772443555833, + "learning_rate": 1.4300652022765207e-05, + "loss": 0.2203, + "step": 4768 + }, + { + "epoch": 0.3778173895820955, + "grad_norm": 1.3576850745825737, + "learning_rate": 1.429833527331971e-05, + "loss": 0.2941, + "step": 4769 + }, + { + "epoch": 0.3778966131907308, + "grad_norm": 1.4598007330713265, + "learning_rate": 1.4296018240849518e-05, + "loss": 0.2954, + "step": 4770 + }, + { + "epoch": 0.3779758367993662, + "grad_norm": 1.7004278676561306, + "learning_rate": 1.4293700925507199e-05, + "loss": 0.4102, + "step": 4771 + }, + { + "epoch": 0.3780550604080016, + "grad_norm": 1.3716560256662682, + "learning_rate": 1.429138332744533e-05, + "loss": 0.2995, + "step": 4772 + }, + { + "epoch": 0.37813428401663696, + "grad_norm": 1.8978984802341372, + "learning_rate": 1.428906544681652e-05, + "loss": 0.5102, + "step": 4773 + }, + { + "epoch": 0.37821350762527234, + "grad_norm": 1.480435219751354, + "learning_rate": 1.4286747283773388e-05, + "loss": 0.3326, + "step": 4774 + }, + { + "epoch": 0.3782927312339077, + "grad_norm": 1.6170666697721474, + "learning_rate": 1.4284428838468572e-05, + "loss": 0.4041, + "step": 4775 + }, + { + "epoch": 0.3783719548425431, + "grad_norm": 1.5603998015216134, + "learning_rate": 1.4282110111054733e-05, + "loss": 0.3073, + "step": 4776 + }, + { + "epoch": 0.3784511784511784, + "grad_norm": 1.2834935868281077, + "learning_rate": 1.4279791101684547e-05, + "loss": 0.3091, + "step": 4777 + }, + { + "epoch": 0.3785304020598138, + "grad_norm": 1.664788512525511, + "learning_rate": 1.427747181051071e-05, + "loss": 0.3906, + "step": 4778 + }, + { + "epoch": 0.3786096256684492, + "grad_norm": 1.388924260721447, + "learning_rate": 1.4275152237685938e-05, + "loss": 0.2765, + "step": 4779 + }, + { + "epoch": 0.37868884927708457, + "grad_norm": 1.504079229141585, + "learning_rate": 1.4272832383362962e-05, + "loss": 0.3215, + "step": 4780 + }, + { + "epoch": 0.37876807288571995, + "grad_norm": 1.7578236816960764, + "learning_rate": 1.427051224769453e-05, + "loss": 0.293, + "step": 4781 + }, + { + "epoch": 0.37884729649435533, + "grad_norm": 1.6795768761384273, + "learning_rate": 1.4268191830833417e-05, + "loss": 0.3963, + "step": 4782 + }, + { + "epoch": 0.3789265201029907, + "grad_norm": 1.394179790188331, + "learning_rate": 1.426587113293241e-05, + "loss": 0.305, + "step": 4783 + }, + { + "epoch": 0.37900574371162604, + "grad_norm": 1.532906346205471, + "learning_rate": 1.4263550154144313e-05, + "loss": 0.3036, + "step": 4784 + }, + { + "epoch": 0.3790849673202614, + "grad_norm": 1.2060321000261924, + "learning_rate": 1.4261228894621955e-05, + "loss": 0.2612, + "step": 4785 + }, + { + "epoch": 0.3791641909288968, + "grad_norm": 1.2643894269638989, + "learning_rate": 1.4258907354518177e-05, + "loss": 0.3271, + "step": 4786 + }, + { + "epoch": 0.3792434145375322, + "grad_norm": 1.4460847572072517, + "learning_rate": 1.4256585533985842e-05, + "loss": 0.2806, + "step": 4787 + }, + { + "epoch": 0.37932263814616757, + "grad_norm": 1.433822254279183, + "learning_rate": 1.425426343317783e-05, + "loss": 0.2994, + "step": 4788 + }, + { + "epoch": 0.37940186175480295, + "grad_norm": 1.5820851739982629, + "learning_rate": 1.4251941052247044e-05, + "loss": 0.3346, + "step": 4789 + }, + { + "epoch": 0.37948108536343833, + "grad_norm": 1.2061245598050252, + "learning_rate": 1.4249618391346399e-05, + "loss": 0.2543, + "step": 4790 + }, + { + "epoch": 0.37956030897207366, + "grad_norm": 1.4766982300934812, + "learning_rate": 1.4247295450628826e-05, + "loss": 0.3901, + "step": 4791 + }, + { + "epoch": 0.37963953258070904, + "grad_norm": 1.2927766879477545, + "learning_rate": 1.4244972230247287e-05, + "loss": 0.2867, + "step": 4792 + }, + { + "epoch": 0.3797187561893444, + "grad_norm": 1.5140968593992927, + "learning_rate": 1.4242648730354756e-05, + "loss": 0.2814, + "step": 4793 + }, + { + "epoch": 0.3797979797979798, + "grad_norm": 2.0780975926321226, + "learning_rate": 1.4240324951104213e-05, + "loss": 0.3576, + "step": 4794 + }, + { + "epoch": 0.3798772034066152, + "grad_norm": 1.4711231963119753, + "learning_rate": 1.4238000892648682e-05, + "loss": 0.375, + "step": 4795 + }, + { + "epoch": 0.37995642701525056, + "grad_norm": 1.402121695072646, + "learning_rate": 1.423567655514118e-05, + "loss": 0.332, + "step": 4796 + }, + { + "epoch": 0.38003565062388595, + "grad_norm": 1.5551963104852167, + "learning_rate": 1.4233351938734758e-05, + "loss": 0.3524, + "step": 4797 + }, + { + "epoch": 0.38011487423252127, + "grad_norm": 1.7685458571080634, + "learning_rate": 1.4231027043582483e-05, + "loss": 0.3524, + "step": 4798 + }, + { + "epoch": 0.38019409784115665, + "grad_norm": 1.4910700471238236, + "learning_rate": 1.4228701869837433e-05, + "loss": 0.2447, + "step": 4799 + }, + { + "epoch": 0.38027332144979203, + "grad_norm": 1.3840322448951945, + "learning_rate": 1.4226376417652713e-05, + "loss": 0.2976, + "step": 4800 + }, + { + "epoch": 0.3803525450584274, + "grad_norm": 1.992655736226578, + "learning_rate": 1.4224050687181442e-05, + "loss": 0.514, + "step": 4801 + }, + { + "epoch": 0.3804317686670628, + "grad_norm": 1.3722806032954438, + "learning_rate": 1.4221724678576756e-05, + "loss": 0.32, + "step": 4802 + }, + { + "epoch": 0.3805109922756982, + "grad_norm": 1.5161612358815824, + "learning_rate": 1.421939839199182e-05, + "loss": 0.3037, + "step": 4803 + }, + { + "epoch": 0.3805902158843335, + "grad_norm": 1.722906661712149, + "learning_rate": 1.4217071827579796e-05, + "loss": 0.4407, + "step": 4804 + }, + { + "epoch": 0.3806694394929689, + "grad_norm": 1.4325706364116861, + "learning_rate": 1.4214744985493884e-05, + "loss": 0.3028, + "step": 4805 + }, + { + "epoch": 0.38074866310160427, + "grad_norm": 1.4871762269384778, + "learning_rate": 1.4212417865887299e-05, + "loss": 0.3223, + "step": 4806 + }, + { + "epoch": 0.38082788671023965, + "grad_norm": 1.5846116636037701, + "learning_rate": 1.4210090468913263e-05, + "loss": 0.3463, + "step": 4807 + }, + { + "epoch": 0.38090711031887503, + "grad_norm": 1.4911394889462763, + "learning_rate": 1.4207762794725026e-05, + "loss": 0.2443, + "step": 4808 + }, + { + "epoch": 0.3809863339275104, + "grad_norm": 1.71450505054649, + "learning_rate": 1.4205434843475859e-05, + "loss": 0.4016, + "step": 4809 + }, + { + "epoch": 0.3810655575361458, + "grad_norm": 1.2468708300234312, + "learning_rate": 1.420310661531904e-05, + "loss": 0.3156, + "step": 4810 + }, + { + "epoch": 0.3811447811447811, + "grad_norm": 1.4510412626556506, + "learning_rate": 1.4200778110407873e-05, + "loss": 0.3015, + "step": 4811 + }, + { + "epoch": 0.3812240047534165, + "grad_norm": 1.2815638980910853, + "learning_rate": 1.4198449328895685e-05, + "loss": 0.2661, + "step": 4812 + }, + { + "epoch": 0.3813032283620519, + "grad_norm": 1.3894997499992157, + "learning_rate": 1.4196120270935807e-05, + "loss": 0.3037, + "step": 4813 + }, + { + "epoch": 0.38138245197068726, + "grad_norm": 1.9720604861663078, + "learning_rate": 1.4193790936681602e-05, + "loss": 0.4662, + "step": 4814 + }, + { + "epoch": 0.38146167557932265, + "grad_norm": 1.3074617738028005, + "learning_rate": 1.4191461326286442e-05, + "loss": 0.2884, + "step": 4815 + }, + { + "epoch": 0.381540899187958, + "grad_norm": 1.6905048906543017, + "learning_rate": 1.4189131439903721e-05, + "loss": 0.454, + "step": 4816 + }, + { + "epoch": 0.3816201227965934, + "grad_norm": 1.5671984538408312, + "learning_rate": 1.4186801277686852e-05, + "loss": 0.3987, + "step": 4817 + }, + { + "epoch": 0.38169934640522873, + "grad_norm": 1.2480114164130005, + "learning_rate": 1.4184470839789265e-05, + "loss": 0.1935, + "step": 4818 + }, + { + "epoch": 0.3817785700138641, + "grad_norm": 1.5023859298809623, + "learning_rate": 1.4182140126364404e-05, + "loss": 0.2729, + "step": 4819 + }, + { + "epoch": 0.3818577936224995, + "grad_norm": 1.456270837781397, + "learning_rate": 1.4179809137565742e-05, + "loss": 0.333, + "step": 4820 + }, + { + "epoch": 0.3819370172311349, + "grad_norm": 1.4132678249526858, + "learning_rate": 1.417747787354676e-05, + "loss": 0.3856, + "step": 4821 + }, + { + "epoch": 0.38201624083977026, + "grad_norm": 1.8741654642532457, + "learning_rate": 1.4175146334460963e-05, + "loss": 0.3839, + "step": 4822 + }, + { + "epoch": 0.38209546444840564, + "grad_norm": 1.437968325797009, + "learning_rate": 1.4172814520461867e-05, + "loss": 0.3176, + "step": 4823 + }, + { + "epoch": 0.382174688057041, + "grad_norm": 1.5045774004637167, + "learning_rate": 1.4170482431703012e-05, + "loss": 0.3417, + "step": 4824 + }, + { + "epoch": 0.38225391166567635, + "grad_norm": 1.6932079063719387, + "learning_rate": 1.4168150068337958e-05, + "loss": 0.2693, + "step": 4825 + }, + { + "epoch": 0.38233313527431173, + "grad_norm": 1.5961538588669137, + "learning_rate": 1.4165817430520276e-05, + "loss": 0.3366, + "step": 4826 + }, + { + "epoch": 0.3824123588829471, + "grad_norm": 1.4955891414265299, + "learning_rate": 1.4163484518403561e-05, + "loss": 0.3621, + "step": 4827 + }, + { + "epoch": 0.3824915824915825, + "grad_norm": 1.8994905647227531, + "learning_rate": 1.4161151332141426e-05, + "loss": 0.339, + "step": 4828 + }, + { + "epoch": 0.3825708061002179, + "grad_norm": 1.4277600710447218, + "learning_rate": 1.4158817871887497e-05, + "loss": 0.3345, + "step": 4829 + }, + { + "epoch": 0.38265002970885326, + "grad_norm": 1.1820947784878053, + "learning_rate": 1.4156484137795424e-05, + "loss": 0.2685, + "step": 4830 + }, + { + "epoch": 0.38272925331748864, + "grad_norm": 1.464863891146817, + "learning_rate": 1.4154150130018867e-05, + "loss": 0.3885, + "step": 4831 + }, + { + "epoch": 0.38280847692612396, + "grad_norm": 1.6709840314896864, + "learning_rate": 1.4151815848711512e-05, + "loss": 0.3355, + "step": 4832 + }, + { + "epoch": 0.38288770053475935, + "grad_norm": 1.4371088335493813, + "learning_rate": 1.4149481294027063e-05, + "loss": 0.3113, + "step": 4833 + }, + { + "epoch": 0.3829669241433947, + "grad_norm": 1.4629809768142543, + "learning_rate": 1.4147146466119235e-05, + "loss": 0.3118, + "step": 4834 + }, + { + "epoch": 0.3830461477520301, + "grad_norm": 1.2887370470691075, + "learning_rate": 1.4144811365141769e-05, + "loss": 0.2397, + "step": 4835 + }, + { + "epoch": 0.3831253713606655, + "grad_norm": 1.526638232785953, + "learning_rate": 1.4142475991248417e-05, + "loss": 0.3646, + "step": 4836 + }, + { + "epoch": 0.38320459496930087, + "grad_norm": 1.8151728744650315, + "learning_rate": 1.4140140344592952e-05, + "loss": 0.4331, + "step": 4837 + }, + { + "epoch": 0.3832838185779362, + "grad_norm": 1.5869284743567247, + "learning_rate": 1.413780442532917e-05, + "loss": 0.3556, + "step": 4838 + }, + { + "epoch": 0.3833630421865716, + "grad_norm": 1.3999090768521412, + "learning_rate": 1.4135468233610872e-05, + "loss": 0.3369, + "step": 4839 + }, + { + "epoch": 0.38344226579520696, + "grad_norm": 1.534627600910808, + "learning_rate": 1.4133131769591893e-05, + "loss": 0.2688, + "step": 4840 + }, + { + "epoch": 0.38352148940384234, + "grad_norm": 1.3799826397724837, + "learning_rate": 1.4130795033426073e-05, + "loss": 0.2866, + "step": 4841 + }, + { + "epoch": 0.3836007130124777, + "grad_norm": 1.5145454712738629, + "learning_rate": 1.4128458025267276e-05, + "loss": 0.3713, + "step": 4842 + }, + { + "epoch": 0.3836799366211131, + "grad_norm": 1.4029868938789205, + "learning_rate": 1.4126120745269382e-05, + "loss": 0.3325, + "step": 4843 + }, + { + "epoch": 0.3837591602297485, + "grad_norm": 1.3859297427363313, + "learning_rate": 1.4123783193586294e-05, + "loss": 0.3033, + "step": 4844 + }, + { + "epoch": 0.3838383838383838, + "grad_norm": 1.4367793385527374, + "learning_rate": 1.4121445370371922e-05, + "loss": 0.2948, + "step": 4845 + }, + { + "epoch": 0.3839176074470192, + "grad_norm": 1.3009985454714486, + "learning_rate": 1.4119107275780203e-05, + "loss": 0.2924, + "step": 4846 + }, + { + "epoch": 0.3839968310556546, + "grad_norm": 1.3214844254389435, + "learning_rate": 1.4116768909965092e-05, + "loss": 0.2372, + "step": 4847 + }, + { + "epoch": 0.38407605466428996, + "grad_norm": 1.2892725320025957, + "learning_rate": 1.4114430273080558e-05, + "loss": 0.2241, + "step": 4848 + }, + { + "epoch": 0.38415527827292534, + "grad_norm": 1.4742226328357293, + "learning_rate": 1.4112091365280585e-05, + "loss": 0.384, + "step": 4849 + }, + { + "epoch": 0.3842345018815607, + "grad_norm": 1.5086970159345738, + "learning_rate": 1.4109752186719181e-05, + "loss": 0.3531, + "step": 4850 + }, + { + "epoch": 0.3843137254901961, + "grad_norm": 1.3932404244457717, + "learning_rate": 1.4107412737550372e-05, + "loss": 0.3144, + "step": 4851 + }, + { + "epoch": 0.3843929490988314, + "grad_norm": 1.6274932328954217, + "learning_rate": 1.4105073017928199e-05, + "loss": 0.343, + "step": 4852 + }, + { + "epoch": 0.3844721727074668, + "grad_norm": 1.5895181306760369, + "learning_rate": 1.4102733028006719e-05, + "loss": 0.3992, + "step": 4853 + }, + { + "epoch": 0.3845513963161022, + "grad_norm": 1.4167651297118389, + "learning_rate": 1.410039276794001e-05, + "loss": 0.2739, + "step": 4854 + }, + { + "epoch": 0.38463061992473757, + "grad_norm": 1.3318687998724372, + "learning_rate": 1.4098052237882168e-05, + "loss": 0.2687, + "step": 4855 + }, + { + "epoch": 0.38470984353337295, + "grad_norm": 1.4652998253881695, + "learning_rate": 1.4095711437987303e-05, + "loss": 0.3214, + "step": 4856 + }, + { + "epoch": 0.38478906714200833, + "grad_norm": 1.3185510641697167, + "learning_rate": 1.4093370368409546e-05, + "loss": 0.216, + "step": 4857 + }, + { + "epoch": 0.3848682907506437, + "grad_norm": 1.7416634357536502, + "learning_rate": 1.409102902930305e-05, + "loss": 0.2907, + "step": 4858 + }, + { + "epoch": 0.38494751435927904, + "grad_norm": 1.6139151540670138, + "learning_rate": 1.4088687420821974e-05, + "loss": 0.3849, + "step": 4859 + }, + { + "epoch": 0.3850267379679144, + "grad_norm": 1.59243920936772, + "learning_rate": 1.4086345543120508e-05, + "loss": 0.3523, + "step": 4860 + }, + { + "epoch": 0.3851059615765498, + "grad_norm": 1.7621266320877087, + "learning_rate": 1.4084003396352848e-05, + "loss": 0.3887, + "step": 4861 + }, + { + "epoch": 0.3851851851851852, + "grad_norm": 1.4568306356231104, + "learning_rate": 1.4081660980673215e-05, + "loss": 0.3004, + "step": 4862 + }, + { + "epoch": 0.38526440879382057, + "grad_norm": 1.499868370735634, + "learning_rate": 1.4079318296235846e-05, + "loss": 0.263, + "step": 4863 + }, + { + "epoch": 0.38534363240245595, + "grad_norm": 1.389251130343504, + "learning_rate": 1.4076975343194996e-05, + "loss": 0.2393, + "step": 4864 + }, + { + "epoch": 0.38542285601109133, + "grad_norm": 1.8785391613569955, + "learning_rate": 1.4074632121704941e-05, + "loss": 0.4887, + "step": 4865 + }, + { + "epoch": 0.38550207961972666, + "grad_norm": 1.7127361770676095, + "learning_rate": 1.4072288631919962e-05, + "loss": 0.3281, + "step": 4866 + }, + { + "epoch": 0.38558130322836204, + "grad_norm": 1.4570751925136936, + "learning_rate": 1.406994487399437e-05, + "loss": 0.3089, + "step": 4867 + }, + { + "epoch": 0.3856605268369974, + "grad_norm": 1.4250013415436769, + "learning_rate": 1.4067600848082496e-05, + "loss": 0.2918, + "step": 4868 + }, + { + "epoch": 0.3857397504456328, + "grad_norm": 1.3092557186054714, + "learning_rate": 1.4065256554338675e-05, + "loss": 0.2441, + "step": 4869 + }, + { + "epoch": 0.3858189740542682, + "grad_norm": 1.6026662951991348, + "learning_rate": 1.406291199291727e-05, + "loss": 0.3461, + "step": 4870 + }, + { + "epoch": 0.38589819766290356, + "grad_norm": 1.7070252621318853, + "learning_rate": 1.4060567163972663e-05, + "loss": 0.4226, + "step": 4871 + }, + { + "epoch": 0.38597742127153895, + "grad_norm": 1.392679573232567, + "learning_rate": 1.4058222067659244e-05, + "loss": 0.3104, + "step": 4872 + }, + { + "epoch": 0.38605664488017427, + "grad_norm": 1.3057904428747868, + "learning_rate": 1.405587670413143e-05, + "loss": 0.2831, + "step": 4873 + }, + { + "epoch": 0.38613586848880965, + "grad_norm": 1.693204687443313, + "learning_rate": 1.405353107354365e-05, + "loss": 0.3544, + "step": 4874 + }, + { + "epoch": 0.38621509209744503, + "grad_norm": 1.6760805100532747, + "learning_rate": 1.4051185176050353e-05, + "loss": 0.3324, + "step": 4875 + }, + { + "epoch": 0.3862943157060804, + "grad_norm": 1.453011845161088, + "learning_rate": 1.4048839011806006e-05, + "loss": 0.2512, + "step": 4876 + }, + { + "epoch": 0.3863735393147158, + "grad_norm": 1.5671310369401856, + "learning_rate": 1.404649258096509e-05, + "loss": 0.4056, + "step": 4877 + }, + { + "epoch": 0.3864527629233512, + "grad_norm": 1.4378890463135665, + "learning_rate": 1.4044145883682108e-05, + "loss": 0.2719, + "step": 4878 + }, + { + "epoch": 0.3865319865319865, + "grad_norm": 1.656400400040357, + "learning_rate": 1.4041798920111582e-05, + "loss": 0.2845, + "step": 4879 + }, + { + "epoch": 0.3866112101406219, + "grad_norm": 1.4339987754579901, + "learning_rate": 1.4039451690408042e-05, + "loss": 0.3309, + "step": 4880 + }, + { + "epoch": 0.38669043374925727, + "grad_norm": 1.3487398994118345, + "learning_rate": 1.4037104194726048e-05, + "loss": 0.269, + "step": 4881 + }, + { + "epoch": 0.38676965735789265, + "grad_norm": 1.6421955042997727, + "learning_rate": 1.4034756433220164e-05, + "loss": 0.3485, + "step": 4882 + }, + { + "epoch": 0.38684888096652803, + "grad_norm": 1.4465869016485433, + "learning_rate": 1.4032408406044986e-05, + "loss": 0.2558, + "step": 4883 + }, + { + "epoch": 0.3869281045751634, + "grad_norm": 1.3221438980799047, + "learning_rate": 1.4030060113355118e-05, + "loss": 0.2674, + "step": 4884 + }, + { + "epoch": 0.3870073281837988, + "grad_norm": 1.4552553725512243, + "learning_rate": 1.402771155530518e-05, + "loss": 0.3098, + "step": 4885 + }, + { + "epoch": 0.3870865517924341, + "grad_norm": 1.7244149169211545, + "learning_rate": 1.4025362732049816e-05, + "loss": 0.324, + "step": 4886 + }, + { + "epoch": 0.3871657754010695, + "grad_norm": 1.6021578566570707, + "learning_rate": 1.4023013643743688e-05, + "loss": 0.301, + "step": 4887 + }, + { + "epoch": 0.3872449990097049, + "grad_norm": 1.245091050052912, + "learning_rate": 1.4020664290541465e-05, + "loss": 0.223, + "step": 4888 + }, + { + "epoch": 0.38732422261834026, + "grad_norm": 1.5142043254638706, + "learning_rate": 1.4018314672597848e-05, + "loss": 0.3521, + "step": 4889 + }, + { + "epoch": 0.38740344622697565, + "grad_norm": 1.605573363303424, + "learning_rate": 1.4015964790067545e-05, + "loss": 0.3046, + "step": 4890 + }, + { + "epoch": 0.387482669835611, + "grad_norm": 1.3940049026532149, + "learning_rate": 1.401361464310528e-05, + "loss": 0.2442, + "step": 4891 + }, + { + "epoch": 0.3875618934442464, + "grad_norm": 1.8692209407690585, + "learning_rate": 1.4011264231865807e-05, + "loss": 0.3858, + "step": 4892 + }, + { + "epoch": 0.38764111705288173, + "grad_norm": 1.5836848128189307, + "learning_rate": 1.4008913556503885e-05, + "loss": 0.3213, + "step": 4893 + }, + { + "epoch": 0.3877203406615171, + "grad_norm": 1.5546143594560407, + "learning_rate": 1.4006562617174292e-05, + "loss": 0.2907, + "step": 4894 + }, + { + "epoch": 0.3877995642701525, + "grad_norm": 1.3973791623022782, + "learning_rate": 1.4004211414031831e-05, + "loss": 0.2564, + "step": 4895 + }, + { + "epoch": 0.3878787878787879, + "grad_norm": 1.4495212236430446, + "learning_rate": 1.4001859947231316e-05, + "loss": 0.2934, + "step": 4896 + }, + { + "epoch": 0.38795801148742326, + "grad_norm": 1.550855352746961, + "learning_rate": 1.3999508216927578e-05, + "loss": 0.2918, + "step": 4897 + }, + { + "epoch": 0.38803723509605864, + "grad_norm": 1.6357448051825703, + "learning_rate": 1.399715622327547e-05, + "loss": 0.3758, + "step": 4898 + }, + { + "epoch": 0.388116458704694, + "grad_norm": 1.4102534696996243, + "learning_rate": 1.3994803966429854e-05, + "loss": 0.3107, + "step": 4899 + }, + { + "epoch": 0.38819568231332935, + "grad_norm": 1.4405333401650178, + "learning_rate": 1.3992451446545624e-05, + "loss": 0.2864, + "step": 4900 + }, + { + "epoch": 0.38827490592196473, + "grad_norm": 1.3815074972136017, + "learning_rate": 1.3990098663777674e-05, + "loss": 0.2869, + "step": 4901 + }, + { + "epoch": 0.3883541295306001, + "grad_norm": 1.2807015578899121, + "learning_rate": 1.3987745618280925e-05, + "loss": 0.2803, + "step": 4902 + }, + { + "epoch": 0.3884333531392355, + "grad_norm": 1.7737699226786035, + "learning_rate": 1.3985392310210318e-05, + "loss": 0.3655, + "step": 4903 + }, + { + "epoch": 0.3885125767478709, + "grad_norm": 1.5189345534784644, + "learning_rate": 1.39830387397208e-05, + "loss": 0.3009, + "step": 4904 + }, + { + "epoch": 0.38859180035650626, + "grad_norm": 1.4081276248404047, + "learning_rate": 1.3980684906967348e-05, + "loss": 0.2972, + "step": 4905 + }, + { + "epoch": 0.38867102396514164, + "grad_norm": 1.5252911547919124, + "learning_rate": 1.3978330812104947e-05, + "loss": 0.4237, + "step": 4906 + }, + { + "epoch": 0.38875024757377696, + "grad_norm": 1.857577057367717, + "learning_rate": 1.3975976455288607e-05, + "loss": 0.4442, + "step": 4907 + }, + { + "epoch": 0.38882947118241235, + "grad_norm": 1.5517775074502338, + "learning_rate": 1.397362183667335e-05, + "loss": 0.299, + "step": 4908 + }, + { + "epoch": 0.3889086947910477, + "grad_norm": 1.7384555905688281, + "learning_rate": 1.3971266956414211e-05, + "loss": 0.3328, + "step": 4909 + }, + { + "epoch": 0.3889879183996831, + "grad_norm": 1.5151401049783881, + "learning_rate": 1.3968911814666252e-05, + "loss": 0.2712, + "step": 4910 + }, + { + "epoch": 0.3890671420083185, + "grad_norm": 1.4591876739845957, + "learning_rate": 1.3966556411584548e-05, + "loss": 0.2688, + "step": 4911 + }, + { + "epoch": 0.38914636561695387, + "grad_norm": 1.197145569434945, + "learning_rate": 1.396420074732419e-05, + "loss": 0.2645, + "step": 4912 + }, + { + "epoch": 0.38922558922558925, + "grad_norm": 1.4483910374396045, + "learning_rate": 1.396184482204029e-05, + "loss": 0.3459, + "step": 4913 + }, + { + "epoch": 0.3893048128342246, + "grad_norm": 1.6798923908578112, + "learning_rate": 1.3959488635887967e-05, + "loss": 0.3377, + "step": 4914 + }, + { + "epoch": 0.38938403644285996, + "grad_norm": 1.8372114205527845, + "learning_rate": 1.3957132189022373e-05, + "loss": 0.3953, + "step": 4915 + }, + { + "epoch": 0.38946326005149534, + "grad_norm": 1.5875159009324193, + "learning_rate": 1.3954775481598665e-05, + "loss": 0.3627, + "step": 4916 + }, + { + "epoch": 0.3895424836601307, + "grad_norm": 1.536919695369658, + "learning_rate": 1.3952418513772016e-05, + "loss": 0.3839, + "step": 4917 + }, + { + "epoch": 0.3896217072687661, + "grad_norm": 1.5192670346215063, + "learning_rate": 1.3950061285697629e-05, + "loss": 0.3168, + "step": 4918 + }, + { + "epoch": 0.3897009308774015, + "grad_norm": 1.3322279353201638, + "learning_rate": 1.3947703797530716e-05, + "loss": 0.2695, + "step": 4919 + }, + { + "epoch": 0.3897801544860368, + "grad_norm": 1.900708452974259, + "learning_rate": 1.3945346049426498e-05, + "loss": 0.4402, + "step": 4920 + }, + { + "epoch": 0.3898593780946722, + "grad_norm": 1.3076043739971344, + "learning_rate": 1.3942988041540226e-05, + "loss": 0.2402, + "step": 4921 + }, + { + "epoch": 0.3899386017033076, + "grad_norm": 1.4605614608888793, + "learning_rate": 1.394062977402717e-05, + "loss": 0.2432, + "step": 4922 + }, + { + "epoch": 0.39001782531194296, + "grad_norm": 1.498653246339802, + "learning_rate": 1.3938271247042601e-05, + "loss": 0.3179, + "step": 4923 + }, + { + "epoch": 0.39009704892057834, + "grad_norm": 1.3603336566482187, + "learning_rate": 1.3935912460741818e-05, + "loss": 0.2831, + "step": 4924 + }, + { + "epoch": 0.3901762725292137, + "grad_norm": 1.595824176692529, + "learning_rate": 1.3933553415280142e-05, + "loss": 0.3657, + "step": 4925 + }, + { + "epoch": 0.3902554961378491, + "grad_norm": 1.5565791931987947, + "learning_rate": 1.3931194110812896e-05, + "loss": 0.4068, + "step": 4926 + }, + { + "epoch": 0.3903347197464844, + "grad_norm": 1.5866391152435144, + "learning_rate": 1.3928834547495438e-05, + "loss": 0.3923, + "step": 4927 + }, + { + "epoch": 0.3904139433551198, + "grad_norm": 1.2514747270586963, + "learning_rate": 1.3926474725483125e-05, + "loss": 0.3238, + "step": 4928 + }, + { + "epoch": 0.3904931669637552, + "grad_norm": 1.2428996148590872, + "learning_rate": 1.3924114644931346e-05, + "loss": 0.2322, + "step": 4929 + }, + { + "epoch": 0.39057239057239057, + "grad_norm": 1.5089946511845411, + "learning_rate": 1.3921754305995501e-05, + "loss": 0.3293, + "step": 4930 + }, + { + "epoch": 0.39065161418102595, + "grad_norm": 1.4372606111853505, + "learning_rate": 1.3919393708831004e-05, + "loss": 0.3526, + "step": 4931 + }, + { + "epoch": 0.39073083778966133, + "grad_norm": 2.350554482680579, + "learning_rate": 1.3917032853593289e-05, + "loss": 0.4767, + "step": 4932 + }, + { + "epoch": 0.3908100613982967, + "grad_norm": 1.386996836373447, + "learning_rate": 1.3914671740437811e-05, + "loss": 0.2811, + "step": 4933 + }, + { + "epoch": 0.39088928500693204, + "grad_norm": 1.480487325372816, + "learning_rate": 1.3912310369520032e-05, + "loss": 0.3345, + "step": 4934 + }, + { + "epoch": 0.3909685086155674, + "grad_norm": 1.774852860448413, + "learning_rate": 1.3909948740995442e-05, + "loss": 0.3905, + "step": 4935 + }, + { + "epoch": 0.3910477322242028, + "grad_norm": 1.5142521263545279, + "learning_rate": 1.3907586855019538e-05, + "loss": 0.4321, + "step": 4936 + }, + { + "epoch": 0.3911269558328382, + "grad_norm": 1.5950034703760658, + "learning_rate": 1.3905224711747844e-05, + "loss": 0.3508, + "step": 4937 + }, + { + "epoch": 0.39120617944147357, + "grad_norm": 1.3624766687345164, + "learning_rate": 1.3902862311335896e-05, + "loss": 0.3258, + "step": 4938 + }, + { + "epoch": 0.39128540305010895, + "grad_norm": 1.1130591754136652, + "learning_rate": 1.390049965393924e-05, + "loss": 0.2118, + "step": 4939 + }, + { + "epoch": 0.39136462665874433, + "grad_norm": 1.3571908795904069, + "learning_rate": 1.3898136739713451e-05, + "loss": 0.3062, + "step": 4940 + }, + { + "epoch": 0.39144385026737966, + "grad_norm": 1.1739721243409775, + "learning_rate": 1.3895773568814118e-05, + "loss": 0.2056, + "step": 4941 + }, + { + "epoch": 0.39152307387601504, + "grad_norm": 1.5190804103607454, + "learning_rate": 1.3893410141396835e-05, + "loss": 0.3981, + "step": 4942 + }, + { + "epoch": 0.3916022974846504, + "grad_norm": 1.8777037940433605, + "learning_rate": 1.3891046457617233e-05, + "loss": 0.3201, + "step": 4943 + }, + { + "epoch": 0.3916815210932858, + "grad_norm": 1.457636938366171, + "learning_rate": 1.388868251763094e-05, + "loss": 0.3054, + "step": 4944 + }, + { + "epoch": 0.3917607447019212, + "grad_norm": 1.4995620941905698, + "learning_rate": 1.3886318321593614e-05, + "loss": 0.3182, + "step": 4945 + }, + { + "epoch": 0.39183996831055656, + "grad_norm": 1.5102016269779819, + "learning_rate": 1.388395386966093e-05, + "loss": 0.3344, + "step": 4946 + }, + { + "epoch": 0.39191919191919194, + "grad_norm": 1.619362491291635, + "learning_rate": 1.388158916198857e-05, + "loss": 0.3207, + "step": 4947 + }, + { + "epoch": 0.39199841552782727, + "grad_norm": 1.6366579897083362, + "learning_rate": 1.3879224198732239e-05, + "loss": 0.2877, + "step": 4948 + }, + { + "epoch": 0.39207763913646265, + "grad_norm": 1.3143741865211294, + "learning_rate": 1.3876858980047665e-05, + "loss": 0.2965, + "step": 4949 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 1.307834048146454, + "learning_rate": 1.3874493506090578e-05, + "loss": 0.2229, + "step": 4950 + }, + { + "epoch": 0.3922360863537334, + "grad_norm": 1.3970734522709591, + "learning_rate": 1.3872127777016739e-05, + "loss": 0.3171, + "step": 4951 + }, + { + "epoch": 0.3923153099623688, + "grad_norm": 1.523061243712294, + "learning_rate": 1.3869761792981915e-05, + "loss": 0.2829, + "step": 4952 + }, + { + "epoch": 0.3923945335710042, + "grad_norm": 1.72991733701946, + "learning_rate": 1.3867395554141899e-05, + "loss": 0.4397, + "step": 4953 + }, + { + "epoch": 0.39247375717963956, + "grad_norm": 1.5156389702467246, + "learning_rate": 1.3865029060652493e-05, + "loss": 0.2811, + "step": 4954 + }, + { + "epoch": 0.3925529807882749, + "grad_norm": 1.544392007745256, + "learning_rate": 1.3862662312669518e-05, + "loss": 0.2781, + "step": 4955 + }, + { + "epoch": 0.39263220439691027, + "grad_norm": 1.1763181963539993, + "learning_rate": 1.386029531034882e-05, + "loss": 0.1858, + "step": 4956 + }, + { + "epoch": 0.39271142800554565, + "grad_norm": 1.436715364023107, + "learning_rate": 1.385792805384625e-05, + "loss": 0.2848, + "step": 4957 + }, + { + "epoch": 0.39279065161418103, + "grad_norm": 1.6864050374375252, + "learning_rate": 1.3855560543317679e-05, + "loss": 0.431, + "step": 4958 + }, + { + "epoch": 0.3928698752228164, + "grad_norm": 1.2407580960938767, + "learning_rate": 1.3853192778919e-05, + "loss": 0.1618, + "step": 4959 + }, + { + "epoch": 0.3929490988314518, + "grad_norm": 1.5653234500224156, + "learning_rate": 1.3850824760806115e-05, + "loss": 0.391, + "step": 4960 + }, + { + "epoch": 0.3930283224400871, + "grad_norm": 2.001064179629869, + "learning_rate": 1.384845648913495e-05, + "loss": 0.4012, + "step": 4961 + }, + { + "epoch": 0.3931075460487225, + "grad_norm": 1.475863294319895, + "learning_rate": 1.3846087964061442e-05, + "loss": 0.3193, + "step": 4962 + }, + { + "epoch": 0.3931867696573579, + "grad_norm": 1.4870112306973544, + "learning_rate": 1.3843719185741548e-05, + "loss": 0.3913, + "step": 4963 + }, + { + "epoch": 0.39326599326599326, + "grad_norm": 1.7489630039622186, + "learning_rate": 1.3841350154331239e-05, + "loss": 0.379, + "step": 4964 + }, + { + "epoch": 0.39334521687462864, + "grad_norm": 1.285459337194145, + "learning_rate": 1.383898086998651e-05, + "loss": 0.2464, + "step": 4965 + }, + { + "epoch": 0.393424440483264, + "grad_norm": 1.3767443739787066, + "learning_rate": 1.3836611332863356e-05, + "loss": 0.2907, + "step": 4966 + }, + { + "epoch": 0.3935036640918994, + "grad_norm": 1.6696218537345195, + "learning_rate": 1.383424154311781e-05, + "loss": 0.2769, + "step": 4967 + }, + { + "epoch": 0.39358288770053473, + "grad_norm": 1.5706534290720284, + "learning_rate": 1.383187150090591e-05, + "loss": 0.3987, + "step": 4968 + }, + { + "epoch": 0.3936621113091701, + "grad_norm": 1.6106067471922265, + "learning_rate": 1.3829501206383704e-05, + "loss": 0.3045, + "step": 4969 + }, + { + "epoch": 0.3937413349178055, + "grad_norm": 1.4170378107587347, + "learning_rate": 1.3827130659707275e-05, + "loss": 0.2872, + "step": 4970 + }, + { + "epoch": 0.3938205585264409, + "grad_norm": 1.7369049510017955, + "learning_rate": 1.3824759861032704e-05, + "loss": 0.4081, + "step": 4971 + }, + { + "epoch": 0.39389978213507626, + "grad_norm": 1.5627150123986702, + "learning_rate": 1.38223888105161e-05, + "loss": 0.2873, + "step": 4972 + }, + { + "epoch": 0.39397900574371164, + "grad_norm": 1.5990469821045645, + "learning_rate": 1.3820017508313587e-05, + "loss": 0.285, + "step": 4973 + }, + { + "epoch": 0.394058229352347, + "grad_norm": 1.574830293032543, + "learning_rate": 1.3817645954581301e-05, + "loss": 0.287, + "step": 4974 + }, + { + "epoch": 0.39413745296098235, + "grad_norm": 1.922110188389245, + "learning_rate": 1.3815274149475395e-05, + "loss": 0.404, + "step": 4975 + }, + { + "epoch": 0.39421667656961773, + "grad_norm": 1.3959745443810705, + "learning_rate": 1.3812902093152047e-05, + "loss": 0.3102, + "step": 4976 + }, + { + "epoch": 0.3942959001782531, + "grad_norm": 1.3148567603923778, + "learning_rate": 1.3810529785767444e-05, + "loss": 0.2902, + "step": 4977 + }, + { + "epoch": 0.3943751237868885, + "grad_norm": 1.562263005375727, + "learning_rate": 1.3808157227477788e-05, + "loss": 0.3135, + "step": 4978 + }, + { + "epoch": 0.3944543473955239, + "grad_norm": 1.445019603491408, + "learning_rate": 1.3805784418439303e-05, + "loss": 0.359, + "step": 4979 + }, + { + "epoch": 0.39453357100415926, + "grad_norm": 1.4375606960140948, + "learning_rate": 1.3803411358808222e-05, + "loss": 0.3641, + "step": 4980 + }, + { + "epoch": 0.39461279461279464, + "grad_norm": 1.699124449073503, + "learning_rate": 1.3801038048740811e-05, + "loss": 0.3997, + "step": 4981 + }, + { + "epoch": 0.39469201822142996, + "grad_norm": 1.5823737147711643, + "learning_rate": 1.379866448839333e-05, + "loss": 0.3482, + "step": 4982 + }, + { + "epoch": 0.39477124183006534, + "grad_norm": 1.3914423040345811, + "learning_rate": 1.379629067792207e-05, + "loss": 0.2589, + "step": 4983 + }, + { + "epoch": 0.3948504654387007, + "grad_norm": 1.7521132149510126, + "learning_rate": 1.3793916617483338e-05, + "loss": 0.3454, + "step": 4984 + }, + { + "epoch": 0.3949296890473361, + "grad_norm": 1.5304703868662894, + "learning_rate": 1.379154230723345e-05, + "loss": 0.3699, + "step": 4985 + }, + { + "epoch": 0.3950089126559715, + "grad_norm": 1.4582817106605426, + "learning_rate": 1.3789167747328746e-05, + "loss": 0.3478, + "step": 4986 + }, + { + "epoch": 0.39508813626460687, + "grad_norm": 1.6094975130019122, + "learning_rate": 1.3786792937925576e-05, + "loss": 0.2726, + "step": 4987 + }, + { + "epoch": 0.39516735987324225, + "grad_norm": 1.4944047635188087, + "learning_rate": 1.3784417879180314e-05, + "loss": 0.3977, + "step": 4988 + }, + { + "epoch": 0.3952465834818776, + "grad_norm": 1.171463862044395, + "learning_rate": 1.3782042571249343e-05, + "loss": 0.245, + "step": 4989 + }, + { + "epoch": 0.39532580709051296, + "grad_norm": 1.4369194403357708, + "learning_rate": 1.3779667014289067e-05, + "loss": 0.341, + "step": 4990 + }, + { + "epoch": 0.39540503069914834, + "grad_norm": 1.867628309453861, + "learning_rate": 1.3777291208455902e-05, + "loss": 0.3443, + "step": 4991 + }, + { + "epoch": 0.3954842543077837, + "grad_norm": 1.4653879965908314, + "learning_rate": 1.3774915153906292e-05, + "loss": 0.3819, + "step": 4992 + }, + { + "epoch": 0.3955634779164191, + "grad_norm": 1.7858215676104223, + "learning_rate": 1.377253885079668e-05, + "loss": 0.3648, + "step": 4993 + }, + { + "epoch": 0.3956427015250545, + "grad_norm": 1.5388153043970205, + "learning_rate": 1.3770162299283535e-05, + "loss": 0.344, + "step": 4994 + }, + { + "epoch": 0.39572192513368987, + "grad_norm": 1.3894138430179441, + "learning_rate": 1.3767785499523347e-05, + "loss": 0.3939, + "step": 4995 + }, + { + "epoch": 0.3958011487423252, + "grad_norm": 1.29479156077149, + "learning_rate": 1.376540845167261e-05, + "loss": 0.3124, + "step": 4996 + }, + { + "epoch": 0.3958803723509606, + "grad_norm": 1.2249471037141981, + "learning_rate": 1.3763031155887847e-05, + "loss": 0.2343, + "step": 4997 + }, + { + "epoch": 0.39595959595959596, + "grad_norm": 1.2296091385926569, + "learning_rate": 1.3760653612325588e-05, + "loss": 0.1757, + "step": 4998 + }, + { + "epoch": 0.39603881956823134, + "grad_norm": 1.7019888682035174, + "learning_rate": 1.3758275821142382e-05, + "loss": 0.4034, + "step": 4999 + }, + { + "epoch": 0.3961180431768667, + "grad_norm": 1.5817909189265358, + "learning_rate": 1.3755897782494803e-05, + "loss": 0.2973, + "step": 5000 + }, + { + "epoch": 0.3961972667855021, + "grad_norm": 1.464106757563948, + "learning_rate": 1.375351949653942e-05, + "loss": 0.3074, + "step": 5001 + }, + { + "epoch": 0.3962764903941374, + "grad_norm": 1.5710960764287611, + "learning_rate": 1.375114096343284e-05, + "loss": 0.3292, + "step": 5002 + }, + { + "epoch": 0.3963557140027728, + "grad_norm": 1.3406466771326344, + "learning_rate": 1.3748762183331681e-05, + "loss": 0.2521, + "step": 5003 + }, + { + "epoch": 0.3964349376114082, + "grad_norm": 1.4091705003260702, + "learning_rate": 1.3746383156392566e-05, + "loss": 0.3176, + "step": 5004 + }, + { + "epoch": 0.39651416122004357, + "grad_norm": 1.7205745709410996, + "learning_rate": 1.374400388277215e-05, + "loss": 0.2981, + "step": 5005 + }, + { + "epoch": 0.39659338482867895, + "grad_norm": 1.3665911662947505, + "learning_rate": 1.3741624362627091e-05, + "loss": 0.2548, + "step": 5006 + }, + { + "epoch": 0.39667260843731433, + "grad_norm": 1.1696985243092033, + "learning_rate": 1.373924459611407e-05, + "loss": 0.2767, + "step": 5007 + }, + { + "epoch": 0.3967518320459497, + "grad_norm": 1.492781908167231, + "learning_rate": 1.3736864583389789e-05, + "loss": 0.3531, + "step": 5008 + }, + { + "epoch": 0.39683105565458504, + "grad_norm": 1.7228978934711823, + "learning_rate": 1.373448432461095e-05, + "loss": 0.3194, + "step": 5009 + }, + { + "epoch": 0.3969102792632204, + "grad_norm": 1.4416531593543607, + "learning_rate": 1.373210381993429e-05, + "loss": 0.3286, + "step": 5010 + }, + { + "epoch": 0.3969895028718558, + "grad_norm": 1.2285054962792505, + "learning_rate": 1.3729723069516554e-05, + "loss": 0.2804, + "step": 5011 + }, + { + "epoch": 0.3970687264804912, + "grad_norm": 1.1976794926937506, + "learning_rate": 1.3727342073514497e-05, + "loss": 0.2344, + "step": 5012 + }, + { + "epoch": 0.39714795008912657, + "grad_norm": 1.5596682285242176, + "learning_rate": 1.3724960832084902e-05, + "loss": 0.3788, + "step": 5013 + }, + { + "epoch": 0.39722717369776195, + "grad_norm": 1.4818707493373284, + "learning_rate": 1.3722579345384558e-05, + "loss": 0.2911, + "step": 5014 + }, + { + "epoch": 0.39730639730639733, + "grad_norm": 1.4985081443216186, + "learning_rate": 1.3720197613570272e-05, + "loss": 0.3241, + "step": 5015 + }, + { + "epoch": 0.39738562091503266, + "grad_norm": 1.495735575665788, + "learning_rate": 1.3717815636798879e-05, + "loss": 0.2311, + "step": 5016 + }, + { + "epoch": 0.39746484452366804, + "grad_norm": 1.3790531256193288, + "learning_rate": 1.3715433415227212e-05, + "loss": 0.299, + "step": 5017 + }, + { + "epoch": 0.3975440681323034, + "grad_norm": 1.6006274992818033, + "learning_rate": 1.3713050949012134e-05, + "loss": 0.2669, + "step": 5018 + }, + { + "epoch": 0.3976232917409388, + "grad_norm": 1.5820256933793853, + "learning_rate": 1.3710668238310519e-05, + "loss": 0.3796, + "step": 5019 + }, + { + "epoch": 0.3977025153495742, + "grad_norm": 1.7423856715804857, + "learning_rate": 1.3708285283279252e-05, + "loss": 0.3166, + "step": 5020 + }, + { + "epoch": 0.39778173895820956, + "grad_norm": 1.7575159874833988, + "learning_rate": 1.3705902084075244e-05, + "loss": 0.3899, + "step": 5021 + }, + { + "epoch": 0.39786096256684494, + "grad_norm": 2.0061270150199446, + "learning_rate": 1.3703518640855414e-05, + "loss": 0.428, + "step": 5022 + }, + { + "epoch": 0.39794018617548027, + "grad_norm": 1.3674633621831715, + "learning_rate": 1.37011349537767e-05, + "loss": 0.3994, + "step": 5023 + }, + { + "epoch": 0.39801940978411565, + "grad_norm": 1.6041104223436808, + "learning_rate": 1.3698751022996061e-05, + "loss": 0.378, + "step": 5024 + }, + { + "epoch": 0.39809863339275103, + "grad_norm": 1.6551307112366629, + "learning_rate": 1.3696366848670464e-05, + "loss": 0.3998, + "step": 5025 + }, + { + "epoch": 0.3981778570013864, + "grad_norm": 1.412588586855754, + "learning_rate": 1.3693982430956896e-05, + "loss": 0.2962, + "step": 5026 + }, + { + "epoch": 0.3982570806100218, + "grad_norm": 1.330918437743754, + "learning_rate": 1.369159777001236e-05, + "loss": 0.2318, + "step": 5027 + }, + { + "epoch": 0.3983363042186572, + "grad_norm": 1.4553267716806277, + "learning_rate": 1.368921286599387e-05, + "loss": 0.3721, + "step": 5028 + }, + { + "epoch": 0.39841552782729256, + "grad_norm": 1.6678420955887867, + "learning_rate": 1.368682771905847e-05, + "loss": 0.3242, + "step": 5029 + }, + { + "epoch": 0.3984947514359279, + "grad_norm": 1.4436114898942265, + "learning_rate": 1.3684442329363199e-05, + "loss": 0.3532, + "step": 5030 + }, + { + "epoch": 0.39857397504456327, + "grad_norm": 1.485021287915516, + "learning_rate": 1.368205669706513e-05, + "loss": 0.3453, + "step": 5031 + }, + { + "epoch": 0.39865319865319865, + "grad_norm": 1.7388753727727932, + "learning_rate": 1.3679670822321347e-05, + "loss": 0.4663, + "step": 5032 + }, + { + "epoch": 0.39873242226183403, + "grad_norm": 1.3623754471291856, + "learning_rate": 1.3677284705288943e-05, + "loss": 0.3294, + "step": 5033 + }, + { + "epoch": 0.3988116458704694, + "grad_norm": 1.5637861204893544, + "learning_rate": 1.3674898346125036e-05, + "loss": 0.292, + "step": 5034 + }, + { + "epoch": 0.3988908694791048, + "grad_norm": 1.8638612822671077, + "learning_rate": 1.3672511744986756e-05, + "loss": 0.4539, + "step": 5035 + }, + { + "epoch": 0.3989700930877402, + "grad_norm": 1.5244759697815742, + "learning_rate": 1.3670124902031248e-05, + "loss": 0.3595, + "step": 5036 + }, + { + "epoch": 0.3990493166963755, + "grad_norm": 1.2151913711526459, + "learning_rate": 1.3667737817415679e-05, + "loss": 0.2267, + "step": 5037 + }, + { + "epoch": 0.3991285403050109, + "grad_norm": 1.2503137345771052, + "learning_rate": 1.3665350491297215e-05, + "loss": 0.2677, + "step": 5038 + }, + { + "epoch": 0.39920776391364626, + "grad_norm": 1.3803298061296474, + "learning_rate": 1.3662962923833063e-05, + "loss": 0.3013, + "step": 5039 + }, + { + "epoch": 0.39928698752228164, + "grad_norm": 1.5910524223780635, + "learning_rate": 1.3660575115180427e-05, + "loss": 0.336, + "step": 5040 + }, + { + "epoch": 0.399366211130917, + "grad_norm": 1.332126057527755, + "learning_rate": 1.3658187065496533e-05, + "loss": 0.3224, + "step": 5041 + }, + { + "epoch": 0.3994454347395524, + "grad_norm": 1.3842123919676765, + "learning_rate": 1.365579877493862e-05, + "loss": 0.3962, + "step": 5042 + }, + { + "epoch": 0.39952465834818773, + "grad_norm": 1.1278405464818457, + "learning_rate": 1.3653410243663953e-05, + "loss": 0.2196, + "step": 5043 + }, + { + "epoch": 0.3996038819568231, + "grad_norm": 2.004902343504536, + "learning_rate": 1.3651021471829797e-05, + "loss": 0.365, + "step": 5044 + }, + { + "epoch": 0.3996831055654585, + "grad_norm": 1.3966098423418447, + "learning_rate": 1.3648632459593444e-05, + "loss": 0.4062, + "step": 5045 + }, + { + "epoch": 0.3997623291740939, + "grad_norm": 1.4737089172015694, + "learning_rate": 1.3646243207112204e-05, + "loss": 0.336, + "step": 5046 + }, + { + "epoch": 0.39984155278272926, + "grad_norm": 1.325129287968291, + "learning_rate": 1.3643853714543389e-05, + "loss": 0.2506, + "step": 5047 + }, + { + "epoch": 0.39992077639136464, + "grad_norm": 1.4796352957074, + "learning_rate": 1.3641463982044343e-05, + "loss": 0.3205, + "step": 5048 + }, + { + "epoch": 0.4, + "grad_norm": 1.2825025572461901, + "learning_rate": 1.3639074009772412e-05, + "loss": 0.3262, + "step": 5049 + }, + { + "epoch": 0.40007922360863535, + "grad_norm": 1.5490918508835396, + "learning_rate": 1.3636683797884971e-05, + "loss": 0.297, + "step": 5050 + }, + { + "epoch": 0.40015844721727073, + "grad_norm": 1.5654343330054767, + "learning_rate": 1.36342933465394e-05, + "loss": 0.3065, + "step": 5051 + }, + { + "epoch": 0.4002376708259061, + "grad_norm": 1.6738514056112284, + "learning_rate": 1.3631902655893096e-05, + "loss": 0.3508, + "step": 5052 + }, + { + "epoch": 0.4003168944345415, + "grad_norm": 1.6050114072452457, + "learning_rate": 1.3629511726103482e-05, + "loss": 0.4524, + "step": 5053 + }, + { + "epoch": 0.4003961180431769, + "grad_norm": 1.5027399407005393, + "learning_rate": 1.3627120557327982e-05, + "loss": 0.3557, + "step": 5054 + }, + { + "epoch": 0.40047534165181226, + "grad_norm": 1.690092647585083, + "learning_rate": 1.3624729149724047e-05, + "loss": 0.4404, + "step": 5055 + }, + { + "epoch": 0.40055456526044764, + "grad_norm": 1.4935298015860123, + "learning_rate": 1.362233750344914e-05, + "loss": 0.2976, + "step": 5056 + }, + { + "epoch": 0.40063378886908296, + "grad_norm": 1.4219310251649206, + "learning_rate": 1.3619945618660735e-05, + "loss": 0.2338, + "step": 5057 + }, + { + "epoch": 0.40071301247771834, + "grad_norm": 1.4265664738658819, + "learning_rate": 1.3617553495516332e-05, + "loss": 0.3311, + "step": 5058 + }, + { + "epoch": 0.4007922360863537, + "grad_norm": 1.5142283373178527, + "learning_rate": 1.3615161134173435e-05, + "loss": 0.2608, + "step": 5059 + }, + { + "epoch": 0.4008714596949891, + "grad_norm": 1.395747327231729, + "learning_rate": 1.3612768534789573e-05, + "loss": 0.3121, + "step": 5060 + }, + { + "epoch": 0.4009506833036245, + "grad_norm": 1.569119538801247, + "learning_rate": 1.3610375697522287e-05, + "loss": 0.2611, + "step": 5061 + }, + { + "epoch": 0.40102990691225987, + "grad_norm": 1.5475282295528205, + "learning_rate": 1.3607982622529135e-05, + "loss": 0.3369, + "step": 5062 + }, + { + "epoch": 0.40110913052089525, + "grad_norm": 1.3391420756406291, + "learning_rate": 1.3605589309967686e-05, + "loss": 0.2263, + "step": 5063 + }, + { + "epoch": 0.4011883541295306, + "grad_norm": 1.616637222299692, + "learning_rate": 1.3603195759995531e-05, + "loss": 0.2968, + "step": 5064 + }, + { + "epoch": 0.40126757773816596, + "grad_norm": 1.310848605966884, + "learning_rate": 1.3600801972770272e-05, + "loss": 0.2965, + "step": 5065 + }, + { + "epoch": 0.40134680134680134, + "grad_norm": 1.317397509847661, + "learning_rate": 1.3598407948449528e-05, + "loss": 0.2648, + "step": 5066 + }, + { + "epoch": 0.4014260249554367, + "grad_norm": 1.8128827078877061, + "learning_rate": 1.3596013687190936e-05, + "loss": 0.3586, + "step": 5067 + }, + { + "epoch": 0.4015052485640721, + "grad_norm": 1.5154107649780881, + "learning_rate": 1.3593619189152146e-05, + "loss": 0.2863, + "step": 5068 + }, + { + "epoch": 0.4015844721727075, + "grad_norm": 2.2402790821417855, + "learning_rate": 1.3591224454490824e-05, + "loss": 0.5488, + "step": 5069 + }, + { + "epoch": 0.40166369578134287, + "grad_norm": 1.3407774317326246, + "learning_rate": 1.3588829483364652e-05, + "loss": 0.2998, + "step": 5070 + }, + { + "epoch": 0.4017429193899782, + "grad_norm": 1.543450009768594, + "learning_rate": 1.3586434275931324e-05, + "loss": 0.3311, + "step": 5071 + }, + { + "epoch": 0.4018221429986136, + "grad_norm": 1.1472374992200711, + "learning_rate": 1.358403883234856e-05, + "loss": 0.2274, + "step": 5072 + }, + { + "epoch": 0.40190136660724896, + "grad_norm": 1.4854762026695074, + "learning_rate": 1.358164315277408e-05, + "loss": 0.352, + "step": 5073 + }, + { + "epoch": 0.40198059021588434, + "grad_norm": 1.7722309855181337, + "learning_rate": 1.3579247237365634e-05, + "loss": 0.346, + "step": 5074 + }, + { + "epoch": 0.4020598138245197, + "grad_norm": 1.6302962553982612, + "learning_rate": 1.357685108628098e-05, + "loss": 0.2874, + "step": 5075 + }, + { + "epoch": 0.4021390374331551, + "grad_norm": 1.4946590974993992, + "learning_rate": 1.3574454699677893e-05, + "loss": 0.3132, + "step": 5076 + }, + { + "epoch": 0.4022182610417904, + "grad_norm": 1.4606986865603948, + "learning_rate": 1.357205807771416e-05, + "loss": 0.3539, + "step": 5077 + }, + { + "epoch": 0.4022974846504258, + "grad_norm": 1.3887744348364097, + "learning_rate": 1.3569661220547596e-05, + "loss": 0.2618, + "step": 5078 + }, + { + "epoch": 0.4023767082590612, + "grad_norm": 1.5906813703947276, + "learning_rate": 1.3567264128336013e-05, + "loss": 0.2608, + "step": 5079 + }, + { + "epoch": 0.40245593186769657, + "grad_norm": 1.3139415967821995, + "learning_rate": 1.3564866801237254e-05, + "loss": 0.3073, + "step": 5080 + }, + { + "epoch": 0.40253515547633195, + "grad_norm": 1.2613576760617309, + "learning_rate": 1.3562469239409166e-05, + "loss": 0.1679, + "step": 5081 + }, + { + "epoch": 0.40261437908496733, + "grad_norm": 1.3019899148352339, + "learning_rate": 1.3560071443009622e-05, + "loss": 0.2993, + "step": 5082 + }, + { + "epoch": 0.4026936026936027, + "grad_norm": 1.3826184365597471, + "learning_rate": 1.3557673412196504e-05, + "loss": 0.3148, + "step": 5083 + }, + { + "epoch": 0.40277282630223804, + "grad_norm": 1.356399970669234, + "learning_rate": 1.3555275147127709e-05, + "loss": 0.2798, + "step": 5084 + }, + { + "epoch": 0.4028520499108734, + "grad_norm": 1.4328015520163058, + "learning_rate": 1.3552876647961151e-05, + "loss": 0.2851, + "step": 5085 + }, + { + "epoch": 0.4029312735195088, + "grad_norm": 1.7211357868465713, + "learning_rate": 1.3550477914854766e-05, + "loss": 0.3722, + "step": 5086 + }, + { + "epoch": 0.4030104971281442, + "grad_norm": 1.3134018607662379, + "learning_rate": 1.3548078947966487e-05, + "loss": 0.2279, + "step": 5087 + }, + { + "epoch": 0.40308972073677957, + "grad_norm": 1.215497608930495, + "learning_rate": 1.3545679747454286e-05, + "loss": 0.2297, + "step": 5088 + }, + { + "epoch": 0.40316894434541495, + "grad_norm": 1.4186684839296035, + "learning_rate": 1.3543280313476135e-05, + "loss": 0.284, + "step": 5089 + }, + { + "epoch": 0.40324816795405033, + "grad_norm": 1.4254813838474238, + "learning_rate": 1.3540880646190022e-05, + "loss": 0.3482, + "step": 5090 + }, + { + "epoch": 0.40332739156268566, + "grad_norm": 1.4018365750149362, + "learning_rate": 1.353848074575396e-05, + "loss": 0.378, + "step": 5091 + }, + { + "epoch": 0.40340661517132104, + "grad_norm": 1.2700896602992604, + "learning_rate": 1.3536080612325963e-05, + "loss": 0.2575, + "step": 5092 + }, + { + "epoch": 0.4034858387799564, + "grad_norm": 1.7202241636971003, + "learning_rate": 1.3533680246064073e-05, + "loss": 0.3945, + "step": 5093 + }, + { + "epoch": 0.4035650623885918, + "grad_norm": 1.4888000830307015, + "learning_rate": 1.3531279647126342e-05, + "loss": 0.3157, + "step": 5094 + }, + { + "epoch": 0.4036442859972272, + "grad_norm": 1.3437683614397191, + "learning_rate": 1.352887881567084e-05, + "loss": 0.2713, + "step": 5095 + }, + { + "epoch": 0.40372350960586256, + "grad_norm": 1.5688630700524515, + "learning_rate": 1.3526477751855645e-05, + "loss": 0.3719, + "step": 5096 + }, + { + "epoch": 0.40380273321449794, + "grad_norm": 1.142016810373895, + "learning_rate": 1.3524076455838859e-05, + "loss": 0.2025, + "step": 5097 + }, + { + "epoch": 0.40388195682313327, + "grad_norm": 1.8625953729769273, + "learning_rate": 1.3521674927778594e-05, + "loss": 0.2729, + "step": 5098 + }, + { + "epoch": 0.40396118043176865, + "grad_norm": 1.7706211373366825, + "learning_rate": 1.3519273167832982e-05, + "loss": 0.4145, + "step": 5099 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 1.5767148359739922, + "learning_rate": 1.3516871176160166e-05, + "loss": 0.3634, + "step": 5100 + }, + { + "epoch": 0.4041196276490394, + "grad_norm": 1.4709106099560698, + "learning_rate": 1.3514468952918303e-05, + "loss": 0.3076, + "step": 5101 + }, + { + "epoch": 0.4041988512576748, + "grad_norm": 1.5046873589838186, + "learning_rate": 1.3512066498265572e-05, + "loss": 0.3747, + "step": 5102 + }, + { + "epoch": 0.4042780748663102, + "grad_norm": 1.5105863620024775, + "learning_rate": 1.3509663812360161e-05, + "loss": 0.2927, + "step": 5103 + }, + { + "epoch": 0.40435729847494556, + "grad_norm": 1.5985591007068733, + "learning_rate": 1.3507260895360274e-05, + "loss": 0.3275, + "step": 5104 + }, + { + "epoch": 0.4044365220835809, + "grad_norm": 1.4536441352260983, + "learning_rate": 1.3504857747424133e-05, + "loss": 0.2837, + "step": 5105 + }, + { + "epoch": 0.40451574569221627, + "grad_norm": 1.3922656809888372, + "learning_rate": 1.3502454368709973e-05, + "loss": 0.314, + "step": 5106 + }, + { + "epoch": 0.40459496930085165, + "grad_norm": 1.5213369663753875, + "learning_rate": 1.3500050759376052e-05, + "loss": 0.3539, + "step": 5107 + }, + { + "epoch": 0.40467419290948703, + "grad_norm": 1.3112523212882794, + "learning_rate": 1.3497646919580623e-05, + "loss": 0.3005, + "step": 5108 + }, + { + "epoch": 0.4047534165181224, + "grad_norm": 1.5476483359070967, + "learning_rate": 1.3495242849481973e-05, + "loss": 0.313, + "step": 5109 + }, + { + "epoch": 0.4048326401267578, + "grad_norm": 1.862245994646032, + "learning_rate": 1.3492838549238406e-05, + "loss": 0.3588, + "step": 5110 + }, + { + "epoch": 0.4049118637353932, + "grad_norm": 1.3476418031706454, + "learning_rate": 1.349043401900822e-05, + "loss": 0.3708, + "step": 5111 + }, + { + "epoch": 0.4049910873440285, + "grad_norm": 1.5224215662980156, + "learning_rate": 1.348802925894975e-05, + "loss": 0.3304, + "step": 5112 + }, + { + "epoch": 0.4050703109526639, + "grad_norm": 1.4243258848457265, + "learning_rate": 1.348562426922134e-05, + "loss": 0.3618, + "step": 5113 + }, + { + "epoch": 0.40514953456129926, + "grad_norm": 1.4068209927098394, + "learning_rate": 1.3483219049981343e-05, + "loss": 0.2777, + "step": 5114 + }, + { + "epoch": 0.40522875816993464, + "grad_norm": 1.6424624219085664, + "learning_rate": 1.348081360138813e-05, + "loss": 0.3122, + "step": 5115 + }, + { + "epoch": 0.40530798177857, + "grad_norm": 1.3833716206915943, + "learning_rate": 1.347840792360009e-05, + "loss": 0.2839, + "step": 5116 + }, + { + "epoch": 0.4053872053872054, + "grad_norm": 1.516729453968915, + "learning_rate": 1.3476002016775626e-05, + "loss": 0.3656, + "step": 5117 + }, + { + "epoch": 0.40546642899584073, + "grad_norm": 1.289522675820099, + "learning_rate": 1.3473595881073154e-05, + "loss": 0.2556, + "step": 5118 + }, + { + "epoch": 0.4055456526044761, + "grad_norm": 1.4400693404044367, + "learning_rate": 1.3471189516651108e-05, + "loss": 0.3242, + "step": 5119 + }, + { + "epoch": 0.4056248762131115, + "grad_norm": 1.5127858934548162, + "learning_rate": 1.3468782923667936e-05, + "loss": 0.3343, + "step": 5120 + }, + { + "epoch": 0.4057040998217469, + "grad_norm": 1.5143332512948404, + "learning_rate": 1.3466376102282098e-05, + "loss": 0.34, + "step": 5121 + }, + { + "epoch": 0.40578332343038226, + "grad_norm": 1.5384258865514473, + "learning_rate": 1.3463969052652073e-05, + "loss": 0.2999, + "step": 5122 + }, + { + "epoch": 0.40586254703901764, + "grad_norm": 1.6343927203275055, + "learning_rate": 1.3461561774936352e-05, + "loss": 0.2824, + "step": 5123 + }, + { + "epoch": 0.405941770647653, + "grad_norm": 1.546340158040832, + "learning_rate": 1.3459154269293443e-05, + "loss": 0.3168, + "step": 5124 + }, + { + "epoch": 0.40602099425628835, + "grad_norm": 1.7077793404017203, + "learning_rate": 1.3456746535881872e-05, + "loss": 0.4051, + "step": 5125 + }, + { + "epoch": 0.40610021786492373, + "grad_norm": 2.377896012436734, + "learning_rate": 1.3454338574860175e-05, + "loss": 0.3813, + "step": 5126 + }, + { + "epoch": 0.4061794414735591, + "grad_norm": 1.432030837873591, + "learning_rate": 1.3451930386386902e-05, + "loss": 0.2947, + "step": 5127 + }, + { + "epoch": 0.4062586650821945, + "grad_norm": 1.7630241646227092, + "learning_rate": 1.3449521970620624e-05, + "loss": 0.327, + "step": 5128 + }, + { + "epoch": 0.4063378886908299, + "grad_norm": 1.495139451041156, + "learning_rate": 1.3447113327719923e-05, + "loss": 0.2847, + "step": 5129 + }, + { + "epoch": 0.40641711229946526, + "grad_norm": 1.5502926540053648, + "learning_rate": 1.3444704457843393e-05, + "loss": 0.3223, + "step": 5130 + }, + { + "epoch": 0.40649633590810064, + "grad_norm": 1.4049206586767469, + "learning_rate": 1.3442295361149651e-05, + "loss": 0.3057, + "step": 5131 + }, + { + "epoch": 0.40657555951673596, + "grad_norm": 1.2889872241005051, + "learning_rate": 1.3439886037797326e-05, + "loss": 0.2343, + "step": 5132 + }, + { + "epoch": 0.40665478312537134, + "grad_norm": 1.6026088694358938, + "learning_rate": 1.3437476487945051e-05, + "loss": 0.3364, + "step": 5133 + }, + { + "epoch": 0.4067340067340067, + "grad_norm": 1.5080351343819522, + "learning_rate": 1.3435066711751494e-05, + "loss": 0.2883, + "step": 5134 + }, + { + "epoch": 0.4068132303426421, + "grad_norm": 1.618867149979652, + "learning_rate": 1.343265670937532e-05, + "loss": 0.2965, + "step": 5135 + }, + { + "epoch": 0.4068924539512775, + "grad_norm": 1.442556926185253, + "learning_rate": 1.3430246480975218e-05, + "loss": 0.3287, + "step": 5136 + }, + { + "epoch": 0.40697167755991287, + "grad_norm": 1.8739691331582184, + "learning_rate": 1.3427836026709892e-05, + "loss": 0.4267, + "step": 5137 + }, + { + "epoch": 0.40705090116854825, + "grad_norm": 1.345967067735683, + "learning_rate": 1.3425425346738057e-05, + "loss": 0.2856, + "step": 5138 + }, + { + "epoch": 0.4071301247771836, + "grad_norm": 1.6215449566175117, + "learning_rate": 1.3423014441218444e-05, + "loss": 0.3122, + "step": 5139 + }, + { + "epoch": 0.40720934838581896, + "grad_norm": 1.4485375277499877, + "learning_rate": 1.3420603310309805e-05, + "loss": 0.3134, + "step": 5140 + }, + { + "epoch": 0.40728857199445434, + "grad_norm": 1.0513686120861234, + "learning_rate": 1.3418191954170892e-05, + "loss": 0.2185, + "step": 5141 + }, + { + "epoch": 0.4073677956030897, + "grad_norm": 1.2751831024389073, + "learning_rate": 1.341578037296049e-05, + "loss": 0.2254, + "step": 5142 + }, + { + "epoch": 0.4074470192117251, + "grad_norm": 1.6091232455224238, + "learning_rate": 1.3413368566837384e-05, + "loss": 0.3887, + "step": 5143 + }, + { + "epoch": 0.4075262428203605, + "grad_norm": 1.3567058867828161, + "learning_rate": 1.341095653596038e-05, + "loss": 0.2665, + "step": 5144 + }, + { + "epoch": 0.40760546642899587, + "grad_norm": 1.602471551287939, + "learning_rate": 1.3408544280488305e-05, + "loss": 0.3403, + "step": 5145 + }, + { + "epoch": 0.4076846900376312, + "grad_norm": 1.6804636754184283, + "learning_rate": 1.3406131800579985e-05, + "loss": 0.2691, + "step": 5146 + }, + { + "epoch": 0.4077639136462666, + "grad_norm": 1.5258151815672445, + "learning_rate": 1.3403719096394276e-05, + "loss": 0.3424, + "step": 5147 + }, + { + "epoch": 0.40784313725490196, + "grad_norm": 1.4092434435781458, + "learning_rate": 1.3401306168090047e-05, + "loss": 0.2989, + "step": 5148 + }, + { + "epoch": 0.40792236086353734, + "grad_norm": 2.0043761547433965, + "learning_rate": 1.3398893015826166e-05, + "loss": 0.425, + "step": 5149 + }, + { + "epoch": 0.4080015844721727, + "grad_norm": 1.8712062050794998, + "learning_rate": 1.3396479639761541e-05, + "loss": 0.3765, + "step": 5150 + }, + { + "epoch": 0.4080808080808081, + "grad_norm": 1.5376564389726946, + "learning_rate": 1.3394066040055071e-05, + "loss": 0.4119, + "step": 5151 + }, + { + "epoch": 0.4081600316894435, + "grad_norm": 1.6465483678555928, + "learning_rate": 1.3391652216865682e-05, + "loss": 0.4312, + "step": 5152 + }, + { + "epoch": 0.4082392552980788, + "grad_norm": 1.386572013284321, + "learning_rate": 1.3389238170352318e-05, + "loss": 0.3636, + "step": 5153 + }, + { + "epoch": 0.4083184789067142, + "grad_norm": 1.3145816603053615, + "learning_rate": 1.3386823900673926e-05, + "loss": 0.2432, + "step": 5154 + }, + { + "epoch": 0.40839770251534957, + "grad_norm": 1.5352418769531913, + "learning_rate": 1.3384409407989475e-05, + "loss": 0.3208, + "step": 5155 + }, + { + "epoch": 0.40847692612398495, + "grad_norm": 1.4478943760331868, + "learning_rate": 1.3381994692457956e-05, + "loss": 0.2085, + "step": 5156 + }, + { + "epoch": 0.40855614973262033, + "grad_norm": 1.3582057956841456, + "learning_rate": 1.3379579754238354e-05, + "loss": 0.2672, + "step": 5157 + }, + { + "epoch": 0.4086353733412557, + "grad_norm": 1.595702451101812, + "learning_rate": 1.3377164593489687e-05, + "loss": 0.325, + "step": 5158 + }, + { + "epoch": 0.40871459694989104, + "grad_norm": 1.4022522124207797, + "learning_rate": 1.3374749210370983e-05, + "loss": 0.2432, + "step": 5159 + }, + { + "epoch": 0.4087938205585264, + "grad_norm": 1.3480377781733934, + "learning_rate": 1.3372333605041282e-05, + "loss": 0.2984, + "step": 5160 + }, + { + "epoch": 0.4088730441671618, + "grad_norm": 1.6983224458064181, + "learning_rate": 1.3369917777659638e-05, + "loss": 0.3731, + "step": 5161 + }, + { + "epoch": 0.4089522677757972, + "grad_norm": 1.12434265947815, + "learning_rate": 1.3367501728385124e-05, + "loss": 0.2076, + "step": 5162 + }, + { + "epoch": 0.40903149138443257, + "grad_norm": 1.2190963677722704, + "learning_rate": 1.3365085457376823e-05, + "loss": 0.2465, + "step": 5163 + }, + { + "epoch": 0.40911071499306795, + "grad_norm": 1.459942902902082, + "learning_rate": 1.336266896479384e-05, + "loss": 0.3297, + "step": 5164 + }, + { + "epoch": 0.40918993860170333, + "grad_norm": 1.6100273207675837, + "learning_rate": 1.3360252250795282e-05, + "loss": 0.3469, + "step": 5165 + }, + { + "epoch": 0.40926916221033866, + "grad_norm": 1.4067367717715846, + "learning_rate": 1.3357835315540281e-05, + "loss": 0.3151, + "step": 5166 + }, + { + "epoch": 0.40934838581897404, + "grad_norm": 1.419665094192966, + "learning_rate": 1.3355418159187988e-05, + "loss": 0.2393, + "step": 5167 + }, + { + "epoch": 0.4094276094276094, + "grad_norm": 1.7599246830886042, + "learning_rate": 1.335300078189755e-05, + "loss": 0.3986, + "step": 5168 + }, + { + "epoch": 0.4095068330362448, + "grad_norm": 1.2602699938197612, + "learning_rate": 1.3350583183828143e-05, + "loss": 0.2041, + "step": 5169 + }, + { + "epoch": 0.4095860566448802, + "grad_norm": 1.3599322010416612, + "learning_rate": 1.3348165365138956e-05, + "loss": 0.2781, + "step": 5170 + }, + { + "epoch": 0.40966528025351556, + "grad_norm": 1.3036849355719582, + "learning_rate": 1.3345747325989188e-05, + "loss": 0.2763, + "step": 5171 + }, + { + "epoch": 0.40974450386215094, + "grad_norm": 1.5881215934325086, + "learning_rate": 1.3343329066538064e-05, + "loss": 0.3535, + "step": 5172 + }, + { + "epoch": 0.40982372747078627, + "grad_norm": 1.6258469513641862, + "learning_rate": 1.3340910586944805e-05, + "loss": 0.2677, + "step": 5173 + }, + { + "epoch": 0.40990295107942165, + "grad_norm": 1.4849707647922123, + "learning_rate": 1.3338491887368656e-05, + "loss": 0.2681, + "step": 5174 + }, + { + "epoch": 0.40998217468805703, + "grad_norm": 1.8341736007917508, + "learning_rate": 1.3336072967968882e-05, + "loss": 0.3178, + "step": 5175 + }, + { + "epoch": 0.4100613982966924, + "grad_norm": 1.3491635824228365, + "learning_rate": 1.3333653828904755e-05, + "loss": 0.2803, + "step": 5176 + }, + { + "epoch": 0.4101406219053278, + "grad_norm": 1.474192346727405, + "learning_rate": 1.3331234470335566e-05, + "loss": 0.382, + "step": 5177 + }, + { + "epoch": 0.4102198455139632, + "grad_norm": 1.191256819233283, + "learning_rate": 1.3328814892420613e-05, + "loss": 0.2027, + "step": 5178 + }, + { + "epoch": 0.41029906912259856, + "grad_norm": 1.6130990837673658, + "learning_rate": 1.3326395095319218e-05, + "loss": 0.3884, + "step": 5179 + }, + { + "epoch": 0.4103782927312339, + "grad_norm": 1.4273983030998831, + "learning_rate": 1.3323975079190713e-05, + "loss": 0.3146, + "step": 5180 + }, + { + "epoch": 0.41045751633986927, + "grad_norm": 1.7431163848350197, + "learning_rate": 1.332155484419444e-05, + "loss": 0.3889, + "step": 5181 + }, + { + "epoch": 0.41053673994850465, + "grad_norm": 1.4615864282767486, + "learning_rate": 1.3319134390489765e-05, + "loss": 0.339, + "step": 5182 + }, + { + "epoch": 0.41061596355714003, + "grad_norm": 1.6573437351140943, + "learning_rate": 1.3316713718236061e-05, + "loss": 0.4215, + "step": 5183 + }, + { + "epoch": 0.4106951871657754, + "grad_norm": 1.6556506961283273, + "learning_rate": 1.3314292827592716e-05, + "loss": 0.3992, + "step": 5184 + }, + { + "epoch": 0.4107744107744108, + "grad_norm": 1.4842305462675711, + "learning_rate": 1.3311871718719137e-05, + "loss": 0.2754, + "step": 5185 + }, + { + "epoch": 0.4108536343830462, + "grad_norm": 1.336339609771554, + "learning_rate": 1.330945039177474e-05, + "loss": 0.2304, + "step": 5186 + }, + { + "epoch": 0.4109328579916815, + "grad_norm": 1.2231715882263132, + "learning_rate": 1.3307028846918958e-05, + "loss": 0.2473, + "step": 5187 + }, + { + "epoch": 0.4110120816003169, + "grad_norm": 1.5235206339847889, + "learning_rate": 1.3304607084311246e-05, + "loss": 0.3613, + "step": 5188 + }, + { + "epoch": 0.41109130520895226, + "grad_norm": 1.609900385807517, + "learning_rate": 1.3302185104111049e-05, + "loss": 0.4396, + "step": 5189 + }, + { + "epoch": 0.41117052881758764, + "grad_norm": 2.1327215250766947, + "learning_rate": 1.3299762906477855e-05, + "loss": 0.4076, + "step": 5190 + }, + { + "epoch": 0.411249752426223, + "grad_norm": 1.2616874660468511, + "learning_rate": 1.3297340491571153e-05, + "loss": 0.3311, + "step": 5191 + }, + { + "epoch": 0.4113289760348584, + "grad_norm": 1.6064882997861947, + "learning_rate": 1.3294917859550444e-05, + "loss": 0.316, + "step": 5192 + }, + { + "epoch": 0.4114081996434938, + "grad_norm": 1.465321134095432, + "learning_rate": 1.3292495010575249e-05, + "loss": 0.2429, + "step": 5193 + }, + { + "epoch": 0.4114874232521291, + "grad_norm": 1.2818824094209373, + "learning_rate": 1.3290071944805099e-05, + "loss": 0.231, + "step": 5194 + }, + { + "epoch": 0.4115666468607645, + "grad_norm": 1.6035677355084266, + "learning_rate": 1.3287648662399544e-05, + "loss": 0.3876, + "step": 5195 + }, + { + "epoch": 0.4116458704693999, + "grad_norm": 1.406564577989663, + "learning_rate": 1.3285225163518141e-05, + "loss": 0.3723, + "step": 5196 + }, + { + "epoch": 0.41172509407803526, + "grad_norm": 1.6169480290303706, + "learning_rate": 1.328280144832047e-05, + "loss": 0.3449, + "step": 5197 + }, + { + "epoch": 0.41180431768667064, + "grad_norm": 1.7619077768442422, + "learning_rate": 1.3280377516966118e-05, + "loss": 0.2563, + "step": 5198 + }, + { + "epoch": 0.411883541295306, + "grad_norm": 1.5295708545348026, + "learning_rate": 1.3277953369614696e-05, + "loss": 0.1938, + "step": 5199 + }, + { + "epoch": 0.41196276490394135, + "grad_norm": 1.3755280072067326, + "learning_rate": 1.3275529006425808e-05, + "loss": 0.2395, + "step": 5200 + }, + { + "epoch": 0.41204198851257673, + "grad_norm": 1.2322555994151438, + "learning_rate": 1.3273104427559102e-05, + "loss": 0.178, + "step": 5201 + }, + { + "epoch": 0.4121212121212121, + "grad_norm": 1.5426966514076645, + "learning_rate": 1.3270679633174219e-05, + "loss": 0.3087, + "step": 5202 + }, + { + "epoch": 0.4122004357298475, + "grad_norm": 1.2069032494845653, + "learning_rate": 1.3268254623430817e-05, + "loss": 0.1516, + "step": 5203 + }, + { + "epoch": 0.4122796593384829, + "grad_norm": 1.4279471998876085, + "learning_rate": 1.3265829398488576e-05, + "loss": 0.2815, + "step": 5204 + }, + { + "epoch": 0.41235888294711825, + "grad_norm": 1.3322301509141183, + "learning_rate": 1.3263403958507181e-05, + "loss": 0.2021, + "step": 5205 + }, + { + "epoch": 0.41243810655575364, + "grad_norm": 1.1180406077512783, + "learning_rate": 1.326097830364634e-05, + "loss": 0.2739, + "step": 5206 + }, + { + "epoch": 0.41251733016438896, + "grad_norm": 1.804246926901076, + "learning_rate": 1.3258552434065768e-05, + "loss": 0.3799, + "step": 5207 + }, + { + "epoch": 0.41259655377302434, + "grad_norm": 1.558428723551447, + "learning_rate": 1.3256126349925195e-05, + "loss": 0.3173, + "step": 5208 + }, + { + "epoch": 0.4126757773816597, + "grad_norm": 1.4241058954975696, + "learning_rate": 1.3253700051384371e-05, + "loss": 0.3708, + "step": 5209 + }, + { + "epoch": 0.4127550009902951, + "grad_norm": 1.5097313655449034, + "learning_rate": 1.3251273538603056e-05, + "loss": 0.2931, + "step": 5210 + }, + { + "epoch": 0.4128342245989305, + "grad_norm": 1.64220889074521, + "learning_rate": 1.3248846811741021e-05, + "loss": 0.3565, + "step": 5211 + }, + { + "epoch": 0.41291344820756587, + "grad_norm": 1.5268543740728318, + "learning_rate": 1.3246419870958056e-05, + "loss": 0.2583, + "step": 5212 + }, + { + "epoch": 0.41299267181620125, + "grad_norm": 1.7112226464572504, + "learning_rate": 1.3243992716413962e-05, + "loss": 0.3525, + "step": 5213 + }, + { + "epoch": 0.4130718954248366, + "grad_norm": 1.6035105814364647, + "learning_rate": 1.324156534826856e-05, + "loss": 0.3236, + "step": 5214 + }, + { + "epoch": 0.41315111903347196, + "grad_norm": 1.6213796882474591, + "learning_rate": 1.3239137766681675e-05, + "loss": 0.2291, + "step": 5215 + }, + { + "epoch": 0.41323034264210734, + "grad_norm": 1.69758930605804, + "learning_rate": 1.3236709971813153e-05, + "loss": 0.3082, + "step": 5216 + }, + { + "epoch": 0.4133095662507427, + "grad_norm": 1.2929577016598028, + "learning_rate": 1.3234281963822856e-05, + "loss": 0.289, + "step": 5217 + }, + { + "epoch": 0.4133887898593781, + "grad_norm": 1.5934629181630984, + "learning_rate": 1.3231853742870652e-05, + "loss": 0.3386, + "step": 5218 + }, + { + "epoch": 0.4134680134680135, + "grad_norm": 1.4110547869130234, + "learning_rate": 1.322942530911643e-05, + "loss": 0.3154, + "step": 5219 + }, + { + "epoch": 0.41354723707664887, + "grad_norm": 1.137906778375266, + "learning_rate": 1.3226996662720094e-05, + "loss": 0.2315, + "step": 5220 + }, + { + "epoch": 0.4136264606852842, + "grad_norm": 1.594671550607869, + "learning_rate": 1.322456780384155e-05, + "loss": 0.3952, + "step": 5221 + }, + { + "epoch": 0.4137056842939196, + "grad_norm": 1.412784278604694, + "learning_rate": 1.3222138732640732e-05, + "loss": 0.2651, + "step": 5222 + }, + { + "epoch": 0.41378490790255495, + "grad_norm": 1.4133644066446025, + "learning_rate": 1.3219709449277584e-05, + "loss": 0.2917, + "step": 5223 + }, + { + "epoch": 0.41386413151119034, + "grad_norm": 1.6966961337699802, + "learning_rate": 1.3217279953912061e-05, + "loss": 0.3627, + "step": 5224 + }, + { + "epoch": 0.4139433551198257, + "grad_norm": 1.4802565286774627, + "learning_rate": 1.3214850246704134e-05, + "loss": 0.3051, + "step": 5225 + }, + { + "epoch": 0.4140225787284611, + "grad_norm": 1.5168767869460236, + "learning_rate": 1.3212420327813789e-05, + "loss": 0.2811, + "step": 5226 + }, + { + "epoch": 0.4141018023370965, + "grad_norm": 1.759120769981136, + "learning_rate": 1.3209990197401016e-05, + "loss": 0.3254, + "step": 5227 + }, + { + "epoch": 0.4141810259457318, + "grad_norm": 1.379197883649193, + "learning_rate": 1.3207559855625842e-05, + "loss": 0.3107, + "step": 5228 + }, + { + "epoch": 0.4142602495543672, + "grad_norm": 1.539297375233917, + "learning_rate": 1.3205129302648282e-05, + "loss": 0.3664, + "step": 5229 + }, + { + "epoch": 0.41433947316300257, + "grad_norm": 1.6106230234218317, + "learning_rate": 1.3202698538628376e-05, + "loss": 0.3357, + "step": 5230 + }, + { + "epoch": 0.41441869677163795, + "grad_norm": 1.8049845672565505, + "learning_rate": 1.3200267563726187e-05, + "loss": 0.3857, + "step": 5231 + }, + { + "epoch": 0.41449792038027333, + "grad_norm": 2.007341944420089, + "learning_rate": 1.3197836378101773e-05, + "loss": 0.4556, + "step": 5232 + }, + { + "epoch": 0.4145771439889087, + "grad_norm": 1.8360264118700633, + "learning_rate": 1.3195404981915223e-05, + "loss": 0.3389, + "step": 5233 + }, + { + "epoch": 0.4146563675975441, + "grad_norm": 1.523470834230146, + "learning_rate": 1.3192973375326635e-05, + "loss": 0.374, + "step": 5234 + }, + { + "epoch": 0.4147355912061794, + "grad_norm": 1.5842085704016085, + "learning_rate": 1.3190541558496106e-05, + "loss": 0.3783, + "step": 5235 + }, + { + "epoch": 0.4148148148148148, + "grad_norm": 1.4553986164021036, + "learning_rate": 1.318810953158377e-05, + "loss": 0.3548, + "step": 5236 + }, + { + "epoch": 0.4148940384234502, + "grad_norm": 1.4820043409827646, + "learning_rate": 1.3185677294749763e-05, + "loss": 0.348, + "step": 5237 + }, + { + "epoch": 0.41497326203208557, + "grad_norm": 1.372925911489726, + "learning_rate": 1.3183244848154232e-05, + "loss": 0.3695, + "step": 5238 + }, + { + "epoch": 0.41505248564072095, + "grad_norm": 1.5564712000250092, + "learning_rate": 1.3180812191957346e-05, + "loss": 0.403, + "step": 5239 + }, + { + "epoch": 0.41513170924935633, + "grad_norm": 1.2778426771006124, + "learning_rate": 1.3178379326319284e-05, + "loss": 0.3558, + "step": 5240 + }, + { + "epoch": 0.41521093285799165, + "grad_norm": 1.5609390757786532, + "learning_rate": 1.3175946251400234e-05, + "loss": 0.3478, + "step": 5241 + }, + { + "epoch": 0.41529015646662704, + "grad_norm": 1.141077493648826, + "learning_rate": 1.3173512967360406e-05, + "loss": 0.2096, + "step": 5242 + }, + { + "epoch": 0.4153693800752624, + "grad_norm": 1.453801804037298, + "learning_rate": 1.317107947436002e-05, + "loss": 0.4164, + "step": 5243 + }, + { + "epoch": 0.4154486036838978, + "grad_norm": 1.4209188508433634, + "learning_rate": 1.3168645772559308e-05, + "loss": 0.3488, + "step": 5244 + }, + { + "epoch": 0.4155278272925332, + "grad_norm": 1.4395965238873254, + "learning_rate": 1.3166211862118519e-05, + "loss": 0.2656, + "step": 5245 + }, + { + "epoch": 0.41560705090116856, + "grad_norm": 1.4795990756278954, + "learning_rate": 1.3163777743197912e-05, + "loss": 0.265, + "step": 5246 + }, + { + "epoch": 0.41568627450980394, + "grad_norm": 1.344745241180794, + "learning_rate": 1.3161343415957767e-05, + "loss": 0.296, + "step": 5247 + }, + { + "epoch": 0.41576549811843927, + "grad_norm": 1.1695065089717933, + "learning_rate": 1.3158908880558366e-05, + "loss": 0.1817, + "step": 5248 + }, + { + "epoch": 0.41584472172707465, + "grad_norm": 1.3687761894189339, + "learning_rate": 1.3156474137160015e-05, + "loss": 0.3404, + "step": 5249 + }, + { + "epoch": 0.41592394533571003, + "grad_norm": 1.9831913212191044, + "learning_rate": 1.3154039185923034e-05, + "loss": 0.4215, + "step": 5250 + }, + { + "epoch": 0.4160031689443454, + "grad_norm": 1.6383419899854181, + "learning_rate": 1.3151604027007744e-05, + "loss": 0.4112, + "step": 5251 + }, + { + "epoch": 0.4160823925529808, + "grad_norm": 1.1759785023432148, + "learning_rate": 1.3149168660574495e-05, + "loss": 0.2422, + "step": 5252 + }, + { + "epoch": 0.4161616161616162, + "grad_norm": 1.7063921866718448, + "learning_rate": 1.3146733086783646e-05, + "loss": 0.3722, + "step": 5253 + }, + { + "epoch": 0.41624083977025156, + "grad_norm": 1.5826022937986772, + "learning_rate": 1.3144297305795559e-05, + "loss": 0.2803, + "step": 5254 + }, + { + "epoch": 0.4163200633788869, + "grad_norm": 1.6130969620528077, + "learning_rate": 1.3141861317770628e-05, + "loss": 0.3097, + "step": 5255 + }, + { + "epoch": 0.41639928698752227, + "grad_norm": 1.4575279513781108, + "learning_rate": 1.3139425122869244e-05, + "loss": 0.3608, + "step": 5256 + }, + { + "epoch": 0.41647851059615765, + "grad_norm": 1.2969171877071193, + "learning_rate": 1.3136988721251823e-05, + "loss": 0.3114, + "step": 5257 + }, + { + "epoch": 0.41655773420479303, + "grad_norm": 1.7477472012265047, + "learning_rate": 1.3134552113078788e-05, + "loss": 0.3671, + "step": 5258 + }, + { + "epoch": 0.4166369578134284, + "grad_norm": 1.5858059611899067, + "learning_rate": 1.3132115298510579e-05, + "loss": 0.347, + "step": 5259 + }, + { + "epoch": 0.4167161814220638, + "grad_norm": 1.466729421542365, + "learning_rate": 1.312967827770765e-05, + "loss": 0.3881, + "step": 5260 + }, + { + "epoch": 0.4167954050306992, + "grad_norm": 1.7151680532077944, + "learning_rate": 1.3127241050830463e-05, + "loss": 0.4049, + "step": 5261 + }, + { + "epoch": 0.4168746286393345, + "grad_norm": 1.5098726250968266, + "learning_rate": 1.3124803618039501e-05, + "loss": 0.2808, + "step": 5262 + }, + { + "epoch": 0.4169538522479699, + "grad_norm": 1.3477863666910488, + "learning_rate": 1.3122365979495259e-05, + "loss": 0.2731, + "step": 5263 + }, + { + "epoch": 0.41703307585660526, + "grad_norm": 1.2699422428056057, + "learning_rate": 1.3119928135358238e-05, + "loss": 0.2585, + "step": 5264 + }, + { + "epoch": 0.41711229946524064, + "grad_norm": 1.4970157185764927, + "learning_rate": 1.3117490085788963e-05, + "loss": 0.3108, + "step": 5265 + }, + { + "epoch": 0.417191523073876, + "grad_norm": 1.6236409562923515, + "learning_rate": 1.3115051830947966e-05, + "loss": 0.4194, + "step": 5266 + }, + { + "epoch": 0.4172707466825114, + "grad_norm": 1.283106924272449, + "learning_rate": 1.3112613370995792e-05, + "loss": 0.3093, + "step": 5267 + }, + { + "epoch": 0.4173499702911468, + "grad_norm": 1.4710991651014518, + "learning_rate": 1.3110174706093007e-05, + "loss": 0.3513, + "step": 5268 + }, + { + "epoch": 0.4174291938997821, + "grad_norm": 1.5367612462046478, + "learning_rate": 1.3107735836400184e-05, + "loss": 0.3399, + "step": 5269 + }, + { + "epoch": 0.4175084175084175, + "grad_norm": 1.5998705516764091, + "learning_rate": 1.3105296762077906e-05, + "loss": 0.3249, + "step": 5270 + }, + { + "epoch": 0.4175876411170529, + "grad_norm": 1.1361656401124913, + "learning_rate": 1.3102857483286781e-05, + "loss": 0.216, + "step": 5271 + }, + { + "epoch": 0.41766686472568826, + "grad_norm": 1.7112437910284877, + "learning_rate": 1.310041800018742e-05, + "loss": 0.3659, + "step": 5272 + }, + { + "epoch": 0.41774608833432364, + "grad_norm": 1.4668141398726744, + "learning_rate": 1.3097978312940453e-05, + "loss": 0.3346, + "step": 5273 + }, + { + "epoch": 0.417825311942959, + "grad_norm": 1.7919395749676035, + "learning_rate": 1.309553842170652e-05, + "loss": 0.2925, + "step": 5274 + }, + { + "epoch": 0.4179045355515944, + "grad_norm": 1.3011750576725445, + "learning_rate": 1.3093098326646277e-05, + "loss": 0.25, + "step": 5275 + }, + { + "epoch": 0.41798375916022973, + "grad_norm": 1.2571443241609823, + "learning_rate": 1.3090658027920391e-05, + "loss": 0.2126, + "step": 5276 + }, + { + "epoch": 0.4180629827688651, + "grad_norm": 1.6601330619247827, + "learning_rate": 1.3088217525689546e-05, + "loss": 0.4192, + "step": 5277 + }, + { + "epoch": 0.4181422063775005, + "grad_norm": 1.3512499179836552, + "learning_rate": 1.3085776820114435e-05, + "loss": 0.2721, + "step": 5278 + }, + { + "epoch": 0.4182214299861359, + "grad_norm": 1.6193294689001956, + "learning_rate": 1.3083335911355768e-05, + "loss": 0.3656, + "step": 5279 + }, + { + "epoch": 0.41830065359477125, + "grad_norm": 1.4947321244634302, + "learning_rate": 1.3080894799574271e-05, + "loss": 0.3214, + "step": 5280 + }, + { + "epoch": 0.41837987720340664, + "grad_norm": 1.6548284031422333, + "learning_rate": 1.3078453484930674e-05, + "loss": 0.332, + "step": 5281 + }, + { + "epoch": 0.41845910081204196, + "grad_norm": 1.2878359525580316, + "learning_rate": 1.3076011967585727e-05, + "loss": 0.2698, + "step": 5282 + }, + { + "epoch": 0.41853832442067734, + "grad_norm": 1.5570065632798822, + "learning_rate": 1.3073570247700192e-05, + "loss": 0.3878, + "step": 5283 + }, + { + "epoch": 0.4186175480293127, + "grad_norm": 1.6400304463041429, + "learning_rate": 1.3071128325434845e-05, + "loss": 0.3951, + "step": 5284 + }, + { + "epoch": 0.4186967716379481, + "grad_norm": 1.3379521115779618, + "learning_rate": 1.3068686200950475e-05, + "loss": 0.2309, + "step": 5285 + }, + { + "epoch": 0.4187759952465835, + "grad_norm": 1.2955472859032604, + "learning_rate": 1.3066243874407886e-05, + "loss": 0.2528, + "step": 5286 + }, + { + "epoch": 0.41885521885521887, + "grad_norm": 1.429669254020428, + "learning_rate": 1.306380134596789e-05, + "loss": 0.237, + "step": 5287 + }, + { + "epoch": 0.41893444246385425, + "grad_norm": 1.4364730899683567, + "learning_rate": 1.306135861579132e-05, + "loss": 0.3382, + "step": 5288 + }, + { + "epoch": 0.4190136660724896, + "grad_norm": 1.3358004572151094, + "learning_rate": 1.3058915684039013e-05, + "loss": 0.2657, + "step": 5289 + }, + { + "epoch": 0.41909288968112496, + "grad_norm": 1.2371880112514608, + "learning_rate": 1.3056472550871829e-05, + "loss": 0.2399, + "step": 5290 + }, + { + "epoch": 0.41917211328976034, + "grad_norm": 1.4502287582672058, + "learning_rate": 1.3054029216450632e-05, + "loss": 0.3935, + "step": 5291 + }, + { + "epoch": 0.4192513368983957, + "grad_norm": 1.6608036733376232, + "learning_rate": 1.3051585680936305e-05, + "loss": 0.3826, + "step": 5292 + }, + { + "epoch": 0.4193305605070311, + "grad_norm": 1.3671963038191222, + "learning_rate": 1.304914194448975e-05, + "loss": 0.2501, + "step": 5293 + }, + { + "epoch": 0.4194097841156665, + "grad_norm": 1.7349716703237739, + "learning_rate": 1.3046698007271864e-05, + "loss": 0.3201, + "step": 5294 + }, + { + "epoch": 0.41948900772430187, + "grad_norm": 1.501184102283557, + "learning_rate": 1.3044253869443575e-05, + "loss": 0.3276, + "step": 5295 + }, + { + "epoch": 0.4195682313329372, + "grad_norm": 1.3467820580860035, + "learning_rate": 1.3041809531165819e-05, + "loss": 0.2406, + "step": 5296 + }, + { + "epoch": 0.4196474549415726, + "grad_norm": 1.974548855605805, + "learning_rate": 1.3039364992599538e-05, + "loss": 0.4517, + "step": 5297 + }, + { + "epoch": 0.41972667855020795, + "grad_norm": 1.3849372161870959, + "learning_rate": 1.30369202539057e-05, + "loss": 0.345, + "step": 5298 + }, + { + "epoch": 0.41980590215884334, + "grad_norm": 1.305057581540092, + "learning_rate": 1.3034475315245273e-05, + "loss": 0.338, + "step": 5299 + }, + { + "epoch": 0.4198851257674787, + "grad_norm": 1.654283897611442, + "learning_rate": 1.303203017677925e-05, + "loss": 0.2951, + "step": 5300 + }, + { + "epoch": 0.4199643493761141, + "grad_norm": 1.524409840224814, + "learning_rate": 1.302958483866863e-05, + "loss": 0.396, + "step": 5301 + }, + { + "epoch": 0.4200435729847495, + "grad_norm": 1.292551888311837, + "learning_rate": 1.3027139301074423e-05, + "loss": 0.3093, + "step": 5302 + }, + { + "epoch": 0.4201227965933848, + "grad_norm": 1.1337498032506503, + "learning_rate": 1.3024693564157658e-05, + "loss": 0.2558, + "step": 5303 + }, + { + "epoch": 0.4202020202020202, + "grad_norm": 1.5746279119087407, + "learning_rate": 1.3022247628079381e-05, + "loss": 0.4044, + "step": 5304 + }, + { + "epoch": 0.42028124381065557, + "grad_norm": 1.6976577336693552, + "learning_rate": 1.3019801493000634e-05, + "loss": 0.2844, + "step": 5305 + }, + { + "epoch": 0.42036046741929095, + "grad_norm": 1.2878362062039947, + "learning_rate": 1.3017355159082495e-05, + "loss": 0.2351, + "step": 5306 + }, + { + "epoch": 0.42043969102792633, + "grad_norm": 1.2969092940145905, + "learning_rate": 1.3014908626486032e-05, + "loss": 0.295, + "step": 5307 + }, + { + "epoch": 0.4205189146365617, + "grad_norm": 1.2338148705584753, + "learning_rate": 1.3012461895372343e-05, + "loss": 0.2373, + "step": 5308 + }, + { + "epoch": 0.4205981382451971, + "grad_norm": 1.449628486215075, + "learning_rate": 1.3010014965902535e-05, + "loss": 0.2644, + "step": 5309 + }, + { + "epoch": 0.4206773618538324, + "grad_norm": 1.5930273494525637, + "learning_rate": 1.3007567838237725e-05, + "loss": 0.2788, + "step": 5310 + }, + { + "epoch": 0.4207565854624678, + "grad_norm": 1.753344167154168, + "learning_rate": 1.3005120512539042e-05, + "loss": 0.3698, + "step": 5311 + }, + { + "epoch": 0.4208358090711032, + "grad_norm": 1.2824667671437335, + "learning_rate": 1.300267298896764e-05, + "loss": 0.2971, + "step": 5312 + }, + { + "epoch": 0.42091503267973857, + "grad_norm": 1.6780009824844848, + "learning_rate": 1.3000225267684663e-05, + "loss": 0.3351, + "step": 5313 + }, + { + "epoch": 0.42099425628837395, + "grad_norm": 1.7324769919343599, + "learning_rate": 1.2997777348851288e-05, + "loss": 0.3984, + "step": 5314 + }, + { + "epoch": 0.42107347989700933, + "grad_norm": 1.7636825950419772, + "learning_rate": 1.2995329232628702e-05, + "loss": 0.3157, + "step": 5315 + }, + { + "epoch": 0.42115270350564465, + "grad_norm": 1.4770237736369367, + "learning_rate": 1.2992880919178097e-05, + "loss": 0.3159, + "step": 5316 + }, + { + "epoch": 0.42123192711428004, + "grad_norm": 1.3724093028210702, + "learning_rate": 1.2990432408660682e-05, + "loss": 0.2361, + "step": 5317 + }, + { + "epoch": 0.4213111507229154, + "grad_norm": 1.424386207497933, + "learning_rate": 1.2987983701237688e-05, + "loss": 0.3355, + "step": 5318 + }, + { + "epoch": 0.4213903743315508, + "grad_norm": 1.8857244313966501, + "learning_rate": 1.298553479707034e-05, + "loss": 0.4112, + "step": 5319 + }, + { + "epoch": 0.4214695979401862, + "grad_norm": 1.505656407223853, + "learning_rate": 1.2983085696319892e-05, + "loss": 0.3332, + "step": 5320 + }, + { + "epoch": 0.42154882154882156, + "grad_norm": 1.1562259124849663, + "learning_rate": 1.2980636399147606e-05, + "loss": 0.2403, + "step": 5321 + }, + { + "epoch": 0.42162804515745694, + "grad_norm": 1.4254104253632018, + "learning_rate": 1.2978186905714752e-05, + "loss": 0.2889, + "step": 5322 + }, + { + "epoch": 0.42170726876609227, + "grad_norm": 1.5178854444436547, + "learning_rate": 1.2975737216182625e-05, + "loss": 0.3153, + "step": 5323 + }, + { + "epoch": 0.42178649237472765, + "grad_norm": 1.2456095005863437, + "learning_rate": 1.2973287330712516e-05, + "loss": 0.2159, + "step": 5324 + }, + { + "epoch": 0.42186571598336303, + "grad_norm": 1.273418659889706, + "learning_rate": 1.2970837249465746e-05, + "loss": 0.3266, + "step": 5325 + }, + { + "epoch": 0.4219449395919984, + "grad_norm": 1.6878548988442128, + "learning_rate": 1.2968386972603635e-05, + "loss": 0.3588, + "step": 5326 + }, + { + "epoch": 0.4220241632006338, + "grad_norm": 1.3721699907131348, + "learning_rate": 1.2965936500287526e-05, + "loss": 0.2709, + "step": 5327 + }, + { + "epoch": 0.4221033868092692, + "grad_norm": 1.2987767090340112, + "learning_rate": 1.2963485832678772e-05, + "loss": 0.2547, + "step": 5328 + }, + { + "epoch": 0.42218261041790456, + "grad_norm": 1.2379056712049545, + "learning_rate": 1.2961034969938732e-05, + "loss": 0.2219, + "step": 5329 + }, + { + "epoch": 0.4222618340265399, + "grad_norm": 1.6189478079826551, + "learning_rate": 1.2958583912228785e-05, + "loss": 0.381, + "step": 5330 + }, + { + "epoch": 0.42234105763517527, + "grad_norm": 1.3733530317994096, + "learning_rate": 1.295613265971033e-05, + "loss": 0.2324, + "step": 5331 + }, + { + "epoch": 0.42242028124381065, + "grad_norm": 1.5952434646516784, + "learning_rate": 1.2953681212544757e-05, + "loss": 0.3252, + "step": 5332 + }, + { + "epoch": 0.42249950485244603, + "grad_norm": 1.6695805122142962, + "learning_rate": 1.2951229570893493e-05, + "loss": 0.292, + "step": 5333 + }, + { + "epoch": 0.4225787284610814, + "grad_norm": 1.5465545260038185, + "learning_rate": 1.2948777734917961e-05, + "loss": 0.3228, + "step": 5334 + }, + { + "epoch": 0.4226579520697168, + "grad_norm": 1.742619333597175, + "learning_rate": 1.2946325704779602e-05, + "loss": 0.375, + "step": 5335 + }, + { + "epoch": 0.4227371756783522, + "grad_norm": 1.608051129156309, + "learning_rate": 1.2943873480639875e-05, + "loss": 0.3429, + "step": 5336 + }, + { + "epoch": 0.4228163992869875, + "grad_norm": 1.4473720798249001, + "learning_rate": 1.294142106266024e-05, + "loss": 0.391, + "step": 5337 + }, + { + "epoch": 0.4228956228956229, + "grad_norm": 1.817950454772865, + "learning_rate": 1.2938968451002183e-05, + "loss": 0.3997, + "step": 5338 + }, + { + "epoch": 0.42297484650425826, + "grad_norm": 1.6406967514579258, + "learning_rate": 1.2936515645827198e-05, + "loss": 0.4287, + "step": 5339 + }, + { + "epoch": 0.42305407011289364, + "grad_norm": 1.2732923627053705, + "learning_rate": 1.2934062647296783e-05, + "loss": 0.2551, + "step": 5340 + }, + { + "epoch": 0.423133293721529, + "grad_norm": 1.457510554627103, + "learning_rate": 1.2931609455572462e-05, + "loss": 0.3093, + "step": 5341 + }, + { + "epoch": 0.4232125173301644, + "grad_norm": 1.6648995220716367, + "learning_rate": 1.2929156070815765e-05, + "loss": 0.2919, + "step": 5342 + }, + { + "epoch": 0.4232917409387998, + "grad_norm": 1.5657625603190437, + "learning_rate": 1.2926702493188235e-05, + "loss": 0.3578, + "step": 5343 + }, + { + "epoch": 0.4233709645474351, + "grad_norm": 1.2598454697284138, + "learning_rate": 1.292424872285143e-05, + "loss": 0.26, + "step": 5344 + }, + { + "epoch": 0.4234501881560705, + "grad_norm": 1.5104531454803312, + "learning_rate": 1.2921794759966913e-05, + "loss": 0.2762, + "step": 5345 + }, + { + "epoch": 0.4235294117647059, + "grad_norm": 1.290372724031699, + "learning_rate": 1.2919340604696272e-05, + "loss": 0.2844, + "step": 5346 + }, + { + "epoch": 0.42360863537334126, + "grad_norm": 1.3172941870763695, + "learning_rate": 1.29168862572011e-05, + "loss": 0.348, + "step": 5347 + }, + { + "epoch": 0.42368785898197664, + "grad_norm": 1.206274560758607, + "learning_rate": 1.2914431717643e-05, + "loss": 0.3658, + "step": 5348 + }, + { + "epoch": 0.423767082590612, + "grad_norm": 1.4670086427942317, + "learning_rate": 1.2911976986183598e-05, + "loss": 0.3358, + "step": 5349 + }, + { + "epoch": 0.4238463061992474, + "grad_norm": 1.289335118878106, + "learning_rate": 1.2909522062984524e-05, + "loss": 0.287, + "step": 5350 + }, + { + "epoch": 0.42392552980788273, + "grad_norm": 1.3051643071168066, + "learning_rate": 1.290706694820742e-05, + "loss": 0.2865, + "step": 5351 + }, + { + "epoch": 0.4240047534165181, + "grad_norm": 1.2088286681138052, + "learning_rate": 1.2904611642013945e-05, + "loss": 0.2114, + "step": 5352 + }, + { + "epoch": 0.4240839770251535, + "grad_norm": 1.4230314253859666, + "learning_rate": 1.2902156144565769e-05, + "loss": 0.3252, + "step": 5353 + }, + { + "epoch": 0.4241632006337889, + "grad_norm": 1.1818993567321534, + "learning_rate": 1.2899700456024576e-05, + "loss": 0.2558, + "step": 5354 + }, + { + "epoch": 0.42424242424242425, + "grad_norm": 1.6018433101059462, + "learning_rate": 1.2897244576552062e-05, + "loss": 0.3696, + "step": 5355 + }, + { + "epoch": 0.42432164785105964, + "grad_norm": 1.3115637245824774, + "learning_rate": 1.289478850630993e-05, + "loss": 0.2671, + "step": 5356 + }, + { + "epoch": 0.42440087145969496, + "grad_norm": 1.4131241566130903, + "learning_rate": 1.2892332245459904e-05, + "loss": 0.338, + "step": 5357 + }, + { + "epoch": 0.42448009506833034, + "grad_norm": 1.3185846469905353, + "learning_rate": 1.288987579416372e-05, + "loss": 0.2558, + "step": 5358 + }, + { + "epoch": 0.4245593186769657, + "grad_norm": 1.1992209755196688, + "learning_rate": 1.2887419152583117e-05, + "loss": 0.2746, + "step": 5359 + }, + { + "epoch": 0.4246385422856011, + "grad_norm": 1.4760978780529286, + "learning_rate": 1.2884962320879857e-05, + "loss": 0.3418, + "step": 5360 + }, + { + "epoch": 0.4247177658942365, + "grad_norm": 1.8056725131778988, + "learning_rate": 1.2882505299215711e-05, + "loss": 0.388, + "step": 5361 + }, + { + "epoch": 0.42479698950287187, + "grad_norm": 1.321283591645511, + "learning_rate": 1.288004808775246e-05, + "loss": 0.2465, + "step": 5362 + }, + { + "epoch": 0.42487621311150725, + "grad_norm": 1.2013034977072816, + "learning_rate": 1.28775906866519e-05, + "loss": 0.1646, + "step": 5363 + }, + { + "epoch": 0.4249554367201426, + "grad_norm": 1.7386095757802484, + "learning_rate": 1.2875133096075839e-05, + "loss": 0.3628, + "step": 5364 + }, + { + "epoch": 0.42503466032877796, + "grad_norm": 1.8435032525717856, + "learning_rate": 1.2872675316186096e-05, + "loss": 0.3344, + "step": 5365 + }, + { + "epoch": 0.42511388393741334, + "grad_norm": 1.6258185510807717, + "learning_rate": 1.2870217347144511e-05, + "loss": 0.3605, + "step": 5366 + }, + { + "epoch": 0.4251931075460487, + "grad_norm": 1.4388781415344534, + "learning_rate": 1.2867759189112921e-05, + "loss": 0.3127, + "step": 5367 + }, + { + "epoch": 0.4252723311546841, + "grad_norm": 1.1994881040489052, + "learning_rate": 1.2865300842253188e-05, + "loss": 0.2033, + "step": 5368 + }, + { + "epoch": 0.4253515547633195, + "grad_norm": 1.2516484779524133, + "learning_rate": 1.2862842306727181e-05, + "loss": 0.2895, + "step": 5369 + }, + { + "epoch": 0.42543077837195487, + "grad_norm": 1.434242516030503, + "learning_rate": 1.2860383582696783e-05, + "loss": 0.3022, + "step": 5370 + }, + { + "epoch": 0.4255100019805902, + "grad_norm": 1.3519392898244098, + "learning_rate": 1.2857924670323892e-05, + "loss": 0.3092, + "step": 5371 + }, + { + "epoch": 0.4255892255892256, + "grad_norm": 1.3830544966052511, + "learning_rate": 1.2855465569770407e-05, + "loss": 0.2783, + "step": 5372 + }, + { + "epoch": 0.42566844919786095, + "grad_norm": 1.3483132715233455, + "learning_rate": 1.2853006281198257e-05, + "loss": 0.2636, + "step": 5373 + }, + { + "epoch": 0.42574767280649634, + "grad_norm": 1.869963143411993, + "learning_rate": 1.2850546804769372e-05, + "loss": 0.4697, + "step": 5374 + }, + { + "epoch": 0.4258268964151317, + "grad_norm": 1.5025478573839384, + "learning_rate": 1.2848087140645695e-05, + "loss": 0.3994, + "step": 5375 + }, + { + "epoch": 0.4259061200237671, + "grad_norm": 1.1380436569733503, + "learning_rate": 1.2845627288989186e-05, + "loss": 0.2087, + "step": 5376 + }, + { + "epoch": 0.4259853436324025, + "grad_norm": 1.4749206660396121, + "learning_rate": 1.284316724996181e-05, + "loss": 0.2968, + "step": 5377 + }, + { + "epoch": 0.4260645672410378, + "grad_norm": 1.4431868450966434, + "learning_rate": 1.2840707023725552e-05, + "loss": 0.2731, + "step": 5378 + }, + { + "epoch": 0.4261437908496732, + "grad_norm": 1.2836643684982045, + "learning_rate": 1.2838246610442406e-05, + "loss": 0.2603, + "step": 5379 + }, + { + "epoch": 0.42622301445830857, + "grad_norm": 1.532947971592485, + "learning_rate": 1.2835786010274376e-05, + "loss": 0.2896, + "step": 5380 + }, + { + "epoch": 0.42630223806694395, + "grad_norm": 1.2125962378133146, + "learning_rate": 1.283332522338348e-05, + "loss": 0.2203, + "step": 5381 + }, + { + "epoch": 0.42638146167557933, + "grad_norm": 1.4430830714804839, + "learning_rate": 1.2830864249931756e-05, + "loss": 0.3699, + "step": 5382 + }, + { + "epoch": 0.4264606852842147, + "grad_norm": 1.4376294651538295, + "learning_rate": 1.2828403090081238e-05, + "loss": 0.3045, + "step": 5383 + }, + { + "epoch": 0.4265399088928501, + "grad_norm": 1.5464936587077387, + "learning_rate": 1.282594174399399e-05, + "loss": 0.3466, + "step": 5384 + }, + { + "epoch": 0.4266191325014854, + "grad_norm": 1.1663907507366942, + "learning_rate": 1.2823480211832073e-05, + "loss": 0.2369, + "step": 5385 + }, + { + "epoch": 0.4266983561101208, + "grad_norm": 1.381108449801392, + "learning_rate": 1.2821018493757569e-05, + "loss": 0.2858, + "step": 5386 + }, + { + "epoch": 0.4267775797187562, + "grad_norm": 1.4900068052255124, + "learning_rate": 1.2818556589932575e-05, + "loss": 0.3727, + "step": 5387 + }, + { + "epoch": 0.42685680332739157, + "grad_norm": 1.4090394852503019, + "learning_rate": 1.2816094500519188e-05, + "loss": 0.3098, + "step": 5388 + }, + { + "epoch": 0.42693602693602695, + "grad_norm": 1.6669878269675182, + "learning_rate": 1.2813632225679528e-05, + "loss": 0.3567, + "step": 5389 + }, + { + "epoch": 0.42701525054466233, + "grad_norm": 1.2099463386222742, + "learning_rate": 1.281116976557573e-05, + "loss": 0.2898, + "step": 5390 + }, + { + "epoch": 0.4270944741532977, + "grad_norm": 1.331961260766164, + "learning_rate": 1.2808707120369923e-05, + "loss": 0.2705, + "step": 5391 + }, + { + "epoch": 0.42717369776193304, + "grad_norm": 1.1794589464847258, + "learning_rate": 1.280624429022427e-05, + "loss": 0.1987, + "step": 5392 + }, + { + "epoch": 0.4272529213705684, + "grad_norm": 1.8481513262049374, + "learning_rate": 1.2803781275300933e-05, + "loss": 0.4254, + "step": 5393 + }, + { + "epoch": 0.4273321449792038, + "grad_norm": 1.3145203662750549, + "learning_rate": 1.2801318075762088e-05, + "loss": 0.3131, + "step": 5394 + }, + { + "epoch": 0.4274113685878392, + "grad_norm": 1.239401965793781, + "learning_rate": 1.2798854691769927e-05, + "loss": 0.277, + "step": 5395 + }, + { + "epoch": 0.42749059219647456, + "grad_norm": 1.3683252291993193, + "learning_rate": 1.2796391123486654e-05, + "loss": 0.2328, + "step": 5396 + }, + { + "epoch": 0.42756981580510994, + "grad_norm": 1.4362893318216192, + "learning_rate": 1.2793927371074477e-05, + "loss": 0.3275, + "step": 5397 + }, + { + "epoch": 0.42764903941374527, + "grad_norm": 1.1718506240670885, + "learning_rate": 1.279146343469563e-05, + "loss": 0.2528, + "step": 5398 + }, + { + "epoch": 0.42772826302238065, + "grad_norm": 1.3127499603795205, + "learning_rate": 1.2788999314512347e-05, + "loss": 0.1929, + "step": 5399 + }, + { + "epoch": 0.42780748663101603, + "grad_norm": 1.2428444141788408, + "learning_rate": 1.2786535010686879e-05, + "loss": 0.2068, + "step": 5400 + }, + { + "epoch": 0.4278867102396514, + "grad_norm": 1.285329046696986, + "learning_rate": 1.2784070523381487e-05, + "loss": 0.2293, + "step": 5401 + }, + { + "epoch": 0.4279659338482868, + "grad_norm": 1.4271090747242758, + "learning_rate": 1.2781605852758448e-05, + "loss": 0.1902, + "step": 5402 + }, + { + "epoch": 0.4280451574569222, + "grad_norm": 1.5853355287172861, + "learning_rate": 1.2779140998980048e-05, + "loss": 0.3348, + "step": 5403 + }, + { + "epoch": 0.42812438106555756, + "grad_norm": 1.7229634546243293, + "learning_rate": 1.2776675962208585e-05, + "loss": 0.3129, + "step": 5404 + }, + { + "epoch": 0.4282036046741929, + "grad_norm": 1.4588992325607515, + "learning_rate": 1.2774210742606368e-05, + "loss": 0.2887, + "step": 5405 + }, + { + "epoch": 0.42828282828282827, + "grad_norm": 1.4014347468319028, + "learning_rate": 1.2771745340335726e-05, + "loss": 0.3475, + "step": 5406 + }, + { + "epoch": 0.42836205189146365, + "grad_norm": 1.2241257501542995, + "learning_rate": 1.276927975555899e-05, + "loss": 0.2724, + "step": 5407 + }, + { + "epoch": 0.42844127550009903, + "grad_norm": 1.6433282330507506, + "learning_rate": 1.2766813988438505e-05, + "loss": 0.3178, + "step": 5408 + }, + { + "epoch": 0.4285204991087344, + "grad_norm": 1.4666289676114357, + "learning_rate": 1.2764348039136634e-05, + "loss": 0.3014, + "step": 5409 + }, + { + "epoch": 0.4285997227173698, + "grad_norm": 1.6626758567941506, + "learning_rate": 1.2761881907815744e-05, + "loss": 0.4482, + "step": 5410 + }, + { + "epoch": 0.4286789463260052, + "grad_norm": 1.3915045752123325, + "learning_rate": 1.275941559463822e-05, + "loss": 0.3292, + "step": 5411 + }, + { + "epoch": 0.4287581699346405, + "grad_norm": 1.3488420178464307, + "learning_rate": 1.2756949099766458e-05, + "loss": 0.3079, + "step": 5412 + }, + { + "epoch": 0.4288373935432759, + "grad_norm": 1.2761855978713215, + "learning_rate": 1.2754482423362861e-05, + "loss": 0.2771, + "step": 5413 + }, + { + "epoch": 0.42891661715191126, + "grad_norm": 1.4813392383177097, + "learning_rate": 1.2752015565589852e-05, + "loss": 0.3155, + "step": 5414 + }, + { + "epoch": 0.42899584076054664, + "grad_norm": 1.4125235626782122, + "learning_rate": 1.2749548526609858e-05, + "loss": 0.3407, + "step": 5415 + }, + { + "epoch": 0.429075064369182, + "grad_norm": 1.3543089874507992, + "learning_rate": 1.2747081306585325e-05, + "loss": 0.2857, + "step": 5416 + }, + { + "epoch": 0.4291542879778174, + "grad_norm": 1.388327874850087, + "learning_rate": 1.2744613905678707e-05, + "loss": 0.352, + "step": 5417 + }, + { + "epoch": 0.4292335115864528, + "grad_norm": 1.3170784812768765, + "learning_rate": 1.2742146324052466e-05, + "loss": 0.2004, + "step": 5418 + }, + { + "epoch": 0.4293127351950881, + "grad_norm": 1.7098624732253722, + "learning_rate": 1.273967856186909e-05, + "loss": 0.389, + "step": 5419 + }, + { + "epoch": 0.4293919588037235, + "grad_norm": 1.5161389576590416, + "learning_rate": 1.2737210619291058e-05, + "loss": 0.2684, + "step": 5420 + }, + { + "epoch": 0.4294711824123589, + "grad_norm": 1.4483997912609599, + "learning_rate": 1.2734742496480878e-05, + "loss": 0.3581, + "step": 5421 + }, + { + "epoch": 0.42955040602099426, + "grad_norm": 1.7315986979764633, + "learning_rate": 1.2732274193601066e-05, + "loss": 0.3944, + "step": 5422 + }, + { + "epoch": 0.42962962962962964, + "grad_norm": 1.70707699695665, + "learning_rate": 1.2729805710814142e-05, + "loss": 0.3951, + "step": 5423 + }, + { + "epoch": 0.429708853238265, + "grad_norm": 1.229800085616609, + "learning_rate": 1.2727337048282649e-05, + "loss": 0.3034, + "step": 5424 + }, + { + "epoch": 0.4297880768469004, + "grad_norm": 1.327922796851444, + "learning_rate": 1.2724868206169134e-05, + "loss": 0.2503, + "step": 5425 + }, + { + "epoch": 0.42986730045553573, + "grad_norm": 1.4223452121407245, + "learning_rate": 1.2722399184636158e-05, + "loss": 0.3023, + "step": 5426 + }, + { + "epoch": 0.4299465240641711, + "grad_norm": 1.5392242912135163, + "learning_rate": 1.2719929983846298e-05, + "loss": 0.2474, + "step": 5427 + }, + { + "epoch": 0.4300257476728065, + "grad_norm": 1.286712388834556, + "learning_rate": 1.2717460603962132e-05, + "loss": 0.3497, + "step": 5428 + }, + { + "epoch": 0.4301049712814419, + "grad_norm": 1.620424515101876, + "learning_rate": 1.2714991045146265e-05, + "loss": 0.3743, + "step": 5429 + }, + { + "epoch": 0.43018419489007725, + "grad_norm": 1.7969404020852162, + "learning_rate": 1.2712521307561298e-05, + "loss": 0.28, + "step": 5430 + }, + { + "epoch": 0.43026341849871264, + "grad_norm": 1.1727169216814828, + "learning_rate": 1.2710051391369857e-05, + "loss": 0.264, + "step": 5431 + }, + { + "epoch": 0.430342642107348, + "grad_norm": 1.363950459841746, + "learning_rate": 1.270758129673457e-05, + "loss": 0.3174, + "step": 5432 + }, + { + "epoch": 0.43042186571598334, + "grad_norm": 1.5429165350200493, + "learning_rate": 1.2705111023818083e-05, + "loss": 0.3709, + "step": 5433 + }, + { + "epoch": 0.4305010893246187, + "grad_norm": 1.3033745895471434, + "learning_rate": 1.2702640572783051e-05, + "loss": 0.224, + "step": 5434 + }, + { + "epoch": 0.4305803129332541, + "grad_norm": 1.8829247102128346, + "learning_rate": 1.2700169943792143e-05, + "loss": 0.366, + "step": 5435 + }, + { + "epoch": 0.4306595365418895, + "grad_norm": 1.1961494435101305, + "learning_rate": 1.2697699137008038e-05, + "loss": 0.2143, + "step": 5436 + }, + { + "epoch": 0.43073876015052487, + "grad_norm": 1.2330006922081789, + "learning_rate": 1.2695228152593419e-05, + "loss": 0.2745, + "step": 5437 + }, + { + "epoch": 0.43081798375916025, + "grad_norm": 1.3238045984361115, + "learning_rate": 1.2692756990710998e-05, + "loss": 0.3038, + "step": 5438 + }, + { + "epoch": 0.4308972073677956, + "grad_norm": 1.7456698958864234, + "learning_rate": 1.269028565152349e-05, + "loss": 0.2782, + "step": 5439 + }, + { + "epoch": 0.43097643097643096, + "grad_norm": 1.2910717810119143, + "learning_rate": 1.2687814135193613e-05, + "loss": 0.2525, + "step": 5440 + }, + { + "epoch": 0.43105565458506634, + "grad_norm": 1.2644365952577783, + "learning_rate": 1.2685342441884107e-05, + "loss": 0.2642, + "step": 5441 + }, + { + "epoch": 0.4311348781937017, + "grad_norm": 1.972364586257956, + "learning_rate": 1.2682870571757724e-05, + "loss": 0.4723, + "step": 5442 + }, + { + "epoch": 0.4312141018023371, + "grad_norm": 1.7539274480494045, + "learning_rate": 1.2680398524977222e-05, + "loss": 0.3905, + "step": 5443 + }, + { + "epoch": 0.4312933254109725, + "grad_norm": 1.8552369469881138, + "learning_rate": 1.2677926301705376e-05, + "loss": 0.3314, + "step": 5444 + }, + { + "epoch": 0.43137254901960786, + "grad_norm": 1.6734085615154801, + "learning_rate": 1.2675453902104967e-05, + "loss": 0.358, + "step": 5445 + }, + { + "epoch": 0.4314517726282432, + "grad_norm": 1.5373536220584754, + "learning_rate": 1.2672981326338793e-05, + "loss": 0.327, + "step": 5446 + }, + { + "epoch": 0.4315309962368786, + "grad_norm": 1.4209827127714774, + "learning_rate": 1.267050857456966e-05, + "loss": 0.3408, + "step": 5447 + }, + { + "epoch": 0.43161021984551395, + "grad_norm": 1.3832346716169261, + "learning_rate": 1.2668035646960384e-05, + "loss": 0.2813, + "step": 5448 + }, + { + "epoch": 0.43168944345414934, + "grad_norm": 1.4619540174563779, + "learning_rate": 1.2665562543673803e-05, + "loss": 0.3294, + "step": 5449 + }, + { + "epoch": 0.4317686670627847, + "grad_norm": 1.5011239471859592, + "learning_rate": 1.2663089264872751e-05, + "loss": 0.2726, + "step": 5450 + }, + { + "epoch": 0.4318478906714201, + "grad_norm": 1.054699979891497, + "learning_rate": 1.2660615810720087e-05, + "loss": 0.2814, + "step": 5451 + }, + { + "epoch": 0.4319271142800555, + "grad_norm": 1.3282021130922372, + "learning_rate": 1.2658142181378675e-05, + "loss": 0.3067, + "step": 5452 + }, + { + "epoch": 0.4320063378886908, + "grad_norm": 1.2653790624505565, + "learning_rate": 1.2655668377011387e-05, + "loss": 0.2886, + "step": 5453 + }, + { + "epoch": 0.4320855614973262, + "grad_norm": 1.5848208966044717, + "learning_rate": 1.2653194397781117e-05, + "loss": 0.3624, + "step": 5454 + }, + { + "epoch": 0.43216478510596157, + "grad_norm": 1.2103689062823213, + "learning_rate": 1.2650720243850762e-05, + "loss": 0.3343, + "step": 5455 + }, + { + "epoch": 0.43224400871459695, + "grad_norm": 1.695500876910567, + "learning_rate": 1.2648245915383233e-05, + "loss": 0.4319, + "step": 5456 + }, + { + "epoch": 0.43232323232323233, + "grad_norm": 1.641318706300649, + "learning_rate": 1.2645771412541455e-05, + "loss": 0.3233, + "step": 5457 + }, + { + "epoch": 0.4324024559318677, + "grad_norm": 1.4264901037907947, + "learning_rate": 1.2643296735488355e-05, + "loss": 0.2835, + "step": 5458 + }, + { + "epoch": 0.4324816795405031, + "grad_norm": 1.7114592953176366, + "learning_rate": 1.2640821884386887e-05, + "loss": 0.4391, + "step": 5459 + }, + { + "epoch": 0.4325609031491384, + "grad_norm": 1.6222246082244827, + "learning_rate": 1.2638346859400006e-05, + "loss": 0.3495, + "step": 5460 + }, + { + "epoch": 0.4326401267577738, + "grad_norm": 1.3545315424505424, + "learning_rate": 1.2635871660690677e-05, + "loss": 0.3564, + "step": 5461 + }, + { + "epoch": 0.4327193503664092, + "grad_norm": 1.7432114165988388, + "learning_rate": 1.2633396288421884e-05, + "loss": 0.4184, + "step": 5462 + }, + { + "epoch": 0.43279857397504456, + "grad_norm": 1.581166045709697, + "learning_rate": 1.2630920742756616e-05, + "loss": 0.3156, + "step": 5463 + }, + { + "epoch": 0.43287779758367995, + "grad_norm": 1.3949629214009491, + "learning_rate": 1.2628445023857875e-05, + "loss": 0.3493, + "step": 5464 + }, + { + "epoch": 0.43295702119231533, + "grad_norm": 1.285804365685932, + "learning_rate": 1.2625969131888677e-05, + "loss": 0.3022, + "step": 5465 + }, + { + "epoch": 0.4330362448009507, + "grad_norm": 1.4283416742607622, + "learning_rate": 1.2623493067012047e-05, + "loss": 0.3524, + "step": 5466 + }, + { + "epoch": 0.43311546840958604, + "grad_norm": 0.9835140281730688, + "learning_rate": 1.2621016829391022e-05, + "loss": 0.182, + "step": 5467 + }, + { + "epoch": 0.4331946920182214, + "grad_norm": 1.1641937817399692, + "learning_rate": 1.2618540419188654e-05, + "loss": 0.2765, + "step": 5468 + }, + { + "epoch": 0.4332739156268568, + "grad_norm": 1.5427922786986064, + "learning_rate": 1.2616063836567994e-05, + "loss": 0.2981, + "step": 5469 + }, + { + "epoch": 0.4333531392354922, + "grad_norm": 1.4924127486511443, + "learning_rate": 1.2613587081692118e-05, + "loss": 0.328, + "step": 5470 + }, + { + "epoch": 0.43343236284412756, + "grad_norm": 1.7285763192172479, + "learning_rate": 1.2611110154724113e-05, + "loss": 0.4709, + "step": 5471 + }, + { + "epoch": 0.43351158645276294, + "grad_norm": 1.5157293613758454, + "learning_rate": 1.2608633055827064e-05, + "loss": 0.3337, + "step": 5472 + }, + { + "epoch": 0.4335908100613983, + "grad_norm": 1.2172657654412193, + "learning_rate": 1.260615578516408e-05, + "loss": 0.2544, + "step": 5473 + }, + { + "epoch": 0.43367003367003365, + "grad_norm": 1.5409390052442435, + "learning_rate": 1.260367834289828e-05, + "loss": 0.3411, + "step": 5474 + }, + { + "epoch": 0.43374925727866903, + "grad_norm": 1.4382072378306487, + "learning_rate": 1.2601200729192789e-05, + "loss": 0.3718, + "step": 5475 + }, + { + "epoch": 0.4338284808873044, + "grad_norm": 1.179465992343742, + "learning_rate": 1.2598722944210746e-05, + "loss": 0.2762, + "step": 5476 + }, + { + "epoch": 0.4339077044959398, + "grad_norm": 1.59375203852541, + "learning_rate": 1.25962449881153e-05, + "loss": 0.4207, + "step": 5477 + }, + { + "epoch": 0.4339869281045752, + "grad_norm": 1.388563111484153, + "learning_rate": 1.2593766861069615e-05, + "loss": 0.2863, + "step": 5478 + }, + { + "epoch": 0.43406615171321056, + "grad_norm": 1.4750851146914306, + "learning_rate": 1.2591288563236864e-05, + "loss": 0.3282, + "step": 5479 + }, + { + "epoch": 0.4341453753218459, + "grad_norm": 1.743719289196784, + "learning_rate": 1.2588810094780227e-05, + "loss": 0.3748, + "step": 5480 + }, + { + "epoch": 0.43422459893048126, + "grad_norm": 1.4148853753485535, + "learning_rate": 1.2586331455862902e-05, + "loss": 0.2627, + "step": 5481 + }, + { + "epoch": 0.43430382253911665, + "grad_norm": 1.4276579213531981, + "learning_rate": 1.2583852646648097e-05, + "loss": 0.328, + "step": 5482 + }, + { + "epoch": 0.434383046147752, + "grad_norm": 1.3557638130504426, + "learning_rate": 1.2581373667299026e-05, + "loss": 0.337, + "step": 5483 + }, + { + "epoch": 0.4344622697563874, + "grad_norm": 1.649902237559774, + "learning_rate": 1.257889451797892e-05, + "loss": 0.322, + "step": 5484 + }, + { + "epoch": 0.4345414933650228, + "grad_norm": 1.452788465442924, + "learning_rate": 1.257641519885102e-05, + "loss": 0.2976, + "step": 5485 + }, + { + "epoch": 0.43462071697365817, + "grad_norm": 1.5559965749267017, + "learning_rate": 1.2573935710078576e-05, + "loss": 0.3137, + "step": 5486 + }, + { + "epoch": 0.4346999405822935, + "grad_norm": 1.4697167871168915, + "learning_rate": 1.2571456051824851e-05, + "loss": 0.3126, + "step": 5487 + }, + { + "epoch": 0.4347791641909289, + "grad_norm": 1.3844928772019096, + "learning_rate": 1.2568976224253115e-05, + "loss": 0.2262, + "step": 5488 + }, + { + "epoch": 0.43485838779956426, + "grad_norm": 1.498134018477766, + "learning_rate": 1.256649622752666e-05, + "loss": 0.3776, + "step": 5489 + }, + { + "epoch": 0.43493761140819964, + "grad_norm": 1.245475668813004, + "learning_rate": 1.2564016061808774e-05, + "loss": 0.2297, + "step": 5490 + }, + { + "epoch": 0.435016835016835, + "grad_norm": 1.589198936945319, + "learning_rate": 1.2561535727262769e-05, + "loss": 0.3655, + "step": 5491 + }, + { + "epoch": 0.4350960586254704, + "grad_norm": 1.6472914681148645, + "learning_rate": 1.2559055224051963e-05, + "loss": 0.3296, + "step": 5492 + }, + { + "epoch": 0.4351752822341058, + "grad_norm": 1.8696894172833403, + "learning_rate": 1.2556574552339682e-05, + "loss": 0.3698, + "step": 5493 + }, + { + "epoch": 0.4352545058427411, + "grad_norm": 1.9858729806649456, + "learning_rate": 1.2554093712289267e-05, + "loss": 0.5243, + "step": 5494 + }, + { + "epoch": 0.4353337294513765, + "grad_norm": 1.6033441160765232, + "learning_rate": 1.2551612704064074e-05, + "loss": 0.3302, + "step": 5495 + }, + { + "epoch": 0.4354129530600119, + "grad_norm": 1.5319594864143755, + "learning_rate": 1.2549131527827458e-05, + "loss": 0.4081, + "step": 5496 + }, + { + "epoch": 0.43549217666864726, + "grad_norm": 1.504913530009671, + "learning_rate": 1.2546650183742801e-05, + "loss": 0.3112, + "step": 5497 + }, + { + "epoch": 0.43557140027728264, + "grad_norm": 1.2767031132609141, + "learning_rate": 1.254416867197348e-05, + "loss": 0.3113, + "step": 5498 + }, + { + "epoch": 0.435650623885918, + "grad_norm": 1.7117570025406732, + "learning_rate": 1.2541686992682896e-05, + "loss": 0.4046, + "step": 5499 + }, + { + "epoch": 0.4357298474945534, + "grad_norm": 1.5293320244132367, + "learning_rate": 1.2539205146034452e-05, + "loss": 0.3628, + "step": 5500 + }, + { + "epoch": 0.4358090711031887, + "grad_norm": 1.3677255643301798, + "learning_rate": 1.2536723132191566e-05, + "loss": 0.3357, + "step": 5501 + }, + { + "epoch": 0.4358882947118241, + "grad_norm": 1.6184853659040477, + "learning_rate": 1.2534240951317669e-05, + "loss": 0.3632, + "step": 5502 + }, + { + "epoch": 0.4359675183204595, + "grad_norm": 1.490242335936876, + "learning_rate": 1.25317586035762e-05, + "loss": 0.3372, + "step": 5503 + }, + { + "epoch": 0.43604674192909487, + "grad_norm": 1.3434680182456935, + "learning_rate": 1.2529276089130607e-05, + "loss": 0.3482, + "step": 5504 + }, + { + "epoch": 0.43612596553773025, + "grad_norm": 1.2352051693493493, + "learning_rate": 1.2526793408144355e-05, + "loss": 0.2803, + "step": 5505 + }, + { + "epoch": 0.43620518914636564, + "grad_norm": 1.6694804540272143, + "learning_rate": 1.2524310560780914e-05, + "loss": 0.3759, + "step": 5506 + }, + { + "epoch": 0.436284412755001, + "grad_norm": 1.231372253748743, + "learning_rate": 1.2521827547203773e-05, + "loss": 0.2659, + "step": 5507 + }, + { + "epoch": 0.43636363636363634, + "grad_norm": 1.432983039414379, + "learning_rate": 1.2519344367576418e-05, + "loss": 0.2474, + "step": 5508 + }, + { + "epoch": 0.4364428599722717, + "grad_norm": 1.4892999601753967, + "learning_rate": 1.2516861022062361e-05, + "loss": 0.3025, + "step": 5509 + }, + { + "epoch": 0.4365220835809071, + "grad_norm": 1.345678917014592, + "learning_rate": 1.2514377510825113e-05, + "loss": 0.2936, + "step": 5510 + }, + { + "epoch": 0.4366013071895425, + "grad_norm": 1.5770487553466572, + "learning_rate": 1.2511893834028209e-05, + "loss": 0.4329, + "step": 5511 + }, + { + "epoch": 0.43668053079817787, + "grad_norm": 1.4870329623371363, + "learning_rate": 1.2509409991835178e-05, + "loss": 0.3456, + "step": 5512 + }, + { + "epoch": 0.43675975440681325, + "grad_norm": 1.301633174327458, + "learning_rate": 1.2506925984409574e-05, + "loss": 0.2842, + "step": 5513 + }, + { + "epoch": 0.43683897801544863, + "grad_norm": 1.4075368831718562, + "learning_rate": 1.250444181191496e-05, + "loss": 0.268, + "step": 5514 + }, + { + "epoch": 0.43691820162408396, + "grad_norm": 1.451309717991726, + "learning_rate": 1.2501957474514898e-05, + "loss": 0.282, + "step": 5515 + }, + { + "epoch": 0.43699742523271934, + "grad_norm": 1.3603439783311486, + "learning_rate": 1.249947297237298e-05, + "loss": 0.2704, + "step": 5516 + }, + { + "epoch": 0.4370766488413547, + "grad_norm": 1.4368503013002034, + "learning_rate": 1.249698830565279e-05, + "loss": 0.3424, + "step": 5517 + }, + { + "epoch": 0.4371558724499901, + "grad_norm": 1.332731038219995, + "learning_rate": 1.2494503474517935e-05, + "loss": 0.2255, + "step": 5518 + }, + { + "epoch": 0.4372350960586255, + "grad_norm": 1.4420217355323868, + "learning_rate": 1.2492018479132033e-05, + "loss": 0.2916, + "step": 5519 + }, + { + "epoch": 0.43731431966726086, + "grad_norm": 1.3211587642497646, + "learning_rate": 1.2489533319658703e-05, + "loss": 0.2476, + "step": 5520 + }, + { + "epoch": 0.4373935432758962, + "grad_norm": 1.8274973083374102, + "learning_rate": 1.2487047996261578e-05, + "loss": 0.394, + "step": 5521 + }, + { + "epoch": 0.43747276688453157, + "grad_norm": 1.6968891552047662, + "learning_rate": 1.2484562509104316e-05, + "loss": 0.3865, + "step": 5522 + }, + { + "epoch": 0.43755199049316695, + "grad_norm": 1.3754032116517625, + "learning_rate": 1.2482076858350564e-05, + "loss": 0.2733, + "step": 5523 + }, + { + "epoch": 0.43763121410180233, + "grad_norm": 1.8364161689702525, + "learning_rate": 1.2479591044163997e-05, + "loss": 0.4175, + "step": 5524 + }, + { + "epoch": 0.4377104377104377, + "grad_norm": 1.4297693803153317, + "learning_rate": 1.2477105066708286e-05, + "loss": 0.2563, + "step": 5525 + }, + { + "epoch": 0.4377896613190731, + "grad_norm": 1.71100922622832, + "learning_rate": 1.2474618926147129e-05, + "loss": 0.3214, + "step": 5526 + }, + { + "epoch": 0.4378688849277085, + "grad_norm": 1.352624504305475, + "learning_rate": 1.2472132622644222e-05, + "loss": 0.2543, + "step": 5527 + }, + { + "epoch": 0.4379481085363438, + "grad_norm": 1.9597103256622024, + "learning_rate": 1.2469646156363276e-05, + "loss": 0.4333, + "step": 5528 + }, + { + "epoch": 0.4380273321449792, + "grad_norm": 1.6459129158020713, + "learning_rate": 1.2467159527468014e-05, + "loss": 0.412, + "step": 5529 + }, + { + "epoch": 0.43810655575361457, + "grad_norm": 1.6321473589199504, + "learning_rate": 1.246467273612217e-05, + "loss": 0.3876, + "step": 5530 + }, + { + "epoch": 0.43818577936224995, + "grad_norm": 2.306533984815719, + "learning_rate": 1.2462185782489484e-05, + "loss": 0.3353, + "step": 5531 + }, + { + "epoch": 0.43826500297088533, + "grad_norm": 1.6634065664543873, + "learning_rate": 1.2459698666733712e-05, + "loss": 0.381, + "step": 5532 + }, + { + "epoch": 0.4383442265795207, + "grad_norm": 1.2720165084356065, + "learning_rate": 1.2457211389018619e-05, + "loss": 0.27, + "step": 5533 + }, + { + "epoch": 0.4384234501881561, + "grad_norm": 1.3331215022361775, + "learning_rate": 1.2454723949507978e-05, + "loss": 0.3085, + "step": 5534 + }, + { + "epoch": 0.4385026737967914, + "grad_norm": 1.4469390883879574, + "learning_rate": 1.2452236348365579e-05, + "loss": 0.3212, + "step": 5535 + }, + { + "epoch": 0.4385818974054268, + "grad_norm": 1.3066558085486175, + "learning_rate": 1.244974858575521e-05, + "loss": 0.2593, + "step": 5536 + }, + { + "epoch": 0.4386611210140622, + "grad_norm": 1.3556845452823256, + "learning_rate": 1.2447260661840688e-05, + "loss": 0.3022, + "step": 5537 + }, + { + "epoch": 0.43874034462269756, + "grad_norm": 1.412813446296498, + "learning_rate": 1.2444772576785828e-05, + "loss": 0.3349, + "step": 5538 + }, + { + "epoch": 0.43881956823133295, + "grad_norm": 1.4987757106315132, + "learning_rate": 1.2442284330754456e-05, + "loss": 0.3447, + "step": 5539 + }, + { + "epoch": 0.4388987918399683, + "grad_norm": 1.2809113579222797, + "learning_rate": 1.2439795923910413e-05, + "loss": 0.2656, + "step": 5540 + }, + { + "epoch": 0.4389780154486037, + "grad_norm": 1.6354779293121176, + "learning_rate": 1.2437307356417547e-05, + "loss": 0.3831, + "step": 5541 + }, + { + "epoch": 0.43905723905723903, + "grad_norm": 1.564709595851338, + "learning_rate": 1.2434818628439718e-05, + "loss": 0.3179, + "step": 5542 + }, + { + "epoch": 0.4391364626658744, + "grad_norm": 1.4931713730660812, + "learning_rate": 1.24323297401408e-05, + "loss": 0.333, + "step": 5543 + }, + { + "epoch": 0.4392156862745098, + "grad_norm": 1.509970048928823, + "learning_rate": 1.2429840691684672e-05, + "loss": 0.3947, + "step": 5544 + }, + { + "epoch": 0.4392949098831452, + "grad_norm": 1.3668560912590544, + "learning_rate": 1.2427351483235224e-05, + "loss": 0.3083, + "step": 5545 + }, + { + "epoch": 0.43937413349178056, + "grad_norm": 1.9266141282935643, + "learning_rate": 1.2424862114956367e-05, + "loss": 0.3891, + "step": 5546 + }, + { + "epoch": 0.43945335710041594, + "grad_norm": 1.7498146035725137, + "learning_rate": 1.2422372587012001e-05, + "loss": 0.3772, + "step": 5547 + }, + { + "epoch": 0.4395325807090513, + "grad_norm": 1.786747893173884, + "learning_rate": 1.2419882899566056e-05, + "loss": 0.3258, + "step": 5548 + }, + { + "epoch": 0.43961180431768665, + "grad_norm": 1.467619205649559, + "learning_rate": 1.241739305278247e-05, + "loss": 0.3966, + "step": 5549 + }, + { + "epoch": 0.43969102792632203, + "grad_norm": 1.4787014616012117, + "learning_rate": 1.2414903046825178e-05, + "loss": 0.2968, + "step": 5550 + }, + { + "epoch": 0.4397702515349574, + "grad_norm": 1.3699368777074505, + "learning_rate": 1.2412412881858142e-05, + "loss": 0.265, + "step": 5551 + }, + { + "epoch": 0.4398494751435928, + "grad_norm": 1.222884656815162, + "learning_rate": 1.240992255804533e-05, + "loss": 0.209, + "step": 5552 + }, + { + "epoch": 0.4399286987522282, + "grad_norm": 1.4643575849466588, + "learning_rate": 1.2407432075550707e-05, + "loss": 0.3106, + "step": 5553 + }, + { + "epoch": 0.44000792236086356, + "grad_norm": 1.0793841234333779, + "learning_rate": 1.2404941434538269e-05, + "loss": 0.1828, + "step": 5554 + }, + { + "epoch": 0.4400871459694989, + "grad_norm": 1.8614665961933032, + "learning_rate": 1.2402450635172008e-05, + "loss": 0.4639, + "step": 5555 + }, + { + "epoch": 0.44016636957813426, + "grad_norm": 1.2472298582682795, + "learning_rate": 1.2399959677615932e-05, + "loss": 0.2303, + "step": 5556 + }, + { + "epoch": 0.44024559318676965, + "grad_norm": 1.8496281759865363, + "learning_rate": 1.239746856203406e-05, + "loss": 0.307, + "step": 5557 + }, + { + "epoch": 0.440324816795405, + "grad_norm": 1.943752343790704, + "learning_rate": 1.239497728859042e-05, + "loss": 0.364, + "step": 5558 + }, + { + "epoch": 0.4404040404040404, + "grad_norm": 1.4816592777741944, + "learning_rate": 1.2392485857449048e-05, + "loss": 0.3342, + "step": 5559 + }, + { + "epoch": 0.4404832640126758, + "grad_norm": 1.6050090471273073, + "learning_rate": 1.2389994268773995e-05, + "loss": 0.3299, + "step": 5560 + }, + { + "epoch": 0.44056248762131117, + "grad_norm": 1.6392367962249623, + "learning_rate": 1.238750252272932e-05, + "loss": 0.3404, + "step": 5561 + }, + { + "epoch": 0.4406417112299465, + "grad_norm": 1.4964988983324858, + "learning_rate": 1.2385010619479093e-05, + "loss": 0.3285, + "step": 5562 + }, + { + "epoch": 0.4407209348385819, + "grad_norm": 1.492709753864469, + "learning_rate": 1.2382518559187389e-05, + "loss": 0.3792, + "step": 5563 + }, + { + "epoch": 0.44080015844721726, + "grad_norm": 1.321878569212735, + "learning_rate": 1.23800263420183e-05, + "loss": 0.2791, + "step": 5564 + }, + { + "epoch": 0.44087938205585264, + "grad_norm": 1.3989104011025209, + "learning_rate": 1.2377533968135934e-05, + "loss": 0.3068, + "step": 5565 + }, + { + "epoch": 0.440958605664488, + "grad_norm": 1.4729596066716062, + "learning_rate": 1.2375041437704394e-05, + "loss": 0.2183, + "step": 5566 + }, + { + "epoch": 0.4410378292731234, + "grad_norm": 1.1626634673552876, + "learning_rate": 1.2372548750887805e-05, + "loss": 0.2825, + "step": 5567 + }, + { + "epoch": 0.4411170528817588, + "grad_norm": 1.4178911378023322, + "learning_rate": 1.2370055907850293e-05, + "loss": 0.3214, + "step": 5568 + }, + { + "epoch": 0.4411962764903941, + "grad_norm": 1.7536119491841367, + "learning_rate": 1.2367562908756005e-05, + "loss": 0.2716, + "step": 5569 + }, + { + "epoch": 0.4412755000990295, + "grad_norm": 1.3612077929654602, + "learning_rate": 1.2365069753769092e-05, + "loss": 0.2477, + "step": 5570 + }, + { + "epoch": 0.4413547237076649, + "grad_norm": 1.3665235754423977, + "learning_rate": 1.2362576443053716e-05, + "loss": 0.3248, + "step": 5571 + }, + { + "epoch": 0.44143394731630026, + "grad_norm": 1.6563006850044917, + "learning_rate": 1.2360082976774049e-05, + "loss": 0.3998, + "step": 5572 + }, + { + "epoch": 0.44151317092493564, + "grad_norm": 1.1556465611201692, + "learning_rate": 1.2357589355094275e-05, + "loss": 0.1872, + "step": 5573 + }, + { + "epoch": 0.441592394533571, + "grad_norm": 1.1516815317847333, + "learning_rate": 1.2355095578178582e-05, + "loss": 0.3242, + "step": 5574 + }, + { + "epoch": 0.4416716181422064, + "grad_norm": 1.6032025281087987, + "learning_rate": 1.2352601646191182e-05, + "loss": 0.3119, + "step": 5575 + }, + { + "epoch": 0.4417508417508417, + "grad_norm": 1.4502267987187858, + "learning_rate": 1.235010755929628e-05, + "loss": 0.2695, + "step": 5576 + }, + { + "epoch": 0.4418300653594771, + "grad_norm": 1.5242226017208766, + "learning_rate": 1.2347613317658105e-05, + "loss": 0.3038, + "step": 5577 + }, + { + "epoch": 0.4419092889681125, + "grad_norm": 1.607811135328703, + "learning_rate": 1.234511892144089e-05, + "loss": 0.3656, + "step": 5578 + }, + { + "epoch": 0.44198851257674787, + "grad_norm": 1.5822780144113513, + "learning_rate": 1.2342624370808876e-05, + "loss": 0.4001, + "step": 5579 + }, + { + "epoch": 0.44206773618538325, + "grad_norm": 1.445624606232555, + "learning_rate": 1.2340129665926319e-05, + "loss": 0.2533, + "step": 5580 + }, + { + "epoch": 0.44214695979401863, + "grad_norm": 1.461841083046526, + "learning_rate": 1.2337634806957486e-05, + "loss": 0.2538, + "step": 5581 + }, + { + "epoch": 0.442226183402654, + "grad_norm": 1.4950823860907971, + "learning_rate": 1.2335139794066645e-05, + "loss": 0.2502, + "step": 5582 + }, + { + "epoch": 0.44230540701128934, + "grad_norm": 1.278136773901328, + "learning_rate": 1.2332644627418088e-05, + "loss": 0.251, + "step": 5583 + }, + { + "epoch": 0.4423846306199247, + "grad_norm": 1.219042104528466, + "learning_rate": 1.2330149307176105e-05, + "loss": 0.298, + "step": 5584 + }, + { + "epoch": 0.4424638542285601, + "grad_norm": 1.5508062254365524, + "learning_rate": 1.2327653833505005e-05, + "loss": 0.3416, + "step": 5585 + }, + { + "epoch": 0.4425430778371955, + "grad_norm": 1.539181482091282, + "learning_rate": 1.2325158206569095e-05, + "loss": 0.4106, + "step": 5586 + }, + { + "epoch": 0.44262230144583087, + "grad_norm": 1.7262858546380457, + "learning_rate": 1.232266242653271e-05, + "loss": 0.337, + "step": 5587 + }, + { + "epoch": 0.44270152505446625, + "grad_norm": 1.4712714271943768, + "learning_rate": 1.2320166493560176e-05, + "loss": 0.3356, + "step": 5588 + }, + { + "epoch": 0.44278074866310163, + "grad_norm": 1.7458166022993233, + "learning_rate": 1.2317670407815844e-05, + "loss": 0.4123, + "step": 5589 + }, + { + "epoch": 0.44285997227173696, + "grad_norm": 1.8016174858002976, + "learning_rate": 1.2315174169464068e-05, + "loss": 0.2755, + "step": 5590 + }, + { + "epoch": 0.44293919588037234, + "grad_norm": 1.4030103348673175, + "learning_rate": 1.2312677778669211e-05, + "loss": 0.2906, + "step": 5591 + }, + { + "epoch": 0.4430184194890077, + "grad_norm": 1.487165116297807, + "learning_rate": 1.2310181235595652e-05, + "loss": 0.2786, + "step": 5592 + }, + { + "epoch": 0.4430976430976431, + "grad_norm": 1.3117181667343012, + "learning_rate": 1.2307684540407775e-05, + "loss": 0.2367, + "step": 5593 + }, + { + "epoch": 0.4431768667062785, + "grad_norm": 1.338151070222116, + "learning_rate": 1.230518769326997e-05, + "loss": 0.2512, + "step": 5594 + }, + { + "epoch": 0.44325609031491386, + "grad_norm": 1.1997029433516588, + "learning_rate": 1.2302690694346654e-05, + "loss": 0.1748, + "step": 5595 + }, + { + "epoch": 0.4433353139235492, + "grad_norm": 1.2224114046177161, + "learning_rate": 1.230019354380223e-05, + "loss": 0.2466, + "step": 5596 + }, + { + "epoch": 0.44341453753218457, + "grad_norm": 1.2624692573410068, + "learning_rate": 1.2297696241801133e-05, + "loss": 0.2706, + "step": 5597 + }, + { + "epoch": 0.44349376114081995, + "grad_norm": 1.447914568671778, + "learning_rate": 1.2295198788507794e-05, + "loss": 0.2923, + "step": 5598 + }, + { + "epoch": 0.44357298474945533, + "grad_norm": 1.3210649850300396, + "learning_rate": 1.2292701184086656e-05, + "loss": 0.2898, + "step": 5599 + }, + { + "epoch": 0.4436522083580907, + "grad_norm": 1.2037640375666785, + "learning_rate": 1.2290203428702178e-05, + "loss": 0.2773, + "step": 5600 + }, + { + "epoch": 0.4437314319667261, + "grad_norm": 1.415959523648096, + "learning_rate": 1.2287705522518824e-05, + "loss": 0.2772, + "step": 5601 + }, + { + "epoch": 0.4438106555753615, + "grad_norm": 1.466433524688798, + "learning_rate": 1.228520746570107e-05, + "loss": 0.2679, + "step": 5602 + }, + { + "epoch": 0.4438898791839968, + "grad_norm": 1.462996885356959, + "learning_rate": 1.22827092584134e-05, + "loss": 0.2987, + "step": 5603 + }, + { + "epoch": 0.4439691027926322, + "grad_norm": 1.4615390316373624, + "learning_rate": 1.2280210900820309e-05, + "loss": 0.2994, + "step": 5604 + }, + { + "epoch": 0.44404832640126757, + "grad_norm": 1.263746322122163, + "learning_rate": 1.22777123930863e-05, + "loss": 0.2362, + "step": 5605 + }, + { + "epoch": 0.44412755000990295, + "grad_norm": 1.3538626145891963, + "learning_rate": 1.227521373537589e-05, + "loss": 0.2549, + "step": 5606 + }, + { + "epoch": 0.44420677361853833, + "grad_norm": 1.5081078101355778, + "learning_rate": 1.2272714927853604e-05, + "loss": 0.3, + "step": 5607 + }, + { + "epoch": 0.4442859972271737, + "grad_norm": 1.6211686624569182, + "learning_rate": 1.2270215970683977e-05, + "loss": 0.3341, + "step": 5608 + }, + { + "epoch": 0.4443652208358091, + "grad_norm": 1.8599278662623036, + "learning_rate": 1.226771686403155e-05, + "loss": 0.3583, + "step": 5609 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 1.4266100934874566, + "learning_rate": 1.2265217608060879e-05, + "loss": 0.2859, + "step": 5610 + }, + { + "epoch": 0.4445236680530798, + "grad_norm": 1.4027887297951571, + "learning_rate": 1.226271820293653e-05, + "loss": 0.3042, + "step": 5611 + }, + { + "epoch": 0.4446028916617152, + "grad_norm": 1.4179418474260197, + "learning_rate": 1.2260218648823073e-05, + "loss": 0.3431, + "step": 5612 + }, + { + "epoch": 0.44468211527035056, + "grad_norm": 1.2174693240267271, + "learning_rate": 1.2257718945885096e-05, + "loss": 0.2571, + "step": 5613 + }, + { + "epoch": 0.44476133887898595, + "grad_norm": 1.397708506841354, + "learning_rate": 1.2255219094287186e-05, + "loss": 0.363, + "step": 5614 + }, + { + "epoch": 0.4448405624876213, + "grad_norm": 1.4284158117727428, + "learning_rate": 1.225271909419395e-05, + "loss": 0.2899, + "step": 5615 + }, + { + "epoch": 0.4449197860962567, + "grad_norm": 1.5232388442487834, + "learning_rate": 1.2250218945770005e-05, + "loss": 0.2806, + "step": 5616 + }, + { + "epoch": 0.44499900970489203, + "grad_norm": 1.5501966759126469, + "learning_rate": 1.2247718649179966e-05, + "loss": 0.4024, + "step": 5617 + }, + { + "epoch": 0.4450782333135274, + "grad_norm": 1.4514936474695521, + "learning_rate": 1.2245218204588474e-05, + "loss": 0.3021, + "step": 5618 + }, + { + "epoch": 0.4451574569221628, + "grad_norm": 1.4863064472001586, + "learning_rate": 1.2242717612160163e-05, + "loss": 0.3389, + "step": 5619 + }, + { + "epoch": 0.4452366805307982, + "grad_norm": 1.7453594524603433, + "learning_rate": 1.2240216872059687e-05, + "loss": 0.4303, + "step": 5620 + }, + { + "epoch": 0.44531590413943356, + "grad_norm": 1.3268436436055626, + "learning_rate": 1.2237715984451713e-05, + "loss": 0.3191, + "step": 5621 + }, + { + "epoch": 0.44539512774806894, + "grad_norm": 2.1214515781245535, + "learning_rate": 1.2235214949500906e-05, + "loss": 0.2806, + "step": 5622 + }, + { + "epoch": 0.4454743513567043, + "grad_norm": 1.3528561737401772, + "learning_rate": 1.223271376737195e-05, + "loss": 0.3608, + "step": 5623 + }, + { + "epoch": 0.44555357496533965, + "grad_norm": 1.5317753621719823, + "learning_rate": 1.2230212438229539e-05, + "loss": 0.301, + "step": 5624 + }, + { + "epoch": 0.44563279857397503, + "grad_norm": 1.261222751812893, + "learning_rate": 1.2227710962238367e-05, + "loss": 0.2864, + "step": 5625 + }, + { + "epoch": 0.4457120221826104, + "grad_norm": 1.5698112652160263, + "learning_rate": 1.2225209339563144e-05, + "loss": 0.2741, + "step": 5626 + }, + { + "epoch": 0.4457912457912458, + "grad_norm": 1.3806836766393442, + "learning_rate": 1.22227075703686e-05, + "loss": 0.3045, + "step": 5627 + }, + { + "epoch": 0.4458704693998812, + "grad_norm": 1.2311180337479266, + "learning_rate": 1.2220205654819453e-05, + "loss": 0.2273, + "step": 5628 + }, + { + "epoch": 0.44594969300851656, + "grad_norm": 1.7031789584364545, + "learning_rate": 1.2217703593080445e-05, + "loss": 0.3879, + "step": 5629 + }, + { + "epoch": 0.44602891661715194, + "grad_norm": 1.2619583877073286, + "learning_rate": 1.221520138531633e-05, + "loss": 0.326, + "step": 5630 + }, + { + "epoch": 0.44610814022578726, + "grad_norm": 1.2582322548717744, + "learning_rate": 1.2212699031691861e-05, + "loss": 0.2206, + "step": 5631 + }, + { + "epoch": 0.44618736383442265, + "grad_norm": 1.5115284585892574, + "learning_rate": 1.221019653237181e-05, + "loss": 0.334, + "step": 5632 + }, + { + "epoch": 0.446266587443058, + "grad_norm": 1.491945475518214, + "learning_rate": 1.2207693887520949e-05, + "loss": 0.3372, + "step": 5633 + }, + { + "epoch": 0.4463458110516934, + "grad_norm": 1.4588603187953577, + "learning_rate": 1.2205191097304067e-05, + "loss": 0.2208, + "step": 5634 + }, + { + "epoch": 0.4464250346603288, + "grad_norm": 1.7213235676092462, + "learning_rate": 1.2202688161885967e-05, + "loss": 0.3477, + "step": 5635 + }, + { + "epoch": 0.44650425826896417, + "grad_norm": 1.6365514417234186, + "learning_rate": 1.2200185081431446e-05, + "loss": 0.3598, + "step": 5636 + }, + { + "epoch": 0.4465834818775995, + "grad_norm": 1.8158169757899714, + "learning_rate": 1.2197681856105326e-05, + "loss": 0.3387, + "step": 5637 + }, + { + "epoch": 0.4466627054862349, + "grad_norm": 1.4116078861533325, + "learning_rate": 1.219517848607243e-05, + "loss": 0.3064, + "step": 5638 + }, + { + "epoch": 0.44674192909487026, + "grad_norm": 1.395902534077429, + "learning_rate": 1.2192674971497593e-05, + "loss": 0.2931, + "step": 5639 + }, + { + "epoch": 0.44682115270350564, + "grad_norm": 1.299377546562893, + "learning_rate": 1.219017131254566e-05, + "loss": 0.2398, + "step": 5640 + }, + { + "epoch": 0.446900376312141, + "grad_norm": 1.3244621349745926, + "learning_rate": 1.2187667509381484e-05, + "loss": 0.2814, + "step": 5641 + }, + { + "epoch": 0.4469795999207764, + "grad_norm": 1.3004202124217101, + "learning_rate": 1.2185163562169928e-05, + "loss": 0.2245, + "step": 5642 + }, + { + "epoch": 0.4470588235294118, + "grad_norm": 1.493554717170338, + "learning_rate": 1.2182659471075868e-05, + "loss": 0.2889, + "step": 5643 + }, + { + "epoch": 0.4471380471380471, + "grad_norm": 1.2714060205955164, + "learning_rate": 1.2180155236264182e-05, + "loss": 0.2444, + "step": 5644 + }, + { + "epoch": 0.4472172707466825, + "grad_norm": 1.3094269976849267, + "learning_rate": 1.2177650857899767e-05, + "loss": 0.2621, + "step": 5645 + }, + { + "epoch": 0.4472964943553179, + "grad_norm": 1.3355498116081759, + "learning_rate": 1.217514633614752e-05, + "loss": 0.3199, + "step": 5646 + }, + { + "epoch": 0.44737571796395326, + "grad_norm": 1.5933756911115966, + "learning_rate": 1.217264167117235e-05, + "loss": 0.3589, + "step": 5647 + }, + { + "epoch": 0.44745494157258864, + "grad_norm": 1.8591505173394445, + "learning_rate": 1.2170136863139183e-05, + "loss": 0.2954, + "step": 5648 + }, + { + "epoch": 0.447534165181224, + "grad_norm": 1.5608999881471082, + "learning_rate": 1.2167631912212942e-05, + "loss": 0.4124, + "step": 5649 + }, + { + "epoch": 0.4476133887898594, + "grad_norm": 1.3476633916095164, + "learning_rate": 1.2165126818558572e-05, + "loss": 0.2599, + "step": 5650 + }, + { + "epoch": 0.4476926123984947, + "grad_norm": 1.550843676779218, + "learning_rate": 1.2162621582341021e-05, + "loss": 0.327, + "step": 5651 + }, + { + "epoch": 0.4477718360071301, + "grad_norm": 1.567997106644771, + "learning_rate": 1.2160116203725243e-05, + "loss": 0.2653, + "step": 5652 + }, + { + "epoch": 0.4478510596157655, + "grad_norm": 1.4763842217872796, + "learning_rate": 1.2157610682876206e-05, + "loss": 0.3003, + "step": 5653 + }, + { + "epoch": 0.44793028322440087, + "grad_norm": 1.4265145401670776, + "learning_rate": 1.2155105019958888e-05, + "loss": 0.2474, + "step": 5654 + }, + { + "epoch": 0.44800950683303625, + "grad_norm": 1.4110597688766937, + "learning_rate": 1.2152599215138274e-05, + "loss": 0.2818, + "step": 5655 + }, + { + "epoch": 0.44808873044167163, + "grad_norm": 1.3134178866048638, + "learning_rate": 1.215009326857936e-05, + "loss": 0.242, + "step": 5656 + }, + { + "epoch": 0.448167954050307, + "grad_norm": 1.6632629264082104, + "learning_rate": 1.2147587180447149e-05, + "loss": 0.3499, + "step": 5657 + }, + { + "epoch": 0.44824717765894234, + "grad_norm": 1.3206271939147298, + "learning_rate": 1.2145080950906656e-05, + "loss": 0.2558, + "step": 5658 + }, + { + "epoch": 0.4483264012675777, + "grad_norm": 1.4042377194422089, + "learning_rate": 1.2142574580122903e-05, + "loss": 0.2455, + "step": 5659 + }, + { + "epoch": 0.4484056248762131, + "grad_norm": 1.3950783621012701, + "learning_rate": 1.2140068068260923e-05, + "loss": 0.2697, + "step": 5660 + }, + { + "epoch": 0.4484848484848485, + "grad_norm": 1.4310829922275936, + "learning_rate": 1.2137561415485761e-05, + "loss": 0.2836, + "step": 5661 + }, + { + "epoch": 0.44856407209348387, + "grad_norm": 1.5322757277538652, + "learning_rate": 1.2135054621962464e-05, + "loss": 0.2942, + "step": 5662 + }, + { + "epoch": 0.44864329570211925, + "grad_norm": 1.6048773198584134, + "learning_rate": 1.2132547687856093e-05, + "loss": 0.282, + "step": 5663 + }, + { + "epoch": 0.44872251931075463, + "grad_norm": 1.5719289004698593, + "learning_rate": 1.2130040613331717e-05, + "loss": 0.3725, + "step": 5664 + }, + { + "epoch": 0.44880174291938996, + "grad_norm": 1.2912692388470648, + "learning_rate": 1.2127533398554417e-05, + "loss": 0.2314, + "step": 5665 + }, + { + "epoch": 0.44888096652802534, + "grad_norm": 1.1860242115550932, + "learning_rate": 1.2125026043689278e-05, + "loss": 0.2297, + "step": 5666 + }, + { + "epoch": 0.4489601901366607, + "grad_norm": 1.3353291468649866, + "learning_rate": 1.2122518548901401e-05, + "loss": 0.229, + "step": 5667 + }, + { + "epoch": 0.4490394137452961, + "grad_norm": 1.680231970648821, + "learning_rate": 1.2120010914355888e-05, + "loss": 0.2709, + "step": 5668 + }, + { + "epoch": 0.4491186373539315, + "grad_norm": 1.6520964488403986, + "learning_rate": 1.2117503140217858e-05, + "loss": 0.3672, + "step": 5669 + }, + { + "epoch": 0.44919786096256686, + "grad_norm": 1.491990800763491, + "learning_rate": 1.2114995226652437e-05, + "loss": 0.2796, + "step": 5670 + }, + { + "epoch": 0.44927708457120225, + "grad_norm": 1.4033688481664495, + "learning_rate": 1.2112487173824755e-05, + "loss": 0.2872, + "step": 5671 + }, + { + "epoch": 0.44935630817983757, + "grad_norm": 1.2349682316523733, + "learning_rate": 1.2109978981899956e-05, + "loss": 0.2952, + "step": 5672 + }, + { + "epoch": 0.44943553178847295, + "grad_norm": 1.5665694976544338, + "learning_rate": 1.2107470651043198e-05, + "loss": 0.3462, + "step": 5673 + }, + { + "epoch": 0.44951475539710833, + "grad_norm": 1.9025505633444724, + "learning_rate": 1.2104962181419635e-05, + "loss": 0.3696, + "step": 5674 + }, + { + "epoch": 0.4495939790057437, + "grad_norm": 1.3223970239672982, + "learning_rate": 1.2102453573194442e-05, + "loss": 0.3072, + "step": 5675 + }, + { + "epoch": 0.4496732026143791, + "grad_norm": 1.2478443093308735, + "learning_rate": 1.2099944826532796e-05, + "loss": 0.2699, + "step": 5676 + }, + { + "epoch": 0.4497524262230145, + "grad_norm": 1.321721375854688, + "learning_rate": 1.2097435941599886e-05, + "loss": 0.2597, + "step": 5677 + }, + { + "epoch": 0.4498316498316498, + "grad_norm": 1.4070221763126811, + "learning_rate": 1.2094926918560917e-05, + "loss": 0.2411, + "step": 5678 + }, + { + "epoch": 0.4499108734402852, + "grad_norm": 1.5447655077174007, + "learning_rate": 1.2092417757581085e-05, + "loss": 0.4654, + "step": 5679 + }, + { + "epoch": 0.44999009704892057, + "grad_norm": 1.2587234604786222, + "learning_rate": 1.2089908458825614e-05, + "loss": 0.2615, + "step": 5680 + }, + { + "epoch": 0.45006932065755595, + "grad_norm": 1.4747757819587082, + "learning_rate": 1.2087399022459729e-05, + "loss": 0.2272, + "step": 5681 + }, + { + "epoch": 0.45014854426619133, + "grad_norm": 1.4322719388003904, + "learning_rate": 1.208488944864866e-05, + "loss": 0.2248, + "step": 5682 + }, + { + "epoch": 0.4502277678748267, + "grad_norm": 1.6743515165257654, + "learning_rate": 1.2082379737557655e-05, + "loss": 0.4, + "step": 5683 + }, + { + "epoch": 0.4503069914834621, + "grad_norm": 1.3977804548400712, + "learning_rate": 1.2079869889351961e-05, + "loss": 0.2876, + "step": 5684 + }, + { + "epoch": 0.4503862150920974, + "grad_norm": 1.4194994349903556, + "learning_rate": 1.2077359904196841e-05, + "loss": 0.2952, + "step": 5685 + }, + { + "epoch": 0.4504654387007328, + "grad_norm": 1.2085239542353055, + "learning_rate": 1.2074849782257572e-05, + "loss": 0.2729, + "step": 5686 + }, + { + "epoch": 0.4505446623093682, + "grad_norm": 1.662042138796701, + "learning_rate": 1.2072339523699426e-05, + "loss": 0.3808, + "step": 5687 + }, + { + "epoch": 0.45062388591800356, + "grad_norm": 1.6334799971591731, + "learning_rate": 1.2069829128687693e-05, + "loss": 0.3934, + "step": 5688 + }, + { + "epoch": 0.45070310952663895, + "grad_norm": 1.5110405515963639, + "learning_rate": 1.2067318597387672e-05, + "loss": 0.342, + "step": 5689 + }, + { + "epoch": 0.4507823331352743, + "grad_norm": 1.7289544962085541, + "learning_rate": 1.2064807929964668e-05, + "loss": 0.3268, + "step": 5690 + }, + { + "epoch": 0.4508615567439097, + "grad_norm": 1.3999189308063729, + "learning_rate": 1.2062297126584e-05, + "loss": 0.2473, + "step": 5691 + }, + { + "epoch": 0.45094078035254503, + "grad_norm": 1.6340043213683135, + "learning_rate": 1.2059786187410984e-05, + "loss": 0.3313, + "step": 5692 + }, + { + "epoch": 0.4510200039611804, + "grad_norm": 1.4999903860809731, + "learning_rate": 1.2057275112610962e-05, + "loss": 0.292, + "step": 5693 + }, + { + "epoch": 0.4510992275698158, + "grad_norm": 1.79067462279451, + "learning_rate": 1.2054763902349273e-05, + "loss": 0.3804, + "step": 5694 + }, + { + "epoch": 0.4511784511784512, + "grad_norm": 1.4983228693808994, + "learning_rate": 1.2052252556791267e-05, + "loss": 0.3344, + "step": 5695 + }, + { + "epoch": 0.45125767478708656, + "grad_norm": 1.4351855962247557, + "learning_rate": 1.2049741076102307e-05, + "loss": 0.3592, + "step": 5696 + }, + { + "epoch": 0.45133689839572194, + "grad_norm": 1.5138150060860103, + "learning_rate": 1.2047229460447759e-05, + "loss": 0.3605, + "step": 5697 + }, + { + "epoch": 0.4514161220043573, + "grad_norm": 1.350547225842883, + "learning_rate": 1.2044717709993e-05, + "loss": 0.2811, + "step": 5698 + }, + { + "epoch": 0.45149534561299265, + "grad_norm": 1.4699234739128908, + "learning_rate": 1.2042205824903419e-05, + "loss": 0.3613, + "step": 5699 + }, + { + "epoch": 0.45157456922162803, + "grad_norm": 1.365828196254392, + "learning_rate": 1.203969380534441e-05, + "loss": 0.2788, + "step": 5700 + }, + { + "epoch": 0.4516537928302634, + "grad_norm": 1.2272319374468275, + "learning_rate": 1.2037181651481378e-05, + "loss": 0.2243, + "step": 5701 + }, + { + "epoch": 0.4517330164388988, + "grad_norm": 1.117004464847774, + "learning_rate": 1.2034669363479741e-05, + "loss": 0.2283, + "step": 5702 + }, + { + "epoch": 0.4518122400475342, + "grad_norm": 1.7623755314442646, + "learning_rate": 1.2032156941504913e-05, + "loss": 0.3985, + "step": 5703 + }, + { + "epoch": 0.45189146365616956, + "grad_norm": 2.17191804917099, + "learning_rate": 1.2029644385722327e-05, + "loss": 0.4895, + "step": 5704 + }, + { + "epoch": 0.45197068726480494, + "grad_norm": 1.3829735457567776, + "learning_rate": 1.2027131696297429e-05, + "loss": 0.246, + "step": 5705 + }, + { + "epoch": 0.45204991087344026, + "grad_norm": 1.480793656550592, + "learning_rate": 1.202461887339566e-05, + "loss": 0.361, + "step": 5706 + }, + { + "epoch": 0.45212913448207565, + "grad_norm": 1.0810474546126068, + "learning_rate": 1.2022105917182478e-05, + "loss": 0.2011, + "step": 5707 + }, + { + "epoch": 0.452208358090711, + "grad_norm": 1.3395626871411956, + "learning_rate": 1.2019592827823354e-05, + "loss": 0.2786, + "step": 5708 + }, + { + "epoch": 0.4522875816993464, + "grad_norm": 1.797745758637623, + "learning_rate": 1.2017079605483758e-05, + "loss": 0.3497, + "step": 5709 + }, + { + "epoch": 0.4523668053079818, + "grad_norm": 1.2294759186821698, + "learning_rate": 1.201456625032918e-05, + "loss": 0.1891, + "step": 5710 + }, + { + "epoch": 0.45244602891661717, + "grad_norm": 1.4325641156224465, + "learning_rate": 1.2012052762525104e-05, + "loss": 0.319, + "step": 5711 + }, + { + "epoch": 0.45252525252525255, + "grad_norm": 1.4960332139200154, + "learning_rate": 1.2009539142237034e-05, + "loss": 0.3187, + "step": 5712 + }, + { + "epoch": 0.4526044761338879, + "grad_norm": 1.4592232893191308, + "learning_rate": 1.2007025389630484e-05, + "loss": 0.307, + "step": 5713 + }, + { + "epoch": 0.45268369974252326, + "grad_norm": 1.3591185586590093, + "learning_rate": 1.2004511504870966e-05, + "loss": 0.3745, + "step": 5714 + }, + { + "epoch": 0.45276292335115864, + "grad_norm": 1.4897727322545176, + "learning_rate": 1.2001997488124011e-05, + "loss": 0.2485, + "step": 5715 + }, + { + "epoch": 0.452842146959794, + "grad_norm": 1.483043569721881, + "learning_rate": 1.1999483339555159e-05, + "loss": 0.274, + "step": 5716 + }, + { + "epoch": 0.4529213705684294, + "grad_norm": 1.5536590422542322, + "learning_rate": 1.1996969059329944e-05, + "loss": 0.2812, + "step": 5717 + }, + { + "epoch": 0.4530005941770648, + "grad_norm": 1.8383420363434644, + "learning_rate": 1.1994454647613928e-05, + "loss": 0.3539, + "step": 5718 + }, + { + "epoch": 0.4530798177857001, + "grad_norm": 1.409749698569539, + "learning_rate": 1.199194010457267e-05, + "loss": 0.3041, + "step": 5719 + }, + { + "epoch": 0.4531590413943355, + "grad_norm": 1.4454408871043587, + "learning_rate": 1.1989425430371739e-05, + "loss": 0.2869, + "step": 5720 + }, + { + "epoch": 0.4532382650029709, + "grad_norm": 1.4106434422151606, + "learning_rate": 1.198691062517672e-05, + "loss": 0.3308, + "step": 5721 + }, + { + "epoch": 0.45331748861160626, + "grad_norm": 1.3976137445032315, + "learning_rate": 1.1984395689153195e-05, + "loss": 0.2172, + "step": 5722 + }, + { + "epoch": 0.45339671222024164, + "grad_norm": 1.2540457976088886, + "learning_rate": 1.1981880622466759e-05, + "loss": 0.2578, + "step": 5723 + }, + { + "epoch": 0.453475935828877, + "grad_norm": 1.597901950492082, + "learning_rate": 1.1979365425283022e-05, + "loss": 0.3333, + "step": 5724 + }, + { + "epoch": 0.4535551594375124, + "grad_norm": 1.6601591706430912, + "learning_rate": 1.1976850097767598e-05, + "loss": 0.3309, + "step": 5725 + }, + { + "epoch": 0.4536343830461477, + "grad_norm": 1.470213884409713, + "learning_rate": 1.1974334640086104e-05, + "loss": 0.3141, + "step": 5726 + }, + { + "epoch": 0.4537136066547831, + "grad_norm": 1.308721113495105, + "learning_rate": 1.1971819052404177e-05, + "loss": 0.2414, + "step": 5727 + }, + { + "epoch": 0.4537928302634185, + "grad_norm": 1.2961204700262785, + "learning_rate": 1.196930333488745e-05, + "loss": 0.2337, + "step": 5728 + }, + { + "epoch": 0.45387205387205387, + "grad_norm": 1.6198537127432755, + "learning_rate": 1.1966787487701577e-05, + "loss": 0.3709, + "step": 5729 + }, + { + "epoch": 0.45395127748068925, + "grad_norm": 1.4217143114476545, + "learning_rate": 1.1964271511012208e-05, + "loss": 0.23, + "step": 5730 + }, + { + "epoch": 0.45403050108932463, + "grad_norm": 1.7570629663085537, + "learning_rate": 1.1961755404985015e-05, + "loss": 0.4046, + "step": 5731 + }, + { + "epoch": 0.45410972469796, + "grad_norm": 1.5291335670041004, + "learning_rate": 1.1959239169785668e-05, + "loss": 0.4338, + "step": 5732 + }, + { + "epoch": 0.45418894830659534, + "grad_norm": 1.1781689159578843, + "learning_rate": 1.1956722805579846e-05, + "loss": 0.2519, + "step": 5733 + }, + { + "epoch": 0.4542681719152307, + "grad_norm": 1.3892331256496078, + "learning_rate": 1.1954206312533246e-05, + "loss": 0.2791, + "step": 5734 + }, + { + "epoch": 0.4543473955238661, + "grad_norm": 1.4392557216515836, + "learning_rate": 1.1951689690811558e-05, + "loss": 0.3333, + "step": 5735 + }, + { + "epoch": 0.4544266191325015, + "grad_norm": 1.5172430300275368, + "learning_rate": 1.1949172940580498e-05, + "loss": 0.357, + "step": 5736 + }, + { + "epoch": 0.45450584274113687, + "grad_norm": 1.339423960795033, + "learning_rate": 1.1946656062005781e-05, + "loss": 0.2698, + "step": 5737 + }, + { + "epoch": 0.45458506634977225, + "grad_norm": 1.3538659210414938, + "learning_rate": 1.1944139055253126e-05, + "loss": 0.3092, + "step": 5738 + }, + { + "epoch": 0.45466428995840763, + "grad_norm": 1.3601226664085724, + "learning_rate": 1.1941621920488271e-05, + "loss": 0.3536, + "step": 5739 + }, + { + "epoch": 0.45474351356704296, + "grad_norm": 1.5244300446272274, + "learning_rate": 1.1939104657876953e-05, + "loss": 0.3995, + "step": 5740 + }, + { + "epoch": 0.45482273717567834, + "grad_norm": 1.4251734219174097, + "learning_rate": 1.1936587267584924e-05, + "loss": 0.2687, + "step": 5741 + }, + { + "epoch": 0.4549019607843137, + "grad_norm": 1.3092030846523643, + "learning_rate": 1.193406974977794e-05, + "loss": 0.2706, + "step": 5742 + }, + { + "epoch": 0.4549811843929491, + "grad_norm": 1.3208564538139058, + "learning_rate": 1.1931552104621776e-05, + "loss": 0.31, + "step": 5743 + }, + { + "epoch": 0.4550604080015845, + "grad_norm": 1.5727958859080353, + "learning_rate": 1.1929034332282192e-05, + "loss": 0.2961, + "step": 5744 + }, + { + "epoch": 0.45513963161021986, + "grad_norm": 1.6031212043223757, + "learning_rate": 1.1926516432924984e-05, + "loss": 0.3371, + "step": 5745 + }, + { + "epoch": 0.45521885521885525, + "grad_norm": 1.2869036772558198, + "learning_rate": 1.1923998406715937e-05, + "loss": 0.2306, + "step": 5746 + }, + { + "epoch": 0.45529807882749057, + "grad_norm": 1.4517582998730816, + "learning_rate": 1.1921480253820852e-05, + "loss": 0.3081, + "step": 5747 + }, + { + "epoch": 0.45537730243612595, + "grad_norm": 1.6206205154294324, + "learning_rate": 1.1918961974405539e-05, + "loss": 0.3239, + "step": 5748 + }, + { + "epoch": 0.45545652604476133, + "grad_norm": 1.2887167604453937, + "learning_rate": 1.1916443568635812e-05, + "loss": 0.2097, + "step": 5749 + }, + { + "epoch": 0.4555357496533967, + "grad_norm": 1.514621119634706, + "learning_rate": 1.1913925036677497e-05, + "loss": 0.301, + "step": 5750 + }, + { + "epoch": 0.4556149732620321, + "grad_norm": 1.2683254306705893, + "learning_rate": 1.191140637869643e-05, + "loss": 0.2318, + "step": 5751 + }, + { + "epoch": 0.4556941968706675, + "grad_norm": 1.5431935493771602, + "learning_rate": 1.1908887594858447e-05, + "loss": 0.3735, + "step": 5752 + }, + { + "epoch": 0.45577342047930286, + "grad_norm": 1.7857798091214419, + "learning_rate": 1.1906368685329403e-05, + "loss": 0.3739, + "step": 5753 + }, + { + "epoch": 0.4558526440879382, + "grad_norm": 1.4597646732032068, + "learning_rate": 1.1903849650275154e-05, + "loss": 0.3089, + "step": 5754 + }, + { + "epoch": 0.45593186769657357, + "grad_norm": 1.5590509118365934, + "learning_rate": 1.1901330489861564e-05, + "loss": 0.3693, + "step": 5755 + }, + { + "epoch": 0.45601109130520895, + "grad_norm": 1.5386134339392226, + "learning_rate": 1.1898811204254515e-05, + "loss": 0.3234, + "step": 5756 + }, + { + "epoch": 0.45609031491384433, + "grad_norm": 1.252584357863649, + "learning_rate": 1.189629179361988e-05, + "loss": 0.2621, + "step": 5757 + }, + { + "epoch": 0.4561695385224797, + "grad_norm": 1.5034891076727033, + "learning_rate": 1.1893772258123554e-05, + "loss": 0.2951, + "step": 5758 + }, + { + "epoch": 0.4562487621311151, + "grad_norm": 1.2413364254139216, + "learning_rate": 1.1891252597931441e-05, + "loss": 0.2453, + "step": 5759 + }, + { + "epoch": 0.4563279857397504, + "grad_norm": 1.6617547901197451, + "learning_rate": 1.1888732813209442e-05, + "loss": 0.3789, + "step": 5760 + }, + { + "epoch": 0.4564072093483858, + "grad_norm": 1.4517932555882869, + "learning_rate": 1.1886212904123477e-05, + "loss": 0.3268, + "step": 5761 + }, + { + "epoch": 0.4564864329570212, + "grad_norm": 1.2400286812980494, + "learning_rate": 1.1883692870839466e-05, + "loss": 0.2754, + "step": 5762 + }, + { + "epoch": 0.45656565656565656, + "grad_norm": 1.3262068219975502, + "learning_rate": 1.1881172713523346e-05, + "loss": 0.1943, + "step": 5763 + }, + { + "epoch": 0.45664488017429194, + "grad_norm": 1.48672114506166, + "learning_rate": 1.1878652432341053e-05, + "loss": 0.3167, + "step": 5764 + }, + { + "epoch": 0.4567241037829273, + "grad_norm": 1.5008902543237528, + "learning_rate": 1.1876132027458535e-05, + "loss": 0.323, + "step": 5765 + }, + { + "epoch": 0.4568033273915627, + "grad_norm": 1.3969924935786768, + "learning_rate": 1.1873611499041752e-05, + "loss": 0.2427, + "step": 5766 + }, + { + "epoch": 0.45688255100019803, + "grad_norm": 1.5149441443533935, + "learning_rate": 1.1871090847256667e-05, + "loss": 0.2943, + "step": 5767 + }, + { + "epoch": 0.4569617746088334, + "grad_norm": 1.45923579524261, + "learning_rate": 1.1868570072269252e-05, + "loss": 0.2675, + "step": 5768 + }, + { + "epoch": 0.4570409982174688, + "grad_norm": 1.429117756291073, + "learning_rate": 1.186604917424549e-05, + "loss": 0.3155, + "step": 5769 + }, + { + "epoch": 0.4571202218261042, + "grad_norm": 1.3350056860898667, + "learning_rate": 1.1863528153351369e-05, + "loss": 0.2388, + "step": 5770 + }, + { + "epoch": 0.45719944543473956, + "grad_norm": 1.5321491947718167, + "learning_rate": 1.1861007009752884e-05, + "loss": 0.3431, + "step": 5771 + }, + { + "epoch": 0.45727866904337494, + "grad_norm": 1.2085103238833457, + "learning_rate": 1.1858485743616044e-05, + "loss": 0.269, + "step": 5772 + }, + { + "epoch": 0.4573578926520103, + "grad_norm": 1.5498835246711031, + "learning_rate": 1.185596435510686e-05, + "loss": 0.2705, + "step": 5773 + }, + { + "epoch": 0.45743711626064565, + "grad_norm": 1.3797000368243975, + "learning_rate": 1.1853442844391354e-05, + "loss": 0.2681, + "step": 5774 + }, + { + "epoch": 0.45751633986928103, + "grad_norm": 1.9592103269698289, + "learning_rate": 1.1850921211635554e-05, + "loss": 0.3459, + "step": 5775 + }, + { + "epoch": 0.4575955634779164, + "grad_norm": 1.2909897101579606, + "learning_rate": 1.1848399457005496e-05, + "loss": 0.2894, + "step": 5776 + }, + { + "epoch": 0.4576747870865518, + "grad_norm": 1.4453921883376117, + "learning_rate": 1.1845877580667232e-05, + "loss": 0.2199, + "step": 5777 + }, + { + "epoch": 0.4577540106951872, + "grad_norm": 1.5272811631257404, + "learning_rate": 1.1843355582786806e-05, + "loss": 0.274, + "step": 5778 + }, + { + "epoch": 0.45783323430382256, + "grad_norm": 1.6125278654822204, + "learning_rate": 1.1840833463530289e-05, + "loss": 0.3534, + "step": 5779 + }, + { + "epoch": 0.45791245791245794, + "grad_norm": 1.395935209993027, + "learning_rate": 1.1838311223063745e-05, + "loss": 0.3339, + "step": 5780 + }, + { + "epoch": 0.45799168152109326, + "grad_norm": 1.55512571670022, + "learning_rate": 1.1835788861553252e-05, + "loss": 0.3615, + "step": 5781 + }, + { + "epoch": 0.45807090512972864, + "grad_norm": 1.569535485537372, + "learning_rate": 1.1833266379164894e-05, + "loss": 0.2826, + "step": 5782 + }, + { + "epoch": 0.458150128738364, + "grad_norm": 1.30058408106229, + "learning_rate": 1.183074377606477e-05, + "loss": 0.2461, + "step": 5783 + }, + { + "epoch": 0.4582293523469994, + "grad_norm": 1.630604386877381, + "learning_rate": 1.1828221052418973e-05, + "loss": 0.3472, + "step": 5784 + }, + { + "epoch": 0.4583085759556348, + "grad_norm": 1.269280938566071, + "learning_rate": 1.182569820839362e-05, + "loss": 0.2378, + "step": 5785 + }, + { + "epoch": 0.45838779956427017, + "grad_norm": 1.7220948284689819, + "learning_rate": 1.1823175244154823e-05, + "loss": 0.3285, + "step": 5786 + }, + { + "epoch": 0.45846702317290555, + "grad_norm": 1.2080600454938273, + "learning_rate": 1.1820652159868706e-05, + "loss": 0.2531, + "step": 5787 + }, + { + "epoch": 0.4585462467815409, + "grad_norm": 1.3942546131497715, + "learning_rate": 1.1818128955701409e-05, + "loss": 0.2282, + "step": 5788 + }, + { + "epoch": 0.45862547039017626, + "grad_norm": 1.1519972686446, + "learning_rate": 1.1815605631819066e-05, + "loss": 0.2448, + "step": 5789 + }, + { + "epoch": 0.45870469399881164, + "grad_norm": 1.4903720459379082, + "learning_rate": 1.181308218838783e-05, + "loss": 0.3902, + "step": 5790 + }, + { + "epoch": 0.458783917607447, + "grad_norm": 1.2604366536057166, + "learning_rate": 1.1810558625573856e-05, + "loss": 0.2314, + "step": 5791 + }, + { + "epoch": 0.4588631412160824, + "grad_norm": 1.4650561507737663, + "learning_rate": 1.1808034943543308e-05, + "loss": 0.3038, + "step": 5792 + }, + { + "epoch": 0.4589423648247178, + "grad_norm": 1.2026375208177977, + "learning_rate": 1.1805511142462355e-05, + "loss": 0.21, + "step": 5793 + }, + { + "epoch": 0.4590215884333531, + "grad_norm": 1.3351964738289657, + "learning_rate": 1.1802987222497186e-05, + "loss": 0.2694, + "step": 5794 + }, + { + "epoch": 0.4591008120419885, + "grad_norm": 1.494899818043445, + "learning_rate": 1.1800463183813982e-05, + "loss": 0.336, + "step": 5795 + }, + { + "epoch": 0.4591800356506239, + "grad_norm": 1.6648417341917845, + "learning_rate": 1.1797939026578941e-05, + "loss": 0.3255, + "step": 5796 + }, + { + "epoch": 0.45925925925925926, + "grad_norm": 1.674855659653074, + "learning_rate": 1.1795414750958265e-05, + "loss": 0.3838, + "step": 5797 + }, + { + "epoch": 0.45933848286789464, + "grad_norm": 1.5051472501295597, + "learning_rate": 1.1792890357118165e-05, + "loss": 0.3576, + "step": 5798 + }, + { + "epoch": 0.45941770647653, + "grad_norm": 1.3496815061636451, + "learning_rate": 1.1790365845224866e-05, + "loss": 0.2512, + "step": 5799 + }, + { + "epoch": 0.4594969300851654, + "grad_norm": 1.5108426684795782, + "learning_rate": 1.1787841215444588e-05, + "loss": 0.4088, + "step": 5800 + }, + { + "epoch": 0.4595761536938007, + "grad_norm": 1.5901979752085984, + "learning_rate": 1.1785316467943568e-05, + "loss": 0.3537, + "step": 5801 + }, + { + "epoch": 0.4596553773024361, + "grad_norm": 1.7061098616246408, + "learning_rate": 1.1782791602888052e-05, + "loss": 0.3571, + "step": 5802 + }, + { + "epoch": 0.4597346009110715, + "grad_norm": 1.5201164721215994, + "learning_rate": 1.1780266620444285e-05, + "loss": 0.242, + "step": 5803 + }, + { + "epoch": 0.45981382451970687, + "grad_norm": 1.2207544637052976, + "learning_rate": 1.1777741520778529e-05, + "loss": 0.2173, + "step": 5804 + }, + { + "epoch": 0.45989304812834225, + "grad_norm": 1.4754438542813102, + "learning_rate": 1.1775216304057046e-05, + "loss": 0.435, + "step": 5805 + }, + { + "epoch": 0.45997227173697763, + "grad_norm": 1.1743709684450185, + "learning_rate": 1.1772690970446113e-05, + "loss": 0.2146, + "step": 5806 + }, + { + "epoch": 0.460051495345613, + "grad_norm": 1.237303037004652, + "learning_rate": 1.177016552011201e-05, + "loss": 0.2859, + "step": 5807 + }, + { + "epoch": 0.46013071895424834, + "grad_norm": 1.2029282991803614, + "learning_rate": 1.176763995322102e-05, + "loss": 0.234, + "step": 5808 + }, + { + "epoch": 0.4602099425628837, + "grad_norm": 1.2952291340166198, + "learning_rate": 1.1765114269939448e-05, + "loss": 0.2942, + "step": 5809 + }, + { + "epoch": 0.4602891661715191, + "grad_norm": 1.3861406224676602, + "learning_rate": 1.1762588470433593e-05, + "loss": 0.3288, + "step": 5810 + }, + { + "epoch": 0.4603683897801545, + "grad_norm": 1.4694750471892293, + "learning_rate": 1.176006255486977e-05, + "loss": 0.352, + "step": 5811 + }, + { + "epoch": 0.46044761338878987, + "grad_norm": 1.6924848631070186, + "learning_rate": 1.1757536523414297e-05, + "loss": 0.3088, + "step": 5812 + }, + { + "epoch": 0.46052683699742525, + "grad_norm": 1.4901303566245971, + "learning_rate": 1.1755010376233498e-05, + "loss": 0.3659, + "step": 5813 + }, + { + "epoch": 0.46060606060606063, + "grad_norm": 1.3640039024549215, + "learning_rate": 1.175248411349371e-05, + "loss": 0.2377, + "step": 5814 + }, + { + "epoch": 0.46068528421469596, + "grad_norm": 1.3703779517395518, + "learning_rate": 1.1749957735361279e-05, + "loss": 0.3209, + "step": 5815 + }, + { + "epoch": 0.46076450782333134, + "grad_norm": 1.439294206824867, + "learning_rate": 1.174743124200255e-05, + "loss": 0.3338, + "step": 5816 + }, + { + "epoch": 0.4608437314319667, + "grad_norm": 1.3179582780701287, + "learning_rate": 1.1744904633583883e-05, + "loss": 0.2568, + "step": 5817 + }, + { + "epoch": 0.4609229550406021, + "grad_norm": 1.3738466216599348, + "learning_rate": 1.1742377910271638e-05, + "loss": 0.3188, + "step": 5818 + }, + { + "epoch": 0.4610021786492375, + "grad_norm": 1.2526366875212827, + "learning_rate": 1.1739851072232195e-05, + "loss": 0.1994, + "step": 5819 + }, + { + "epoch": 0.46108140225787286, + "grad_norm": 1.353160274388425, + "learning_rate": 1.1737324119631927e-05, + "loss": 0.2055, + "step": 5820 + }, + { + "epoch": 0.46116062586650824, + "grad_norm": 1.7081065976857985, + "learning_rate": 1.173479705263723e-05, + "loss": 0.4134, + "step": 5821 + }, + { + "epoch": 0.46123984947514357, + "grad_norm": 1.2886692529671708, + "learning_rate": 1.1732269871414492e-05, + "loss": 0.2103, + "step": 5822 + }, + { + "epoch": 0.46131907308377895, + "grad_norm": 1.325797169787858, + "learning_rate": 1.1729742576130119e-05, + "loss": 0.3467, + "step": 5823 + }, + { + "epoch": 0.46139829669241433, + "grad_norm": 1.5460490773505353, + "learning_rate": 1.1727215166950519e-05, + "loss": 0.3146, + "step": 5824 + }, + { + "epoch": 0.4614775203010497, + "grad_norm": 1.3872533892595444, + "learning_rate": 1.172468764404211e-05, + "loss": 0.2733, + "step": 5825 + }, + { + "epoch": 0.4615567439096851, + "grad_norm": 1.4970445979959512, + "learning_rate": 1.172216000757132e-05, + "loss": 0.2673, + "step": 5826 + }, + { + "epoch": 0.4616359675183205, + "grad_norm": 1.2613006210776547, + "learning_rate": 1.1719632257704581e-05, + "loss": 0.2661, + "step": 5827 + }, + { + "epoch": 0.46171519112695586, + "grad_norm": 1.3411424333430852, + "learning_rate": 1.171710439460833e-05, + "loss": 0.2484, + "step": 5828 + }, + { + "epoch": 0.4617944147355912, + "grad_norm": 1.3555948879770965, + "learning_rate": 1.1714576418449017e-05, + "loss": 0.2856, + "step": 5829 + }, + { + "epoch": 0.46187363834422657, + "grad_norm": 1.5903874552063866, + "learning_rate": 1.1712048329393097e-05, + "loss": 0.4079, + "step": 5830 + }, + { + "epoch": 0.46195286195286195, + "grad_norm": 1.1493036164470645, + "learning_rate": 1.1709520127607035e-05, + "loss": 0.1884, + "step": 5831 + }, + { + "epoch": 0.46203208556149733, + "grad_norm": 1.3868844470942308, + "learning_rate": 1.1706991813257295e-05, + "loss": 0.2199, + "step": 5832 + }, + { + "epoch": 0.4621113091701327, + "grad_norm": 1.3331341065218882, + "learning_rate": 1.1704463386510358e-05, + "loss": 0.2838, + "step": 5833 + }, + { + "epoch": 0.4621905327787681, + "grad_norm": 1.1197279688201989, + "learning_rate": 1.170193484753271e-05, + "loss": 0.1892, + "step": 5834 + }, + { + "epoch": 0.4622697563874034, + "grad_norm": 1.583624950072256, + "learning_rate": 1.169940619649084e-05, + "loss": 0.3658, + "step": 5835 + }, + { + "epoch": 0.4623489799960388, + "grad_norm": 1.7080872713857547, + "learning_rate": 1.1696877433551248e-05, + "loss": 0.3536, + "step": 5836 + }, + { + "epoch": 0.4624282036046742, + "grad_norm": 1.069071998411596, + "learning_rate": 1.1694348558880447e-05, + "loss": 0.2347, + "step": 5837 + }, + { + "epoch": 0.46250742721330956, + "grad_norm": 1.4593226024461419, + "learning_rate": 1.1691819572644941e-05, + "loss": 0.3536, + "step": 5838 + }, + { + "epoch": 0.46258665082194494, + "grad_norm": 1.4287101868376675, + "learning_rate": 1.1689290475011258e-05, + "loss": 0.3557, + "step": 5839 + }, + { + "epoch": 0.4626658744305803, + "grad_norm": 1.2918833237530993, + "learning_rate": 1.1686761266145926e-05, + "loss": 0.2853, + "step": 5840 + }, + { + "epoch": 0.4627450980392157, + "grad_norm": 1.4059293127245893, + "learning_rate": 1.1684231946215478e-05, + "loss": 0.233, + "step": 5841 + }, + { + "epoch": 0.46282432164785103, + "grad_norm": 1.3019925088270827, + "learning_rate": 1.1681702515386466e-05, + "loss": 0.2923, + "step": 5842 + }, + { + "epoch": 0.4629035452564864, + "grad_norm": 1.2342383719973151, + "learning_rate": 1.167917297382543e-05, + "loss": 0.2533, + "step": 5843 + }, + { + "epoch": 0.4629827688651218, + "grad_norm": 1.526115511376604, + "learning_rate": 1.1676643321698934e-05, + "loss": 0.2866, + "step": 5844 + }, + { + "epoch": 0.4630619924737572, + "grad_norm": 1.34552946388799, + "learning_rate": 1.1674113559173548e-05, + "loss": 0.3095, + "step": 5845 + }, + { + "epoch": 0.46314121608239256, + "grad_norm": 1.4549767257678594, + "learning_rate": 1.1671583686415833e-05, + "loss": 0.3155, + "step": 5846 + }, + { + "epoch": 0.46322043969102794, + "grad_norm": 1.990093310569591, + "learning_rate": 1.1669053703592381e-05, + "loss": 0.3812, + "step": 5847 + }, + { + "epoch": 0.4632996632996633, + "grad_norm": 1.4509912979858135, + "learning_rate": 1.1666523610869769e-05, + "loss": 0.3414, + "step": 5848 + }, + { + "epoch": 0.46337888690829865, + "grad_norm": 1.6202417294566522, + "learning_rate": 1.1663993408414597e-05, + "loss": 0.3421, + "step": 5849 + }, + { + "epoch": 0.46345811051693403, + "grad_norm": 1.0987634947033051, + "learning_rate": 1.1661463096393468e-05, + "loss": 0.1903, + "step": 5850 + }, + { + "epoch": 0.4635373341255694, + "grad_norm": 1.334617601431492, + "learning_rate": 1.1658932674972985e-05, + "loss": 0.3234, + "step": 5851 + }, + { + "epoch": 0.4636165577342048, + "grad_norm": 1.2463186147500964, + "learning_rate": 1.1656402144319772e-05, + "loss": 0.2276, + "step": 5852 + }, + { + "epoch": 0.4636957813428402, + "grad_norm": 1.298234601286431, + "learning_rate": 1.1653871504600445e-05, + "loss": 0.249, + "step": 5853 + }, + { + "epoch": 0.46377500495147556, + "grad_norm": 1.5794023520799612, + "learning_rate": 1.1651340755981634e-05, + "loss": 0.3443, + "step": 5854 + }, + { + "epoch": 0.46385422856011094, + "grad_norm": 1.2784290013496649, + "learning_rate": 1.1648809898629987e-05, + "loss": 0.3174, + "step": 5855 + }, + { + "epoch": 0.46393345216874626, + "grad_norm": 1.452076026122277, + "learning_rate": 1.1646278932712138e-05, + "loss": 0.3894, + "step": 5856 + }, + { + "epoch": 0.46401267577738164, + "grad_norm": 1.3344227301528433, + "learning_rate": 1.1643747858394743e-05, + "loss": 0.2706, + "step": 5857 + }, + { + "epoch": 0.464091899386017, + "grad_norm": 0.9562097964729624, + "learning_rate": 1.1641216675844461e-05, + "loss": 0.1585, + "step": 5858 + }, + { + "epoch": 0.4641711229946524, + "grad_norm": 1.5174806803779837, + "learning_rate": 1.1638685385227958e-05, + "loss": 0.2979, + "step": 5859 + }, + { + "epoch": 0.4642503466032878, + "grad_norm": 1.6830460786407306, + "learning_rate": 1.1636153986711906e-05, + "loss": 0.403, + "step": 5860 + }, + { + "epoch": 0.46432957021192317, + "grad_norm": 1.3076463841194843, + "learning_rate": 1.163362248046299e-05, + "loss": 0.2632, + "step": 5861 + }, + { + "epoch": 0.46440879382055855, + "grad_norm": 1.4283172494652356, + "learning_rate": 1.1631090866647891e-05, + "loss": 0.2957, + "step": 5862 + }, + { + "epoch": 0.4644880174291939, + "grad_norm": 1.181185972061132, + "learning_rate": 1.1628559145433308e-05, + "loss": 0.2359, + "step": 5863 + }, + { + "epoch": 0.46456724103782926, + "grad_norm": 1.4970012536710695, + "learning_rate": 1.1626027316985942e-05, + "loss": 0.2791, + "step": 5864 + }, + { + "epoch": 0.46464646464646464, + "grad_norm": 1.559380902925295, + "learning_rate": 1.1623495381472499e-05, + "loss": 0.3301, + "step": 5865 + }, + { + "epoch": 0.4647256882551, + "grad_norm": 1.09724164281304, + "learning_rate": 1.16209633390597e-05, + "loss": 0.1572, + "step": 5866 + }, + { + "epoch": 0.4648049118637354, + "grad_norm": 1.2164473046723352, + "learning_rate": 1.161843118991426e-05, + "loss": 0.2158, + "step": 5867 + }, + { + "epoch": 0.4648841354723708, + "grad_norm": 1.6507502862357475, + "learning_rate": 1.1615898934202917e-05, + "loss": 0.3585, + "step": 5868 + }, + { + "epoch": 0.46496335908100617, + "grad_norm": 1.5215294171972211, + "learning_rate": 1.1613366572092404e-05, + "loss": 0.3672, + "step": 5869 + }, + { + "epoch": 0.4650425826896415, + "grad_norm": 1.449691723728734, + "learning_rate": 1.1610834103749465e-05, + "loss": 0.2966, + "step": 5870 + }, + { + "epoch": 0.4651218062982769, + "grad_norm": 1.6031097741537381, + "learning_rate": 1.1608301529340848e-05, + "loss": 0.338, + "step": 5871 + }, + { + "epoch": 0.46520102990691226, + "grad_norm": 1.3388235624601201, + "learning_rate": 1.1605768849033318e-05, + "loss": 0.2733, + "step": 5872 + }, + { + "epoch": 0.46528025351554764, + "grad_norm": 1.3082532811308232, + "learning_rate": 1.1603236062993635e-05, + "loss": 0.2612, + "step": 5873 + }, + { + "epoch": 0.465359477124183, + "grad_norm": 1.3401811728108182, + "learning_rate": 1.1600703171388572e-05, + "loss": 0.3436, + "step": 5874 + }, + { + "epoch": 0.4654387007328184, + "grad_norm": 1.4250120946522484, + "learning_rate": 1.1598170174384907e-05, + "loss": 0.2687, + "step": 5875 + }, + { + "epoch": 0.4655179243414537, + "grad_norm": 1.7167241785795126, + "learning_rate": 1.1595637072149424e-05, + "loss": 0.4221, + "step": 5876 + }, + { + "epoch": 0.4655971479500891, + "grad_norm": 1.3247607893483335, + "learning_rate": 1.159310386484892e-05, + "loss": 0.2892, + "step": 5877 + }, + { + "epoch": 0.4656763715587245, + "grad_norm": 1.4817050289616598, + "learning_rate": 1.159057055265019e-05, + "loss": 0.277, + "step": 5878 + }, + { + "epoch": 0.46575559516735987, + "grad_norm": 1.8994949105587406, + "learning_rate": 1.1588037135720043e-05, + "loss": 0.3407, + "step": 5879 + }, + { + "epoch": 0.46583481877599525, + "grad_norm": 1.3109245555235354, + "learning_rate": 1.1585503614225292e-05, + "loss": 0.272, + "step": 5880 + }, + { + "epoch": 0.46591404238463063, + "grad_norm": 1.3529965601333391, + "learning_rate": 1.1582969988332757e-05, + "loss": 0.2307, + "step": 5881 + }, + { + "epoch": 0.465993265993266, + "grad_norm": 1.3791916813137433, + "learning_rate": 1.1580436258209266e-05, + "loss": 0.3558, + "step": 5882 + }, + { + "epoch": 0.46607248960190134, + "grad_norm": 1.402465504951365, + "learning_rate": 1.1577902424021653e-05, + "loss": 0.2872, + "step": 5883 + }, + { + "epoch": 0.4661517132105367, + "grad_norm": 1.5555047853239525, + "learning_rate": 1.1575368485936752e-05, + "loss": 0.3506, + "step": 5884 + }, + { + "epoch": 0.4662309368191721, + "grad_norm": 1.4793861360882956, + "learning_rate": 1.1572834444121424e-05, + "loss": 0.3755, + "step": 5885 + }, + { + "epoch": 0.4663101604278075, + "grad_norm": 1.3379543298678456, + "learning_rate": 1.157030029874251e-05, + "loss": 0.2952, + "step": 5886 + }, + { + "epoch": 0.46638938403644287, + "grad_norm": 1.5695977445171625, + "learning_rate": 1.1567766049966882e-05, + "loss": 0.3603, + "step": 5887 + }, + { + "epoch": 0.46646860764507825, + "grad_norm": 1.2398785096113882, + "learning_rate": 1.1565231697961398e-05, + "loss": 0.2797, + "step": 5888 + }, + { + "epoch": 0.46654783125371363, + "grad_norm": 1.2241398310527707, + "learning_rate": 1.1562697242892939e-05, + "loss": 0.2543, + "step": 5889 + }, + { + "epoch": 0.46662705486234896, + "grad_norm": 1.383505644409573, + "learning_rate": 1.156016268492839e-05, + "loss": 0.2983, + "step": 5890 + }, + { + "epoch": 0.46670627847098434, + "grad_norm": 1.2846054374557785, + "learning_rate": 1.155762802423463e-05, + "loss": 0.3342, + "step": 5891 + }, + { + "epoch": 0.4667855020796197, + "grad_norm": 1.1907580795481385, + "learning_rate": 1.1555093260978562e-05, + "loss": 0.1971, + "step": 5892 + }, + { + "epoch": 0.4668647256882551, + "grad_norm": 1.407457724818261, + "learning_rate": 1.1552558395327087e-05, + "loss": 0.2772, + "step": 5893 + }, + { + "epoch": 0.4669439492968905, + "grad_norm": 1.2549333894460415, + "learning_rate": 1.155002342744711e-05, + "loss": 0.2265, + "step": 5894 + }, + { + "epoch": 0.46702317290552586, + "grad_norm": 1.5459277325212406, + "learning_rate": 1.1547488357505549e-05, + "loss": 0.3602, + "step": 5895 + }, + { + "epoch": 0.46710239651416124, + "grad_norm": 1.5646472437035468, + "learning_rate": 1.1544953185669327e-05, + "loss": 0.2949, + "step": 5896 + }, + { + "epoch": 0.46718162012279657, + "grad_norm": 1.356841730982432, + "learning_rate": 1.154241791210537e-05, + "loss": 0.235, + "step": 5897 + }, + { + "epoch": 0.46726084373143195, + "grad_norm": 1.2008078649428995, + "learning_rate": 1.1539882536980616e-05, + "loss": 0.2086, + "step": 5898 + }, + { + "epoch": 0.46734006734006733, + "grad_norm": 1.6774727721437268, + "learning_rate": 1.1537347060462007e-05, + "loss": 0.3408, + "step": 5899 + }, + { + "epoch": 0.4674192909487027, + "grad_norm": 1.5095871506198835, + "learning_rate": 1.1534811482716487e-05, + "loss": 0.2959, + "step": 5900 + }, + { + "epoch": 0.4674985145573381, + "grad_norm": 1.479597507570882, + "learning_rate": 1.1532275803911021e-05, + "loss": 0.3246, + "step": 5901 + }, + { + "epoch": 0.4675777381659735, + "grad_norm": 1.4314014571486418, + "learning_rate": 1.1529740024212566e-05, + "loss": 0.2955, + "step": 5902 + }, + { + "epoch": 0.46765696177460886, + "grad_norm": 1.4497805663916152, + "learning_rate": 1.1527204143788086e-05, + "loss": 0.3409, + "step": 5903 + }, + { + "epoch": 0.4677361853832442, + "grad_norm": 1.7079909826979645, + "learning_rate": 1.1524668162804566e-05, + "loss": 0.3229, + "step": 5904 + }, + { + "epoch": 0.46781540899187957, + "grad_norm": 1.4018138618292848, + "learning_rate": 1.1522132081428982e-05, + "loss": 0.2869, + "step": 5905 + }, + { + "epoch": 0.46789463260051495, + "grad_norm": 1.5362083590279747, + "learning_rate": 1.1519595899828325e-05, + "loss": 0.384, + "step": 5906 + }, + { + "epoch": 0.46797385620915033, + "grad_norm": 1.279129152749894, + "learning_rate": 1.151705961816959e-05, + "loss": 0.388, + "step": 5907 + }, + { + "epoch": 0.4680530798177857, + "grad_norm": 1.491971794167894, + "learning_rate": 1.151452323661978e-05, + "loss": 0.3205, + "step": 5908 + }, + { + "epoch": 0.4681323034264211, + "grad_norm": 1.4883387354921294, + "learning_rate": 1.15119867553459e-05, + "loss": 0.3121, + "step": 5909 + }, + { + "epoch": 0.4682115270350565, + "grad_norm": 1.3052414195923816, + "learning_rate": 1.150945017451497e-05, + "loss": 0.2389, + "step": 5910 + }, + { + "epoch": 0.4682907506436918, + "grad_norm": 1.2167432251666415, + "learning_rate": 1.1506913494294005e-05, + "loss": 0.2623, + "step": 5911 + }, + { + "epoch": 0.4683699742523272, + "grad_norm": 1.3460327688366835, + "learning_rate": 1.1504376714850041e-05, + "loss": 0.3318, + "step": 5912 + }, + { + "epoch": 0.46844919786096256, + "grad_norm": 1.4621456928169723, + "learning_rate": 1.1501839836350106e-05, + "loss": 0.31, + "step": 5913 + }, + { + "epoch": 0.46852842146959794, + "grad_norm": 1.2594011445357711, + "learning_rate": 1.1499302858961245e-05, + "loss": 0.3236, + "step": 5914 + }, + { + "epoch": 0.4686076450782333, + "grad_norm": 1.1108403646694895, + "learning_rate": 1.1496765782850507e-05, + "loss": 0.2505, + "step": 5915 + }, + { + "epoch": 0.4686868686868687, + "grad_norm": 1.496173446371289, + "learning_rate": 1.149422860818494e-05, + "loss": 0.3351, + "step": 5916 + }, + { + "epoch": 0.46876609229550403, + "grad_norm": 1.6609630208205832, + "learning_rate": 1.1491691335131614e-05, + "loss": 0.3534, + "step": 5917 + }, + { + "epoch": 0.4688453159041394, + "grad_norm": 1.242232844219903, + "learning_rate": 1.148915396385759e-05, + "loss": 0.2511, + "step": 5918 + }, + { + "epoch": 0.4689245395127748, + "grad_norm": 1.4020675928915163, + "learning_rate": 1.1486616494529939e-05, + "loss": 0.2571, + "step": 5919 + }, + { + "epoch": 0.4690037631214102, + "grad_norm": 1.3758723864915638, + "learning_rate": 1.1484078927315749e-05, + "loss": 0.2942, + "step": 5920 + }, + { + "epoch": 0.46908298673004556, + "grad_norm": 1.4871185537974108, + "learning_rate": 1.1481541262382102e-05, + "loss": 0.2218, + "step": 5921 + }, + { + "epoch": 0.46916221033868094, + "grad_norm": 1.195507971647317, + "learning_rate": 1.1479003499896089e-05, + "loss": 0.2192, + "step": 5922 + }, + { + "epoch": 0.4692414339473163, + "grad_norm": 1.316782426701206, + "learning_rate": 1.1476465640024814e-05, + "loss": 0.2656, + "step": 5923 + }, + { + "epoch": 0.46932065755595165, + "grad_norm": 1.477730636338676, + "learning_rate": 1.147392768293538e-05, + "loss": 0.2566, + "step": 5924 + }, + { + "epoch": 0.46939988116458703, + "grad_norm": 1.5364536641776358, + "learning_rate": 1.1471389628794902e-05, + "loss": 0.3449, + "step": 5925 + }, + { + "epoch": 0.4694791047732224, + "grad_norm": 1.5516792838355584, + "learning_rate": 1.1468851477770495e-05, + "loss": 0.3398, + "step": 5926 + }, + { + "epoch": 0.4695583283818578, + "grad_norm": 1.3872521058879526, + "learning_rate": 1.1466313230029284e-05, + "loss": 0.3252, + "step": 5927 + }, + { + "epoch": 0.4696375519904932, + "grad_norm": 1.7100901729966975, + "learning_rate": 1.1463774885738408e-05, + "loss": 0.3753, + "step": 5928 + }, + { + "epoch": 0.46971677559912856, + "grad_norm": 1.6736545382458248, + "learning_rate": 1.1461236445064993e-05, + "loss": 0.2758, + "step": 5929 + }, + { + "epoch": 0.46979599920776394, + "grad_norm": 1.2981479966578386, + "learning_rate": 1.1458697908176194e-05, + "loss": 0.308, + "step": 5930 + }, + { + "epoch": 0.46987522281639926, + "grad_norm": 1.3274911620652021, + "learning_rate": 1.1456159275239153e-05, + "loss": 0.2804, + "step": 5931 + }, + { + "epoch": 0.46995444642503464, + "grad_norm": 1.5261867594043812, + "learning_rate": 1.1453620546421032e-05, + "loss": 0.2517, + "step": 5932 + }, + { + "epoch": 0.47003367003367, + "grad_norm": 1.4038590648956586, + "learning_rate": 1.1451081721888992e-05, + "loss": 0.267, + "step": 5933 + }, + { + "epoch": 0.4701128936423054, + "grad_norm": 1.499493483332517, + "learning_rate": 1.1448542801810203e-05, + "loss": 0.3292, + "step": 5934 + }, + { + "epoch": 0.4701921172509408, + "grad_norm": 1.6769715140374728, + "learning_rate": 1.144600378635184e-05, + "loss": 0.2732, + "step": 5935 + }, + { + "epoch": 0.47027134085957617, + "grad_norm": 1.5400471128934965, + "learning_rate": 1.1443464675681089e-05, + "loss": 0.3277, + "step": 5936 + }, + { + "epoch": 0.47035056446821155, + "grad_norm": 1.376606461102735, + "learning_rate": 1.1440925469965129e-05, + "loss": 0.284, + "step": 5937 + }, + { + "epoch": 0.4704297880768469, + "grad_norm": 1.6647905708565027, + "learning_rate": 1.1438386169371164e-05, + "loss": 0.37, + "step": 5938 + }, + { + "epoch": 0.47050901168548226, + "grad_norm": 1.8576555372334442, + "learning_rate": 1.143584677406639e-05, + "loss": 0.3821, + "step": 5939 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 1.6176689734431222, + "learning_rate": 1.1433307284218014e-05, + "loss": 0.3724, + "step": 5940 + }, + { + "epoch": 0.470667458902753, + "grad_norm": 1.154470501836486, + "learning_rate": 1.1430767699993247e-05, + "loss": 0.236, + "step": 5941 + }, + { + "epoch": 0.4707466825113884, + "grad_norm": 1.4834278086407116, + "learning_rate": 1.1428228021559316e-05, + "loss": 0.2748, + "step": 5942 + }, + { + "epoch": 0.4708259061200238, + "grad_norm": 1.4147886401587715, + "learning_rate": 1.142568824908344e-05, + "loss": 0.2135, + "step": 5943 + }, + { + "epoch": 0.47090512972865917, + "grad_norm": 1.2925225013373358, + "learning_rate": 1.1423148382732854e-05, + "loss": 0.2721, + "step": 5944 + }, + { + "epoch": 0.4709843533372945, + "grad_norm": 1.218865879245756, + "learning_rate": 1.1420608422674793e-05, + "loss": 0.205, + "step": 5945 + }, + { + "epoch": 0.4710635769459299, + "grad_norm": 1.2818951399811043, + "learning_rate": 1.1418068369076503e-05, + "loss": 0.2212, + "step": 5946 + }, + { + "epoch": 0.47114280055456526, + "grad_norm": 1.5054391030063439, + "learning_rate": 1.1415528222105237e-05, + "loss": 0.3062, + "step": 5947 + }, + { + "epoch": 0.47122202416320064, + "grad_norm": 1.4852779923486854, + "learning_rate": 1.1412987981928245e-05, + "loss": 0.3026, + "step": 5948 + }, + { + "epoch": 0.471301247771836, + "grad_norm": 1.770618919107372, + "learning_rate": 1.1410447648712795e-05, + "loss": 0.3651, + "step": 5949 + }, + { + "epoch": 0.4713804713804714, + "grad_norm": 1.3022645623025109, + "learning_rate": 1.1407907222626156e-05, + "loss": 0.2436, + "step": 5950 + }, + { + "epoch": 0.4714596949891068, + "grad_norm": 1.8189120329705482, + "learning_rate": 1.1405366703835596e-05, + "loss": 0.4105, + "step": 5951 + }, + { + "epoch": 0.4715389185977421, + "grad_norm": 1.4035210034995014, + "learning_rate": 1.1402826092508405e-05, + "loss": 0.317, + "step": 5952 + }, + { + "epoch": 0.4716181422063775, + "grad_norm": 1.317219889470921, + "learning_rate": 1.1400285388811862e-05, + "loss": 0.2371, + "step": 5953 + }, + { + "epoch": 0.47169736581501287, + "grad_norm": 1.5607656846624318, + "learning_rate": 1.1397744592913268e-05, + "loss": 0.492, + "step": 5954 + }, + { + "epoch": 0.47177658942364825, + "grad_norm": 1.4225192075784638, + "learning_rate": 1.1395203704979915e-05, + "loss": 0.2534, + "step": 5955 + }, + { + "epoch": 0.47185581303228363, + "grad_norm": 1.239232030738451, + "learning_rate": 1.1392662725179114e-05, + "loss": 0.2385, + "step": 5956 + }, + { + "epoch": 0.471935036640919, + "grad_norm": 1.3459192602928847, + "learning_rate": 1.139012165367817e-05, + "loss": 0.3007, + "step": 5957 + }, + { + "epoch": 0.47201426024955434, + "grad_norm": 1.5164196766937215, + "learning_rate": 1.1387580490644408e-05, + "loss": 0.2717, + "step": 5958 + }, + { + "epoch": 0.4720934838581897, + "grad_norm": 1.3116721827093707, + "learning_rate": 1.1385039236245143e-05, + "loss": 0.2823, + "step": 5959 + }, + { + "epoch": 0.4721727074668251, + "grad_norm": 1.404417349529674, + "learning_rate": 1.1382497890647712e-05, + "loss": 0.3753, + "step": 5960 + }, + { + "epoch": 0.4722519310754605, + "grad_norm": 1.3195655864414761, + "learning_rate": 1.1379956454019445e-05, + "loss": 0.2571, + "step": 5961 + }, + { + "epoch": 0.47233115468409587, + "grad_norm": 1.2762592388574814, + "learning_rate": 1.1377414926527688e-05, + "loss": 0.2989, + "step": 5962 + }, + { + "epoch": 0.47241037829273125, + "grad_norm": 1.3396648047037105, + "learning_rate": 1.1374873308339784e-05, + "loss": 0.3004, + "step": 5963 + }, + { + "epoch": 0.47248960190136663, + "grad_norm": 1.6120739594418916, + "learning_rate": 1.1372331599623088e-05, + "loss": 0.3593, + "step": 5964 + }, + { + "epoch": 0.47256882551000196, + "grad_norm": 1.8364815022000327, + "learning_rate": 1.136978980054496e-05, + "loss": 0.4019, + "step": 5965 + }, + { + "epoch": 0.47264804911863734, + "grad_norm": 1.2683728517848438, + "learning_rate": 1.1367247911272765e-05, + "loss": 0.2819, + "step": 5966 + }, + { + "epoch": 0.4727272727272727, + "grad_norm": 1.370140902306888, + "learning_rate": 1.1364705931973872e-05, + "loss": 0.281, + "step": 5967 + }, + { + "epoch": 0.4728064963359081, + "grad_norm": 1.5152248178680985, + "learning_rate": 1.1362163862815663e-05, + "loss": 0.2529, + "step": 5968 + }, + { + "epoch": 0.4728857199445435, + "grad_norm": 1.5128497787601511, + "learning_rate": 1.1359621703965516e-05, + "loss": 0.3365, + "step": 5969 + }, + { + "epoch": 0.47296494355317886, + "grad_norm": 1.561562238164092, + "learning_rate": 1.135707945559082e-05, + "loss": 0.2592, + "step": 5970 + }, + { + "epoch": 0.47304416716181424, + "grad_norm": 1.5266372534147483, + "learning_rate": 1.1354537117858975e-05, + "loss": 0.3673, + "step": 5971 + }, + { + "epoch": 0.47312339077044957, + "grad_norm": 1.2184324700492684, + "learning_rate": 1.1351994690937377e-05, + "loss": 0.2603, + "step": 5972 + }, + { + "epoch": 0.47320261437908495, + "grad_norm": 1.3089692427512045, + "learning_rate": 1.1349452174993437e-05, + "loss": 0.2772, + "step": 5973 + }, + { + "epoch": 0.47328183798772033, + "grad_norm": 1.3054058114184117, + "learning_rate": 1.1346909570194558e-05, + "loss": 0.2523, + "step": 5974 + }, + { + "epoch": 0.4733610615963557, + "grad_norm": 1.3652135593794648, + "learning_rate": 1.134436687670817e-05, + "loss": 0.2676, + "step": 5975 + }, + { + "epoch": 0.4734402852049911, + "grad_norm": 1.4566075736221442, + "learning_rate": 1.134182409470169e-05, + "loss": 0.3784, + "step": 5976 + }, + { + "epoch": 0.4735195088136265, + "grad_norm": 1.2841338732952017, + "learning_rate": 1.133928122434255e-05, + "loss": 0.2441, + "step": 5977 + }, + { + "epoch": 0.47359873242226186, + "grad_norm": 1.3276317416263952, + "learning_rate": 1.1336738265798187e-05, + "loss": 0.2396, + "step": 5978 + }, + { + "epoch": 0.4736779560308972, + "grad_norm": 1.2583401128047258, + "learning_rate": 1.1334195219236039e-05, + "loss": 0.243, + "step": 5979 + }, + { + "epoch": 0.47375717963953257, + "grad_norm": 1.2683330238971544, + "learning_rate": 1.1331652084823554e-05, + "loss": 0.2722, + "step": 5980 + }, + { + "epoch": 0.47383640324816795, + "grad_norm": 1.205500992216874, + "learning_rate": 1.1329108862728192e-05, + "loss": 0.2435, + "step": 5981 + }, + { + "epoch": 0.47391562685680333, + "grad_norm": 1.760538755431643, + "learning_rate": 1.1326565553117404e-05, + "loss": 0.348, + "step": 5982 + }, + { + "epoch": 0.4739948504654387, + "grad_norm": 1.27579246952634, + "learning_rate": 1.1324022156158654e-05, + "loss": 0.2266, + "step": 5983 + }, + { + "epoch": 0.4740740740740741, + "grad_norm": 1.229733502088595, + "learning_rate": 1.132147867201942e-05, + "loss": 0.2496, + "step": 5984 + }, + { + "epoch": 0.4741532976827095, + "grad_norm": 1.28064531087544, + "learning_rate": 1.1318935100867172e-05, + "loss": 0.274, + "step": 5985 + }, + { + "epoch": 0.4742325212913448, + "grad_norm": 1.1309236981176227, + "learning_rate": 1.1316391442869394e-05, + "loss": 0.233, + "step": 5986 + }, + { + "epoch": 0.4743117448999802, + "grad_norm": 1.5470727135678277, + "learning_rate": 1.1313847698193577e-05, + "loss": 0.3193, + "step": 5987 + }, + { + "epoch": 0.47439096850861556, + "grad_norm": 1.457668273838197, + "learning_rate": 1.1311303867007207e-05, + "loss": 0.301, + "step": 5988 + }, + { + "epoch": 0.47447019211725094, + "grad_norm": 1.4992119551980507, + "learning_rate": 1.1308759949477786e-05, + "loss": 0.3176, + "step": 5989 + }, + { + "epoch": 0.4745494157258863, + "grad_norm": 1.5842216988059632, + "learning_rate": 1.1306215945772823e-05, + "loss": 0.3275, + "step": 5990 + }, + { + "epoch": 0.4746286393345217, + "grad_norm": 1.493669317042276, + "learning_rate": 1.1303671856059824e-05, + "loss": 0.2543, + "step": 5991 + }, + { + "epoch": 0.4747078629431571, + "grad_norm": 1.251839685129144, + "learning_rate": 1.1301127680506305e-05, + "loss": 0.2492, + "step": 5992 + }, + { + "epoch": 0.4747870865517924, + "grad_norm": 1.633406133734667, + "learning_rate": 1.1298583419279792e-05, + "loss": 0.3561, + "step": 5993 + }, + { + "epoch": 0.4748663101604278, + "grad_norm": 1.6011566996730517, + "learning_rate": 1.1296039072547804e-05, + "loss": 0.3474, + "step": 5994 + }, + { + "epoch": 0.4749455337690632, + "grad_norm": 1.2264189686986706, + "learning_rate": 1.1293494640477885e-05, + "loss": 0.271, + "step": 5995 + }, + { + "epoch": 0.47502475737769856, + "grad_norm": 1.2919760214549674, + "learning_rate": 1.1290950123237564e-05, + "loss": 0.2586, + "step": 5996 + }, + { + "epoch": 0.47510398098633394, + "grad_norm": 1.6796117248621207, + "learning_rate": 1.128840552099439e-05, + "loss": 0.4553, + "step": 5997 + }, + { + "epoch": 0.4751832045949693, + "grad_norm": 1.227128886322809, + "learning_rate": 1.1285860833915914e-05, + "loss": 0.1913, + "step": 5998 + }, + { + "epoch": 0.47526242820360465, + "grad_norm": 1.2250094891337917, + "learning_rate": 1.1283316062169685e-05, + "loss": 0.2115, + "step": 5999 + }, + { + "epoch": 0.47534165181224003, + "grad_norm": 1.7108502469572162, + "learning_rate": 1.1280771205923269e-05, + "loss": 0.345, + "step": 6000 + }, + { + "epoch": 0.4754208754208754, + "grad_norm": 1.7599864752159344, + "learning_rate": 1.1278226265344234e-05, + "loss": 0.3063, + "step": 6001 + }, + { + "epoch": 0.4755000990295108, + "grad_norm": 2.2697419168620634, + "learning_rate": 1.127568124060015e-05, + "loss": 0.4012, + "step": 6002 + }, + { + "epoch": 0.4755793226381462, + "grad_norm": 1.2980018632857855, + "learning_rate": 1.1273136131858595e-05, + "loss": 0.245, + "step": 6003 + }, + { + "epoch": 0.47565854624678156, + "grad_norm": 1.3332070098050692, + "learning_rate": 1.1270590939287149e-05, + "loss": 0.3118, + "step": 6004 + }, + { + "epoch": 0.47573776985541694, + "grad_norm": 1.5320629454573327, + "learning_rate": 1.1268045663053404e-05, + "loss": 0.3201, + "step": 6005 + }, + { + "epoch": 0.47581699346405226, + "grad_norm": 1.1469372709242003, + "learning_rate": 1.1265500303324954e-05, + "loss": 0.1437, + "step": 6006 + }, + { + "epoch": 0.47589621707268764, + "grad_norm": 1.5315649411517587, + "learning_rate": 1.12629548602694e-05, + "loss": 0.3112, + "step": 6007 + }, + { + "epoch": 0.475975440681323, + "grad_norm": 1.5694279047961142, + "learning_rate": 1.1260409334054342e-05, + "loss": 0.4045, + "step": 6008 + }, + { + "epoch": 0.4760546642899584, + "grad_norm": 1.3664205732274488, + "learning_rate": 1.1257863724847398e-05, + "loss": 0.2469, + "step": 6009 + }, + { + "epoch": 0.4761338878985938, + "grad_norm": 1.5024692934757262, + "learning_rate": 1.1255318032816175e-05, + "loss": 0.3241, + "step": 6010 + }, + { + "epoch": 0.47621311150722917, + "grad_norm": 1.1753045154524353, + "learning_rate": 1.1252772258128303e-05, + "loss": 0.2462, + "step": 6011 + }, + { + "epoch": 0.47629233511586455, + "grad_norm": 1.1477444374949006, + "learning_rate": 1.1250226400951408e-05, + "loss": 0.2704, + "step": 6012 + }, + { + "epoch": 0.4763715587244999, + "grad_norm": 3.3942339342087204, + "learning_rate": 1.1247680461453114e-05, + "loss": 0.2028, + "step": 6013 + }, + { + "epoch": 0.47645078233313526, + "grad_norm": 1.329591636823522, + "learning_rate": 1.1245134439801073e-05, + "loss": 0.2634, + "step": 6014 + }, + { + "epoch": 0.47653000594177064, + "grad_norm": 1.343285052504467, + "learning_rate": 1.1242588336162916e-05, + "loss": 0.2703, + "step": 6015 + }, + { + "epoch": 0.476609229550406, + "grad_norm": 1.7792368334618545, + "learning_rate": 1.1240042150706296e-05, + "loss": 0.3568, + "step": 6016 + }, + { + "epoch": 0.4766884531590414, + "grad_norm": 1.5280482148636647, + "learning_rate": 1.1237495883598868e-05, + "loss": 0.3661, + "step": 6017 + }, + { + "epoch": 0.4767676767676768, + "grad_norm": 1.355486962129685, + "learning_rate": 1.1234949535008289e-05, + "loss": 0.2357, + "step": 6018 + }, + { + "epoch": 0.47684690037631217, + "grad_norm": 1.6909830984396892, + "learning_rate": 1.1232403105102226e-05, + "loss": 0.3152, + "step": 6019 + }, + { + "epoch": 0.4769261239849475, + "grad_norm": 1.5336670804054056, + "learning_rate": 1.122985659404835e-05, + "loss": 0.3227, + "step": 6020 + }, + { + "epoch": 0.4770053475935829, + "grad_norm": 1.2197454151740297, + "learning_rate": 1.1227310002014332e-05, + "loss": 0.2194, + "step": 6021 + }, + { + "epoch": 0.47708457120221825, + "grad_norm": 1.7414665018098963, + "learning_rate": 1.1224763329167859e-05, + "loss": 0.324, + "step": 6022 + }, + { + "epoch": 0.47716379481085364, + "grad_norm": 1.6191861133208167, + "learning_rate": 1.122221657567661e-05, + "loss": 0.2768, + "step": 6023 + }, + { + "epoch": 0.477243018419489, + "grad_norm": 1.374871305369388, + "learning_rate": 1.1219669741708282e-05, + "loss": 0.2296, + "step": 6024 + }, + { + "epoch": 0.4773222420281244, + "grad_norm": 1.5133038029946626, + "learning_rate": 1.121712282743057e-05, + "loss": 0.2706, + "step": 6025 + }, + { + "epoch": 0.4774014656367598, + "grad_norm": 1.7551595723008606, + "learning_rate": 1.1214575833011178e-05, + "loss": 0.3844, + "step": 6026 + }, + { + "epoch": 0.4774806892453951, + "grad_norm": 1.7334702973960008, + "learning_rate": 1.121202875861781e-05, + "loss": 0.4162, + "step": 6027 + }, + { + "epoch": 0.4775599128540305, + "grad_norm": 1.2567969880572345, + "learning_rate": 1.1209481604418182e-05, + "loss": 0.2887, + "step": 6028 + }, + { + "epoch": 0.47763913646266587, + "grad_norm": 1.4518389879777716, + "learning_rate": 1.1206934370580009e-05, + "loss": 0.2228, + "step": 6029 + }, + { + "epoch": 0.47771836007130125, + "grad_norm": 1.6290220968104137, + "learning_rate": 1.1204387057271016e-05, + "loss": 0.3764, + "step": 6030 + }, + { + "epoch": 0.47779758367993663, + "grad_norm": 1.4868357406347164, + "learning_rate": 1.1201839664658929e-05, + "loss": 0.2691, + "step": 6031 + }, + { + "epoch": 0.477876807288572, + "grad_norm": 1.7429748829828071, + "learning_rate": 1.1199292192911482e-05, + "loss": 0.359, + "step": 6032 + }, + { + "epoch": 0.47795603089720734, + "grad_norm": 1.5935227691304323, + "learning_rate": 1.1196744642196417e-05, + "loss": 0.3865, + "step": 6033 + }, + { + "epoch": 0.4780352545058427, + "grad_norm": 1.8330323041966783, + "learning_rate": 1.1194197012681473e-05, + "loss": 0.2916, + "step": 6034 + }, + { + "epoch": 0.4781144781144781, + "grad_norm": 1.2639469625366093, + "learning_rate": 1.1191649304534405e-05, + "loss": 0.2989, + "step": 6035 + }, + { + "epoch": 0.4781937017231135, + "grad_norm": 1.32882367930818, + "learning_rate": 1.1189101517922961e-05, + "loss": 0.2583, + "step": 6036 + }, + { + "epoch": 0.47827292533174887, + "grad_norm": 1.6894754003309616, + "learning_rate": 1.1186553653014906e-05, + "loss": 0.255, + "step": 6037 + }, + { + "epoch": 0.47835214894038425, + "grad_norm": 1.2684105548175266, + "learning_rate": 1.1184005709978002e-05, + "loss": 0.2497, + "step": 6038 + }, + { + "epoch": 0.47843137254901963, + "grad_norm": 1.2440034613844855, + "learning_rate": 1.118145768898002e-05, + "loss": 0.2192, + "step": 6039 + }, + { + "epoch": 0.47851059615765495, + "grad_norm": 2.242382660724379, + "learning_rate": 1.1178909590188731e-05, + "loss": 0.4079, + "step": 6040 + }, + { + "epoch": 0.47858981976629034, + "grad_norm": 1.7001611684196702, + "learning_rate": 1.117636141377192e-05, + "loss": 0.3439, + "step": 6041 + }, + { + "epoch": 0.4786690433749257, + "grad_norm": 1.3333455683643658, + "learning_rate": 1.117381315989737e-05, + "loss": 0.2382, + "step": 6042 + }, + { + "epoch": 0.4787482669835611, + "grad_norm": 1.6264075179649133, + "learning_rate": 1.117126482873287e-05, + "loss": 0.3446, + "step": 6043 + }, + { + "epoch": 0.4788274905921965, + "grad_norm": 1.4433284741071852, + "learning_rate": 1.1168716420446219e-05, + "loss": 0.278, + "step": 6044 + }, + { + "epoch": 0.47890671420083186, + "grad_norm": 1.187198190464409, + "learning_rate": 1.1166167935205214e-05, + "loss": 0.3087, + "step": 6045 + }, + { + "epoch": 0.47898593780946724, + "grad_norm": 1.4613588748416362, + "learning_rate": 1.1163619373177663e-05, + "loss": 0.3026, + "step": 6046 + }, + { + "epoch": 0.47906516141810257, + "grad_norm": 1.5677270430045716, + "learning_rate": 1.1161070734531375e-05, + "loss": 0.3013, + "step": 6047 + }, + { + "epoch": 0.47914438502673795, + "grad_norm": 1.4638961224695024, + "learning_rate": 1.1158522019434163e-05, + "loss": 0.3343, + "step": 6048 + }, + { + "epoch": 0.47922360863537333, + "grad_norm": 1.4432812664285235, + "learning_rate": 1.1155973228053854e-05, + "loss": 0.3026, + "step": 6049 + }, + { + "epoch": 0.4793028322440087, + "grad_norm": 1.7201391117878246, + "learning_rate": 1.1153424360558268e-05, + "loss": 0.3999, + "step": 6050 + }, + { + "epoch": 0.4793820558526441, + "grad_norm": 1.2652796050206054, + "learning_rate": 1.115087541711524e-05, + "loss": 0.3032, + "step": 6051 + }, + { + "epoch": 0.4794612794612795, + "grad_norm": 1.5879817665026315, + "learning_rate": 1.1148326397892601e-05, + "loss": 0.3051, + "step": 6052 + }, + { + "epoch": 0.47954050306991486, + "grad_norm": 1.334309076025932, + "learning_rate": 1.1145777303058197e-05, + "loss": 0.2718, + "step": 6053 + }, + { + "epoch": 0.4796197266785502, + "grad_norm": 1.1271325317580927, + "learning_rate": 1.1143228132779867e-05, + "loss": 0.1848, + "step": 6054 + }, + { + "epoch": 0.47969895028718557, + "grad_norm": 1.5355220001918997, + "learning_rate": 1.1140678887225468e-05, + "loss": 0.3171, + "step": 6055 + }, + { + "epoch": 0.47977817389582095, + "grad_norm": 1.351441638714954, + "learning_rate": 1.1138129566562853e-05, + "loss": 0.2739, + "step": 6056 + }, + { + "epoch": 0.47985739750445633, + "grad_norm": 1.3387486077673278, + "learning_rate": 1.1135580170959881e-05, + "loss": 0.2767, + "step": 6057 + }, + { + "epoch": 0.4799366211130917, + "grad_norm": 1.6932996645532263, + "learning_rate": 1.1133030700584419e-05, + "loss": 0.3285, + "step": 6058 + }, + { + "epoch": 0.4800158447217271, + "grad_norm": 1.6059731099901198, + "learning_rate": 1.1130481155604336e-05, + "loss": 0.2816, + "step": 6059 + }, + { + "epoch": 0.4800950683303625, + "grad_norm": 1.7226693778961917, + "learning_rate": 1.1127931536187511e-05, + "loss": 0.3246, + "step": 6060 + }, + { + "epoch": 0.4801742919389978, + "grad_norm": 1.360874599177222, + "learning_rate": 1.1125381842501819e-05, + "loss": 0.3091, + "step": 6061 + }, + { + "epoch": 0.4802535155476332, + "grad_norm": 1.6501154279475763, + "learning_rate": 1.1122832074715149e-05, + "loss": 0.2993, + "step": 6062 + }, + { + "epoch": 0.48033273915626856, + "grad_norm": 1.6870640200987324, + "learning_rate": 1.1120282232995389e-05, + "loss": 0.3337, + "step": 6063 + }, + { + "epoch": 0.48041196276490394, + "grad_norm": 1.6210418443737509, + "learning_rate": 1.1117732317510437e-05, + "loss": 0.3656, + "step": 6064 + }, + { + "epoch": 0.4804911863735393, + "grad_norm": 1.5093452687261766, + "learning_rate": 1.111518232842819e-05, + "loss": 0.3052, + "step": 6065 + }, + { + "epoch": 0.4805704099821747, + "grad_norm": 1.5609851300451982, + "learning_rate": 1.1112632265916548e-05, + "loss": 0.3695, + "step": 6066 + }, + { + "epoch": 0.4806496335908101, + "grad_norm": 1.5088324998048293, + "learning_rate": 1.1110082130143427e-05, + "loss": 0.2535, + "step": 6067 + }, + { + "epoch": 0.4807288571994454, + "grad_norm": 1.547690678961717, + "learning_rate": 1.1107531921276742e-05, + "loss": 0.2419, + "step": 6068 + }, + { + "epoch": 0.4808080808080808, + "grad_norm": 1.3274831775373486, + "learning_rate": 1.1104981639484404e-05, + "loss": 0.2472, + "step": 6069 + }, + { + "epoch": 0.4808873044167162, + "grad_norm": 1.4082663194143967, + "learning_rate": 1.1102431284934345e-05, + "loss": 0.2078, + "step": 6070 + }, + { + "epoch": 0.48096652802535156, + "grad_norm": 1.671828606916373, + "learning_rate": 1.1099880857794491e-05, + "loss": 0.3965, + "step": 6071 + }, + { + "epoch": 0.48104575163398694, + "grad_norm": 1.7348320874419227, + "learning_rate": 1.1097330358232775e-05, + "loss": 0.4225, + "step": 6072 + }, + { + "epoch": 0.4811249752426223, + "grad_norm": 1.2078015512494769, + "learning_rate": 1.1094779786417133e-05, + "loss": 0.2257, + "step": 6073 + }, + { + "epoch": 0.48120419885125765, + "grad_norm": 1.5601087144285826, + "learning_rate": 1.1092229142515512e-05, + "loss": 0.3128, + "step": 6074 + }, + { + "epoch": 0.48128342245989303, + "grad_norm": 1.5348638556364782, + "learning_rate": 1.1089678426695854e-05, + "loss": 0.2811, + "step": 6075 + }, + { + "epoch": 0.4813626460685284, + "grad_norm": 1.144681120404384, + "learning_rate": 1.1087127639126118e-05, + "loss": 0.2154, + "step": 6076 + }, + { + "epoch": 0.4814418696771638, + "grad_norm": 1.0768422041387595, + "learning_rate": 1.1084576779974257e-05, + "loss": 0.2095, + "step": 6077 + }, + { + "epoch": 0.4815210932857992, + "grad_norm": 1.294588525842855, + "learning_rate": 1.1082025849408231e-05, + "loss": 0.2461, + "step": 6078 + }, + { + "epoch": 0.48160031689443455, + "grad_norm": 1.6261272786961174, + "learning_rate": 1.1079474847596014e-05, + "loss": 0.3868, + "step": 6079 + }, + { + "epoch": 0.48167954050306994, + "grad_norm": 1.5340116649883544, + "learning_rate": 1.1076923774705568e-05, + "loss": 0.2396, + "step": 6080 + }, + { + "epoch": 0.48175876411170526, + "grad_norm": 1.390338022518972, + "learning_rate": 1.1074372630904878e-05, + "loss": 0.3044, + "step": 6081 + }, + { + "epoch": 0.48183798772034064, + "grad_norm": 1.2863309913843124, + "learning_rate": 1.1071821416361917e-05, + "loss": 0.2391, + "step": 6082 + }, + { + "epoch": 0.481917211328976, + "grad_norm": 1.1726082141951204, + "learning_rate": 1.106927013124467e-05, + "loss": 0.203, + "step": 6083 + }, + { + "epoch": 0.4819964349376114, + "grad_norm": 1.4276651602226915, + "learning_rate": 1.1066718775721135e-05, + "loss": 0.2449, + "step": 6084 + }, + { + "epoch": 0.4820756585462468, + "grad_norm": 1.3158419117919298, + "learning_rate": 1.1064167349959299e-05, + "loss": 0.2616, + "step": 6085 + }, + { + "epoch": 0.48215488215488217, + "grad_norm": 1.3031351146360042, + "learning_rate": 1.1061615854127165e-05, + "loss": 0.2501, + "step": 6086 + }, + { + "epoch": 0.48223410576351755, + "grad_norm": 1.47023789758246, + "learning_rate": 1.1059064288392733e-05, + "loss": 0.3226, + "step": 6087 + }, + { + "epoch": 0.4823133293721529, + "grad_norm": 1.411754131252147, + "learning_rate": 1.1056512652924014e-05, + "loss": 0.2434, + "step": 6088 + }, + { + "epoch": 0.48239255298078826, + "grad_norm": 1.347650328191249, + "learning_rate": 1.1053960947889021e-05, + "loss": 0.2648, + "step": 6089 + }, + { + "epoch": 0.48247177658942364, + "grad_norm": 1.2983872738821516, + "learning_rate": 1.1051409173455771e-05, + "loss": 0.2545, + "step": 6090 + }, + { + "epoch": 0.482551000198059, + "grad_norm": 1.365900899385686, + "learning_rate": 1.1048857329792284e-05, + "loss": 0.1888, + "step": 6091 + }, + { + "epoch": 0.4826302238066944, + "grad_norm": 1.9224272628383734, + "learning_rate": 1.1046305417066594e-05, + "loss": 0.3606, + "step": 6092 + }, + { + "epoch": 0.4827094474153298, + "grad_norm": 1.499434046670531, + "learning_rate": 1.1043753435446722e-05, + "loss": 0.4271, + "step": 6093 + }, + { + "epoch": 0.48278867102396517, + "grad_norm": 1.1968634204507425, + "learning_rate": 1.104120138510071e-05, + "loss": 0.2532, + "step": 6094 + }, + { + "epoch": 0.4828678946326005, + "grad_norm": 1.5236227774319238, + "learning_rate": 1.1038649266196597e-05, + "loss": 0.3163, + "step": 6095 + }, + { + "epoch": 0.4829471182412359, + "grad_norm": 1.4116947999482448, + "learning_rate": 1.1036097078902428e-05, + "loss": 0.3145, + "step": 6096 + }, + { + "epoch": 0.48302634184987125, + "grad_norm": 1.2596153432837602, + "learning_rate": 1.1033544823386248e-05, + "loss": 0.2994, + "step": 6097 + }, + { + "epoch": 0.48310556545850664, + "grad_norm": 1.289307072363218, + "learning_rate": 1.103099249981612e-05, + "loss": 0.2409, + "step": 6098 + }, + { + "epoch": 0.483184789067142, + "grad_norm": 1.544297408445771, + "learning_rate": 1.1028440108360092e-05, + "loss": 0.3131, + "step": 6099 + }, + { + "epoch": 0.4832640126757774, + "grad_norm": 1.557601485409439, + "learning_rate": 1.1025887649186236e-05, + "loss": 0.2814, + "step": 6100 + }, + { + "epoch": 0.4833432362844128, + "grad_norm": 1.4540441143302076, + "learning_rate": 1.1023335122462611e-05, + "loss": 0.2815, + "step": 6101 + }, + { + "epoch": 0.4834224598930481, + "grad_norm": 1.317005895710935, + "learning_rate": 1.102078252835729e-05, + "loss": 0.2596, + "step": 6102 + }, + { + "epoch": 0.4835016835016835, + "grad_norm": 1.535044273860009, + "learning_rate": 1.1018229867038358e-05, + "loss": 0.3207, + "step": 6103 + }, + { + "epoch": 0.48358090711031887, + "grad_norm": 1.4982955697882976, + "learning_rate": 1.1015677138673882e-05, + "loss": 0.2412, + "step": 6104 + }, + { + "epoch": 0.48366013071895425, + "grad_norm": 1.4052336110341974, + "learning_rate": 1.1013124343431955e-05, + "loss": 0.318, + "step": 6105 + }, + { + "epoch": 0.48373935432758963, + "grad_norm": 1.4741090632617713, + "learning_rate": 1.1010571481480668e-05, + "loss": 0.3629, + "step": 6106 + }, + { + "epoch": 0.483818577936225, + "grad_norm": 1.3490346128218675, + "learning_rate": 1.1008018552988109e-05, + "loss": 0.2351, + "step": 6107 + }, + { + "epoch": 0.4838978015448604, + "grad_norm": 1.36436398062383, + "learning_rate": 1.1005465558122382e-05, + "loss": 0.2847, + "step": 6108 + }, + { + "epoch": 0.4839770251534957, + "grad_norm": 1.272969101307285, + "learning_rate": 1.1002912497051582e-05, + "loss": 0.1808, + "step": 6109 + }, + { + "epoch": 0.4840562487621311, + "grad_norm": 1.3055478096477444, + "learning_rate": 1.1000359369943818e-05, + "loss": 0.2161, + "step": 6110 + }, + { + "epoch": 0.4841354723707665, + "grad_norm": 1.5306078048121168, + "learning_rate": 1.099780617696721e-05, + "loss": 0.3376, + "step": 6111 + }, + { + "epoch": 0.48421469597940187, + "grad_norm": 1.3730613627599653, + "learning_rate": 1.099525291828986e-05, + "loss": 0.2235, + "step": 6112 + }, + { + "epoch": 0.48429391958803725, + "grad_norm": 1.3100682501879188, + "learning_rate": 1.0992699594079896e-05, + "loss": 0.3177, + "step": 6113 + }, + { + "epoch": 0.48437314319667263, + "grad_norm": 1.7476161209058387, + "learning_rate": 1.0990146204505444e-05, + "loss": 0.3261, + "step": 6114 + }, + { + "epoch": 0.48445236680530795, + "grad_norm": 1.697437545080915, + "learning_rate": 1.0987592749734624e-05, + "loss": 0.2964, + "step": 6115 + }, + { + "epoch": 0.48453159041394334, + "grad_norm": 2.054339072774018, + "learning_rate": 1.0985039229935575e-05, + "loss": 0.3761, + "step": 6116 + }, + { + "epoch": 0.4846108140225787, + "grad_norm": 1.3524976967062285, + "learning_rate": 1.098248564527643e-05, + "loss": 0.2775, + "step": 6117 + }, + { + "epoch": 0.4846900376312141, + "grad_norm": 1.3286666958139592, + "learning_rate": 1.0979931995925335e-05, + "loss": 0.2147, + "step": 6118 + }, + { + "epoch": 0.4847692612398495, + "grad_norm": 1.629975469966293, + "learning_rate": 1.0977378282050436e-05, + "loss": 0.2756, + "step": 6119 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 1.5810534047348923, + "learning_rate": 1.0974824503819877e-05, + "loss": 0.3123, + "step": 6120 + }, + { + "epoch": 0.48492770845712024, + "grad_norm": 1.4893143668380495, + "learning_rate": 1.0972270661401812e-05, + "loss": 0.3211, + "step": 6121 + }, + { + "epoch": 0.48500693206575557, + "grad_norm": 1.7121285060124665, + "learning_rate": 1.0969716754964408e-05, + "loss": 0.2966, + "step": 6122 + }, + { + "epoch": 0.48508615567439095, + "grad_norm": 1.7242540541528943, + "learning_rate": 1.0967162784675818e-05, + "loss": 0.2644, + "step": 6123 + }, + { + "epoch": 0.48516537928302633, + "grad_norm": 1.9237001576761963, + "learning_rate": 1.0964608750704215e-05, + "loss": 0.2968, + "step": 6124 + }, + { + "epoch": 0.4852446028916617, + "grad_norm": 1.2791805749682155, + "learning_rate": 1.0962054653217764e-05, + "loss": 0.2776, + "step": 6125 + }, + { + "epoch": 0.4853238265002971, + "grad_norm": 1.3204379381013476, + "learning_rate": 1.0959500492384646e-05, + "loss": 0.2396, + "step": 6126 + }, + { + "epoch": 0.4854030501089325, + "grad_norm": 1.2136200820565095, + "learning_rate": 1.0956946268373034e-05, + "loss": 0.2379, + "step": 6127 + }, + { + "epoch": 0.48548227371756786, + "grad_norm": 1.1866725081229594, + "learning_rate": 1.0954391981351117e-05, + "loss": 0.2514, + "step": 6128 + }, + { + "epoch": 0.4855614973262032, + "grad_norm": 1.3051367107904754, + "learning_rate": 1.0951837631487081e-05, + "loss": 0.1947, + "step": 6129 + }, + { + "epoch": 0.48564072093483857, + "grad_norm": 1.4428799347708885, + "learning_rate": 1.0949283218949117e-05, + "loss": 0.4157, + "step": 6130 + }, + { + "epoch": 0.48571994454347395, + "grad_norm": 1.3938253661732678, + "learning_rate": 1.094672874390542e-05, + "loss": 0.3459, + "step": 6131 + }, + { + "epoch": 0.48579916815210933, + "grad_norm": 1.6269717457578443, + "learning_rate": 1.094417420652419e-05, + "loss": 0.3469, + "step": 6132 + }, + { + "epoch": 0.4858783917607447, + "grad_norm": 1.4581714390539897, + "learning_rate": 1.0941619606973633e-05, + "loss": 0.2911, + "step": 6133 + }, + { + "epoch": 0.4859576153693801, + "grad_norm": 1.1782084152370544, + "learning_rate": 1.0939064945421953e-05, + "loss": 0.263, + "step": 6134 + }, + { + "epoch": 0.4860368389780155, + "grad_norm": 1.4019701514320915, + "learning_rate": 1.0936510222037368e-05, + "loss": 0.2849, + "step": 6135 + }, + { + "epoch": 0.4861160625866508, + "grad_norm": 1.1700490384584799, + "learning_rate": 1.0933955436988088e-05, + "loss": 0.2313, + "step": 6136 + }, + { + "epoch": 0.4861952861952862, + "grad_norm": 1.7318887895354502, + "learning_rate": 1.0931400590442337e-05, + "loss": 0.32, + "step": 6137 + }, + { + "epoch": 0.48627450980392156, + "grad_norm": 1.609251339289543, + "learning_rate": 1.0928845682568344e-05, + "loss": 0.3662, + "step": 6138 + }, + { + "epoch": 0.48635373341255694, + "grad_norm": 1.361670973189962, + "learning_rate": 1.0926290713534324e-05, + "loss": 0.3193, + "step": 6139 + }, + { + "epoch": 0.4864329570211923, + "grad_norm": 1.8012673427868948, + "learning_rate": 1.0923735683508521e-05, + "loss": 0.2807, + "step": 6140 + }, + { + "epoch": 0.4865121806298277, + "grad_norm": 1.6193749438186245, + "learning_rate": 1.092118059265917e-05, + "loss": 0.3244, + "step": 6141 + }, + { + "epoch": 0.4865914042384631, + "grad_norm": 1.2210525094443065, + "learning_rate": 1.0918625441154508e-05, + "loss": 0.2621, + "step": 6142 + }, + { + "epoch": 0.4866706278470984, + "grad_norm": 1.6117566094763687, + "learning_rate": 1.091607022916278e-05, + "loss": 0.3951, + "step": 6143 + }, + { + "epoch": 0.4867498514557338, + "grad_norm": 1.1755785169079471, + "learning_rate": 1.0913514956852236e-05, + "loss": 0.2441, + "step": 6144 + }, + { + "epoch": 0.4868290750643692, + "grad_norm": 1.405229639012253, + "learning_rate": 1.0910959624391127e-05, + "loss": 0.2962, + "step": 6145 + }, + { + "epoch": 0.48690829867300456, + "grad_norm": 1.5968607903656973, + "learning_rate": 1.090840423194771e-05, + "loss": 0.3532, + "step": 6146 + }, + { + "epoch": 0.48698752228163994, + "grad_norm": 1.4864717981694215, + "learning_rate": 1.0905848779690246e-05, + "loss": 0.3387, + "step": 6147 + }, + { + "epoch": 0.4870667458902753, + "grad_norm": 1.4055737743461898, + "learning_rate": 1.0903293267786998e-05, + "loss": 0.2189, + "step": 6148 + }, + { + "epoch": 0.4871459694989107, + "grad_norm": 1.52242155947934, + "learning_rate": 1.0900737696406235e-05, + "loss": 0.2739, + "step": 6149 + }, + { + "epoch": 0.48722519310754603, + "grad_norm": 1.4062902494315441, + "learning_rate": 1.0898182065716227e-05, + "loss": 0.2452, + "step": 6150 + }, + { + "epoch": 0.4873044167161814, + "grad_norm": 1.467042791784936, + "learning_rate": 1.0895626375885255e-05, + "loss": 0.3041, + "step": 6151 + }, + { + "epoch": 0.4873836403248168, + "grad_norm": 1.2393769660727878, + "learning_rate": 1.0893070627081595e-05, + "loss": 0.1865, + "step": 6152 + }, + { + "epoch": 0.4874628639334522, + "grad_norm": 1.580462881695226, + "learning_rate": 1.089051481947353e-05, + "loss": 0.3253, + "step": 6153 + }, + { + "epoch": 0.48754208754208755, + "grad_norm": 1.4570043502367767, + "learning_rate": 1.0887958953229349e-05, + "loss": 0.2974, + "step": 6154 + }, + { + "epoch": 0.48762131115072294, + "grad_norm": 1.1194671289501286, + "learning_rate": 1.0885403028517345e-05, + "loss": 0.1863, + "step": 6155 + }, + { + "epoch": 0.48770053475935826, + "grad_norm": 1.53081016103416, + "learning_rate": 1.0882847045505809e-05, + "loss": 0.3968, + "step": 6156 + }, + { + "epoch": 0.48777975836799364, + "grad_norm": 1.491539542669247, + "learning_rate": 1.0880291004363047e-05, + "loss": 0.3016, + "step": 6157 + }, + { + "epoch": 0.487858981976629, + "grad_norm": 1.2611339462723674, + "learning_rate": 1.0877734905257354e-05, + "loss": 0.254, + "step": 6158 + }, + { + "epoch": 0.4879382055852644, + "grad_norm": 1.1961698704125379, + "learning_rate": 1.0875178748357045e-05, + "loss": 0.1887, + "step": 6159 + }, + { + "epoch": 0.4880174291938998, + "grad_norm": 1.3721810304073363, + "learning_rate": 1.0872622533830423e-05, + "loss": 0.285, + "step": 6160 + }, + { + "epoch": 0.48809665280253517, + "grad_norm": 1.567837652185398, + "learning_rate": 1.0870066261845807e-05, + "loss": 0.2954, + "step": 6161 + }, + { + "epoch": 0.48817587641117055, + "grad_norm": 1.233238588712617, + "learning_rate": 1.0867509932571517e-05, + "loss": 0.2553, + "step": 6162 + }, + { + "epoch": 0.4882551000198059, + "grad_norm": 1.397418530126188, + "learning_rate": 1.0864953546175867e-05, + "loss": 0.2703, + "step": 6163 + }, + { + "epoch": 0.48833432362844126, + "grad_norm": 1.5610715288104426, + "learning_rate": 1.0862397102827189e-05, + "loss": 0.384, + "step": 6164 + }, + { + "epoch": 0.48841354723707664, + "grad_norm": 1.245660498200154, + "learning_rate": 1.0859840602693813e-05, + "loss": 0.2671, + "step": 6165 + }, + { + "epoch": 0.488492770845712, + "grad_norm": 1.3947092697329637, + "learning_rate": 1.0857284045944071e-05, + "loss": 0.2203, + "step": 6166 + }, + { + "epoch": 0.4885719944543474, + "grad_norm": 1.218362053557024, + "learning_rate": 1.0854727432746302e-05, + "loss": 0.2852, + "step": 6167 + }, + { + "epoch": 0.4886512180629828, + "grad_norm": 1.341351718036167, + "learning_rate": 1.0852170763268838e-05, + "loss": 0.189, + "step": 6168 + }, + { + "epoch": 0.48873044167161817, + "grad_norm": 1.5884808936970678, + "learning_rate": 1.0849614037680032e-05, + "loss": 0.3538, + "step": 6169 + }, + { + "epoch": 0.4888096652802535, + "grad_norm": 1.3849156013508366, + "learning_rate": 1.0847057256148234e-05, + "loss": 0.2517, + "step": 6170 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 1.335160074346344, + "learning_rate": 1.0844500418841788e-05, + "loss": 0.2567, + "step": 6171 + }, + { + "epoch": 0.48896811249752425, + "grad_norm": 1.9884280331886677, + "learning_rate": 1.0841943525929053e-05, + "loss": 0.387, + "step": 6172 + }, + { + "epoch": 0.48904733610615964, + "grad_norm": 1.4552717666439374, + "learning_rate": 1.0839386577578389e-05, + "loss": 0.3008, + "step": 6173 + }, + { + "epoch": 0.489126559714795, + "grad_norm": 1.3772706742532324, + "learning_rate": 1.0836829573958155e-05, + "loss": 0.2517, + "step": 6174 + }, + { + "epoch": 0.4892057833234304, + "grad_norm": 1.4315596710202125, + "learning_rate": 1.083427251523672e-05, + "loss": 0.3322, + "step": 6175 + }, + { + "epoch": 0.4892850069320658, + "grad_norm": 1.2146382197625711, + "learning_rate": 1.0831715401582458e-05, + "loss": 0.2601, + "step": 6176 + }, + { + "epoch": 0.4893642305407011, + "grad_norm": 1.4693611008861538, + "learning_rate": 1.0829158233163737e-05, + "loss": 0.2508, + "step": 6177 + }, + { + "epoch": 0.4894434541493365, + "grad_norm": 1.501286271071185, + "learning_rate": 1.0826601010148935e-05, + "loss": 0.4254, + "step": 6178 + }, + { + "epoch": 0.48952267775797187, + "grad_norm": 1.5815926457007046, + "learning_rate": 1.0824043732706435e-05, + "loss": 0.347, + "step": 6179 + }, + { + "epoch": 0.48960190136660725, + "grad_norm": 1.5415088564225938, + "learning_rate": 1.0821486401004618e-05, + "loss": 0.3386, + "step": 6180 + }, + { + "epoch": 0.48968112497524263, + "grad_norm": 1.4474455234323833, + "learning_rate": 1.0818929015211877e-05, + "loss": 0.2656, + "step": 6181 + }, + { + "epoch": 0.489760348583878, + "grad_norm": 1.4060165907615751, + "learning_rate": 1.0816371575496598e-05, + "loss": 0.2271, + "step": 6182 + }, + { + "epoch": 0.4898395721925134, + "grad_norm": 1.3963858824595528, + "learning_rate": 1.081381408202718e-05, + "loss": 0.2914, + "step": 6183 + }, + { + "epoch": 0.4899187958011487, + "grad_norm": 1.627029400041405, + "learning_rate": 1.0811256534972024e-05, + "loss": 0.3751, + "step": 6184 + }, + { + "epoch": 0.4899980194097841, + "grad_norm": 1.2250895246335802, + "learning_rate": 1.0808698934499524e-05, + "loss": 0.2247, + "step": 6185 + }, + { + "epoch": 0.4900772430184195, + "grad_norm": 1.3257314242280365, + "learning_rate": 1.0806141280778093e-05, + "loss": 0.2859, + "step": 6186 + }, + { + "epoch": 0.49015646662705487, + "grad_norm": 1.1850919805751492, + "learning_rate": 1.0803583573976137e-05, + "loss": 0.2401, + "step": 6187 + }, + { + "epoch": 0.49023569023569025, + "grad_norm": 1.1107332607084066, + "learning_rate": 1.0801025814262068e-05, + "loss": 0.2204, + "step": 6188 + }, + { + "epoch": 0.49031491384432563, + "grad_norm": 1.5225029620487145, + "learning_rate": 1.0798468001804305e-05, + "loss": 0.3076, + "step": 6189 + }, + { + "epoch": 0.490394137452961, + "grad_norm": 1.4702612479048018, + "learning_rate": 1.0795910136771266e-05, + "loss": 0.3108, + "step": 6190 + }, + { + "epoch": 0.49047336106159634, + "grad_norm": 1.4772785731823188, + "learning_rate": 1.0793352219331371e-05, + "loss": 0.3196, + "step": 6191 + }, + { + "epoch": 0.4905525846702317, + "grad_norm": 1.536878832032189, + "learning_rate": 1.0790794249653056e-05, + "loss": 0.3255, + "step": 6192 + }, + { + "epoch": 0.4906318082788671, + "grad_norm": 1.73651753305478, + "learning_rate": 1.0788236227904738e-05, + "loss": 0.2982, + "step": 6193 + }, + { + "epoch": 0.4907110318875025, + "grad_norm": 1.1649184716370518, + "learning_rate": 1.0785678154254865e-05, + "loss": 0.2053, + "step": 6194 + }, + { + "epoch": 0.49079025549613786, + "grad_norm": 1.188098789055214, + "learning_rate": 1.0783120028871858e-05, + "loss": 0.2314, + "step": 6195 + }, + { + "epoch": 0.49086947910477324, + "grad_norm": 1.3450706436396398, + "learning_rate": 1.0780561851924168e-05, + "loss": 0.2233, + "step": 6196 + }, + { + "epoch": 0.49094870271340857, + "grad_norm": 1.784045083633948, + "learning_rate": 1.0778003623580237e-05, + "loss": 0.3568, + "step": 6197 + }, + { + "epoch": 0.49102792632204395, + "grad_norm": 1.554802606764818, + "learning_rate": 1.077544534400851e-05, + "loss": 0.3411, + "step": 6198 + }, + { + "epoch": 0.49110714993067933, + "grad_norm": 1.6058798273090809, + "learning_rate": 1.0772887013377438e-05, + "loss": 0.416, + "step": 6199 + }, + { + "epoch": 0.4911863735393147, + "grad_norm": 1.5391679701428511, + "learning_rate": 1.0770328631855476e-05, + "loss": 0.3855, + "step": 6200 + }, + { + "epoch": 0.4912655971479501, + "grad_norm": 1.5378938980102377, + "learning_rate": 1.0767770199611078e-05, + "loss": 0.2856, + "step": 6201 + }, + { + "epoch": 0.4913448207565855, + "grad_norm": 1.166805596897456, + "learning_rate": 1.076521171681271e-05, + "loss": 0.2466, + "step": 6202 + }, + { + "epoch": 0.49142404436522086, + "grad_norm": 1.536547216923099, + "learning_rate": 1.0762653183628831e-05, + "loss": 0.2851, + "step": 6203 + }, + { + "epoch": 0.4915032679738562, + "grad_norm": 1.4223236654328237, + "learning_rate": 1.0760094600227908e-05, + "loss": 0.3708, + "step": 6204 + }, + { + "epoch": 0.49158249158249157, + "grad_norm": 1.3718094513279078, + "learning_rate": 1.0757535966778416e-05, + "loss": 0.2244, + "step": 6205 + }, + { + "epoch": 0.49166171519112695, + "grad_norm": 1.5640241098868874, + "learning_rate": 1.0754977283448824e-05, + "loss": 0.258, + "step": 6206 + }, + { + "epoch": 0.49174093879976233, + "grad_norm": 1.6900784168741398, + "learning_rate": 1.0752418550407611e-05, + "loss": 0.4154, + "step": 6207 + }, + { + "epoch": 0.4918201624083977, + "grad_norm": 1.7994718856508958, + "learning_rate": 1.0749859767823256e-05, + "loss": 0.384, + "step": 6208 + }, + { + "epoch": 0.4918993860170331, + "grad_norm": 1.4717917761484856, + "learning_rate": 1.0747300935864245e-05, + "loss": 0.3591, + "step": 6209 + }, + { + "epoch": 0.4919786096256685, + "grad_norm": 1.3383497822924593, + "learning_rate": 1.074474205469906e-05, + "loss": 0.2885, + "step": 6210 + }, + { + "epoch": 0.4920578332343038, + "grad_norm": 1.6496577144297149, + "learning_rate": 1.0742183124496197e-05, + "loss": 0.387, + "step": 6211 + }, + { + "epoch": 0.4921370568429392, + "grad_norm": 1.3466125167078744, + "learning_rate": 1.0739624145424146e-05, + "loss": 0.3039, + "step": 6212 + }, + { + "epoch": 0.49221628045157456, + "grad_norm": 1.5160308645798655, + "learning_rate": 1.0737065117651404e-05, + "loss": 0.2912, + "step": 6213 + }, + { + "epoch": 0.49229550406020994, + "grad_norm": 1.526127365587252, + "learning_rate": 1.0734506041346468e-05, + "loss": 0.294, + "step": 6214 + }, + { + "epoch": 0.4923747276688453, + "grad_norm": 1.5005855609234349, + "learning_rate": 1.0731946916677847e-05, + "loss": 0.3837, + "step": 6215 + }, + { + "epoch": 0.4924539512774807, + "grad_norm": 3.7392082045902666, + "learning_rate": 1.0729387743814041e-05, + "loss": 0.2066, + "step": 6216 + }, + { + "epoch": 0.4925331748861161, + "grad_norm": 1.372508750294774, + "learning_rate": 1.0726828522923563e-05, + "loss": 0.2848, + "step": 6217 + }, + { + "epoch": 0.4926123984947514, + "grad_norm": 1.3999552651359242, + "learning_rate": 1.0724269254174921e-05, + "loss": 0.2922, + "step": 6218 + }, + { + "epoch": 0.4926916221033868, + "grad_norm": 1.4554709891905275, + "learning_rate": 1.0721709937736638e-05, + "loss": 0.357, + "step": 6219 + }, + { + "epoch": 0.4927708457120222, + "grad_norm": 1.9487352301395504, + "learning_rate": 1.0719150573777226e-05, + "loss": 0.2293, + "step": 6220 + }, + { + "epoch": 0.49285006932065756, + "grad_norm": 1.613084113343599, + "learning_rate": 1.071659116246521e-05, + "loss": 0.2746, + "step": 6221 + }, + { + "epoch": 0.49292929292929294, + "grad_norm": 1.6120754882680997, + "learning_rate": 1.0714031703969112e-05, + "loss": 0.2751, + "step": 6222 + }, + { + "epoch": 0.4930085165379283, + "grad_norm": 1.4454114067831845, + "learning_rate": 1.0711472198457462e-05, + "loss": 0.3271, + "step": 6223 + }, + { + "epoch": 0.4930877401465637, + "grad_norm": 2.0995044285519193, + "learning_rate": 1.0708912646098795e-05, + "loss": 0.3394, + "step": 6224 + }, + { + "epoch": 0.49316696375519903, + "grad_norm": 1.4956951299165366, + "learning_rate": 1.0706353047061638e-05, + "loss": 0.2947, + "step": 6225 + }, + { + "epoch": 0.4932461873638344, + "grad_norm": 1.3049652912090355, + "learning_rate": 1.070379340151453e-05, + "loss": 0.2738, + "step": 6226 + }, + { + "epoch": 0.4933254109724698, + "grad_norm": 1.1146566156178852, + "learning_rate": 1.0701233709626018e-05, + "loss": 0.2278, + "step": 6227 + }, + { + "epoch": 0.4934046345811052, + "grad_norm": 1.3275771711305728, + "learning_rate": 1.0698673971564637e-05, + "loss": 0.2079, + "step": 6228 + }, + { + "epoch": 0.49348385818974055, + "grad_norm": 1.5243329371612238, + "learning_rate": 1.0696114187498938e-05, + "loss": 0.3093, + "step": 6229 + }, + { + "epoch": 0.49356308179837594, + "grad_norm": 1.1577371866744388, + "learning_rate": 1.0693554357597469e-05, + "loss": 0.244, + "step": 6230 + }, + { + "epoch": 0.4936423054070113, + "grad_norm": 1.5597814402026162, + "learning_rate": 1.069099448202878e-05, + "loss": 0.2221, + "step": 6231 + }, + { + "epoch": 0.49372152901564664, + "grad_norm": 1.7503557147364344, + "learning_rate": 1.0688434560961434e-05, + "loss": 0.3088, + "step": 6232 + }, + { + "epoch": 0.493800752624282, + "grad_norm": 1.4601903226428612, + "learning_rate": 1.068587459456398e-05, + "loss": 0.2771, + "step": 6233 + }, + { + "epoch": 0.4938799762329174, + "grad_norm": 1.1382601182910943, + "learning_rate": 1.0683314583004986e-05, + "loss": 0.2491, + "step": 6234 + }, + { + "epoch": 0.4939591998415528, + "grad_norm": 1.5727566425849564, + "learning_rate": 1.0680754526453017e-05, + "loss": 0.2989, + "step": 6235 + }, + { + "epoch": 0.49403842345018817, + "grad_norm": 1.762614677251262, + "learning_rate": 1.0678194425076633e-05, + "loss": 0.4067, + "step": 6236 + }, + { + "epoch": 0.49411764705882355, + "grad_norm": 1.2959135276725782, + "learning_rate": 1.0675634279044416e-05, + "loss": 0.2455, + "step": 6237 + }, + { + "epoch": 0.4941968706674589, + "grad_norm": 1.4073370915886836, + "learning_rate": 1.0673074088524926e-05, + "loss": 0.2874, + "step": 6238 + }, + { + "epoch": 0.49427609427609426, + "grad_norm": 1.3388927040782768, + "learning_rate": 1.067051385368675e-05, + "loss": 0.256, + "step": 6239 + }, + { + "epoch": 0.49435531788472964, + "grad_norm": 1.818760707827175, + "learning_rate": 1.0667953574698461e-05, + "loss": 0.2888, + "step": 6240 + }, + { + "epoch": 0.494434541493365, + "grad_norm": 1.1949986080402983, + "learning_rate": 1.0665393251728645e-05, + "loss": 0.1809, + "step": 6241 + }, + { + "epoch": 0.4945137651020004, + "grad_norm": 1.0965629093802949, + "learning_rate": 1.0662832884945884e-05, + "loss": 0.2424, + "step": 6242 + }, + { + "epoch": 0.4945929887106358, + "grad_norm": 1.2472935835416856, + "learning_rate": 1.0660272474518767e-05, + "loss": 0.2297, + "step": 6243 + }, + { + "epoch": 0.49467221231927117, + "grad_norm": 1.5464503337392086, + "learning_rate": 1.0657712020615885e-05, + "loss": 0.2786, + "step": 6244 + }, + { + "epoch": 0.4947514359279065, + "grad_norm": 1.4992610845825791, + "learning_rate": 1.0655151523405831e-05, + "loss": 0.3562, + "step": 6245 + }, + { + "epoch": 0.4948306595365419, + "grad_norm": 1.245430373819781, + "learning_rate": 1.06525909830572e-05, + "loss": 0.2233, + "step": 6246 + }, + { + "epoch": 0.49490988314517725, + "grad_norm": 1.4250430094631756, + "learning_rate": 1.0650030399738594e-05, + "loss": 0.2508, + "step": 6247 + }, + { + "epoch": 0.49498910675381264, + "grad_norm": 1.4143829630506732, + "learning_rate": 1.0647469773618617e-05, + "loss": 0.2001, + "step": 6248 + }, + { + "epoch": 0.495068330362448, + "grad_norm": 1.3857901342762828, + "learning_rate": 1.0644909104865869e-05, + "loss": 0.2782, + "step": 6249 + }, + { + "epoch": 0.4951475539710834, + "grad_norm": 1.616204388193114, + "learning_rate": 1.0642348393648956e-05, + "loss": 0.3346, + "step": 6250 + }, + { + "epoch": 0.4952267775797188, + "grad_norm": 1.359732965277956, + "learning_rate": 1.0639787640136497e-05, + "loss": 0.2634, + "step": 6251 + }, + { + "epoch": 0.4953060011883541, + "grad_norm": 1.4073442282474358, + "learning_rate": 1.0637226844497096e-05, + "loss": 0.2656, + "step": 6252 + }, + { + "epoch": 0.4953852247969895, + "grad_norm": 1.435274302556513, + "learning_rate": 1.0634666006899375e-05, + "loss": 0.2603, + "step": 6253 + }, + { + "epoch": 0.49546444840562487, + "grad_norm": 1.3454932940119206, + "learning_rate": 1.0632105127511952e-05, + "loss": 0.2368, + "step": 6254 + }, + { + "epoch": 0.49554367201426025, + "grad_norm": 1.4679275124837228, + "learning_rate": 1.0629544206503445e-05, + "loss": 0.3462, + "step": 6255 + }, + { + "epoch": 0.49562289562289563, + "grad_norm": 1.2267471935575582, + "learning_rate": 1.0626983244042486e-05, + "loss": 0.2273, + "step": 6256 + }, + { + "epoch": 0.495702119231531, + "grad_norm": 1.541427155453795, + "learning_rate": 1.0624422240297694e-05, + "loss": 0.3725, + "step": 6257 + }, + { + "epoch": 0.4957813428401664, + "grad_norm": 1.7333655301296014, + "learning_rate": 1.0621861195437703e-05, + "loss": 0.304, + "step": 6258 + }, + { + "epoch": 0.4958605664488017, + "grad_norm": 1.3265310992758543, + "learning_rate": 1.0619300109631146e-05, + "loss": 0.2628, + "step": 6259 + }, + { + "epoch": 0.4959397900574371, + "grad_norm": 1.6433740813037165, + "learning_rate": 1.0616738983046652e-05, + "loss": 0.3621, + "step": 6260 + }, + { + "epoch": 0.4960190136660725, + "grad_norm": 1.5750802900338863, + "learning_rate": 1.0614177815852866e-05, + "loss": 0.3248, + "step": 6261 + }, + { + "epoch": 0.49609823727470787, + "grad_norm": 1.5121960093776905, + "learning_rate": 1.0611616608218429e-05, + "loss": 0.3265, + "step": 6262 + }, + { + "epoch": 0.49617746088334325, + "grad_norm": 1.7768865384174732, + "learning_rate": 1.0609055360311978e-05, + "loss": 0.3581, + "step": 6263 + }, + { + "epoch": 0.49625668449197863, + "grad_norm": 1.2474729755742784, + "learning_rate": 1.0606494072302164e-05, + "loss": 0.2448, + "step": 6264 + }, + { + "epoch": 0.496335908100614, + "grad_norm": 1.2985944960321019, + "learning_rate": 1.0603932744357632e-05, + "loss": 0.3004, + "step": 6265 + }, + { + "epoch": 0.49641513170924934, + "grad_norm": 1.2504828116047193, + "learning_rate": 1.0601371376647034e-05, + "loss": 0.3334, + "step": 6266 + }, + { + "epoch": 0.4964943553178847, + "grad_norm": 1.7673045482375467, + "learning_rate": 1.0598809969339028e-05, + "loss": 0.4251, + "step": 6267 + }, + { + "epoch": 0.4965735789265201, + "grad_norm": 1.4300543141766955, + "learning_rate": 1.0596248522602264e-05, + "loss": 0.3234, + "step": 6268 + }, + { + "epoch": 0.4966528025351555, + "grad_norm": 1.168006961050211, + "learning_rate": 1.0593687036605402e-05, + "loss": 0.2139, + "step": 6269 + }, + { + "epoch": 0.49673202614379086, + "grad_norm": 1.3531644655552078, + "learning_rate": 1.0591125511517108e-05, + "loss": 0.2416, + "step": 6270 + }, + { + "epoch": 0.49681124975242624, + "grad_norm": 1.4595553564355188, + "learning_rate": 1.0588563947506043e-05, + "loss": 0.3178, + "step": 6271 + }, + { + "epoch": 0.49689047336106157, + "grad_norm": 1.2664669002686002, + "learning_rate": 1.0586002344740875e-05, + "loss": 0.2647, + "step": 6272 + }, + { + "epoch": 0.49696969696969695, + "grad_norm": 1.544365897153881, + "learning_rate": 1.0583440703390271e-05, + "loss": 0.3532, + "step": 6273 + }, + { + "epoch": 0.49704892057833233, + "grad_norm": 1.600690620995663, + "learning_rate": 1.0580879023622903e-05, + "loss": 0.2683, + "step": 6274 + }, + { + "epoch": 0.4971281441869677, + "grad_norm": 1.2714748179550441, + "learning_rate": 1.0578317305607451e-05, + "loss": 0.2867, + "step": 6275 + }, + { + "epoch": 0.4972073677956031, + "grad_norm": 1.4301270003448296, + "learning_rate": 1.057575554951258e-05, + "loss": 0.2947, + "step": 6276 + }, + { + "epoch": 0.4972865914042385, + "grad_norm": 1.3371345246476254, + "learning_rate": 1.0573193755506982e-05, + "loss": 0.2499, + "step": 6277 + }, + { + "epoch": 0.49736581501287386, + "grad_norm": 1.2749612202368028, + "learning_rate": 1.0570631923759331e-05, + "loss": 0.2905, + "step": 6278 + }, + { + "epoch": 0.4974450386215092, + "grad_norm": 1.5034597442355828, + "learning_rate": 1.0568070054438314e-05, + "loss": 0.2907, + "step": 6279 + }, + { + "epoch": 0.49752426223014456, + "grad_norm": 1.3295244689756287, + "learning_rate": 1.0565508147712618e-05, + "loss": 0.3003, + "step": 6280 + }, + { + "epoch": 0.49760348583877995, + "grad_norm": 1.518507654758298, + "learning_rate": 1.056294620375093e-05, + "loss": 0.3952, + "step": 6281 + }, + { + "epoch": 0.49768270944741533, + "grad_norm": 1.9016861623652315, + "learning_rate": 1.0560384222721943e-05, + "loss": 0.3065, + "step": 6282 + }, + { + "epoch": 0.4977619330560507, + "grad_norm": 1.510930296790475, + "learning_rate": 1.0557822204794353e-05, + "loss": 0.3301, + "step": 6283 + }, + { + "epoch": 0.4978411566646861, + "grad_norm": 1.2444624449144386, + "learning_rate": 1.0555260150136852e-05, + "loss": 0.2394, + "step": 6284 + }, + { + "epoch": 0.4979203802733215, + "grad_norm": 1.2715390656264163, + "learning_rate": 1.0552698058918146e-05, + "loss": 0.1841, + "step": 6285 + }, + { + "epoch": 0.4979996038819568, + "grad_norm": 1.527953083438856, + "learning_rate": 1.055013593130693e-05, + "loss": 0.2792, + "step": 6286 + }, + { + "epoch": 0.4980788274905922, + "grad_norm": 1.5205960747348843, + "learning_rate": 1.0547573767471913e-05, + "loss": 0.278, + "step": 6287 + }, + { + "epoch": 0.49815805109922756, + "grad_norm": 1.1759475177062586, + "learning_rate": 1.0545011567581794e-05, + "loss": 0.2069, + "step": 6288 + }, + { + "epoch": 0.49823727470786294, + "grad_norm": 1.5542197874672103, + "learning_rate": 1.0542449331805287e-05, + "loss": 0.2755, + "step": 6289 + }, + { + "epoch": 0.4983164983164983, + "grad_norm": 1.6735682879426799, + "learning_rate": 1.05398870603111e-05, + "loss": 0.3069, + "step": 6290 + }, + { + "epoch": 0.4983957219251337, + "grad_norm": 1.1616981710431828, + "learning_rate": 1.0537324753267952e-05, + "loss": 0.2068, + "step": 6291 + }, + { + "epoch": 0.4984749455337691, + "grad_norm": 1.3631482937227253, + "learning_rate": 1.053476241084455e-05, + "loss": 0.2849, + "step": 6292 + }, + { + "epoch": 0.4985541691424044, + "grad_norm": 1.2351585466231974, + "learning_rate": 1.0532200033209618e-05, + "loss": 0.2237, + "step": 6293 + }, + { + "epoch": 0.4986333927510398, + "grad_norm": 1.2898599967755477, + "learning_rate": 1.0529637620531876e-05, + "loss": 0.2411, + "step": 6294 + }, + { + "epoch": 0.4987126163596752, + "grad_norm": 1.5160490050774054, + "learning_rate": 1.0527075172980043e-05, + "loss": 0.292, + "step": 6295 + }, + { + "epoch": 0.49879183996831056, + "grad_norm": 1.664701938006669, + "learning_rate": 1.0524512690722848e-05, + "loss": 0.4192, + "step": 6296 + }, + { + "epoch": 0.49887106357694594, + "grad_norm": 1.7257417530026737, + "learning_rate": 1.0521950173929017e-05, + "loss": 0.4208, + "step": 6297 + }, + { + "epoch": 0.4989502871855813, + "grad_norm": 1.3613911180602976, + "learning_rate": 1.0519387622767274e-05, + "loss": 0.2872, + "step": 6298 + }, + { + "epoch": 0.4990295107942167, + "grad_norm": 1.2873052402710918, + "learning_rate": 1.051682503740636e-05, + "loss": 0.2803, + "step": 6299 + }, + { + "epoch": 0.49910873440285203, + "grad_norm": 1.4843627688896437, + "learning_rate": 1.0514262418015e-05, + "loss": 0.3811, + "step": 6300 + }, + { + "epoch": 0.4991879580114874, + "grad_norm": 1.3470545590179122, + "learning_rate": 1.0511699764761935e-05, + "loss": 0.2975, + "step": 6301 + }, + { + "epoch": 0.4992671816201228, + "grad_norm": 1.6528728456851278, + "learning_rate": 1.0509137077815906e-05, + "loss": 0.3831, + "step": 6302 + }, + { + "epoch": 0.4993464052287582, + "grad_norm": 1.4427697117228835, + "learning_rate": 1.0506574357345647e-05, + "loss": 0.3384, + "step": 6303 + }, + { + "epoch": 0.49942562883739355, + "grad_norm": 0.9572009528886708, + "learning_rate": 1.0504011603519904e-05, + "loss": 0.1484, + "step": 6304 + }, + { + "epoch": 0.49950485244602894, + "grad_norm": 1.784759294359079, + "learning_rate": 1.0501448816507425e-05, + "loss": 0.2918, + "step": 6305 + }, + { + "epoch": 0.4995840760546643, + "grad_norm": 1.315874661285899, + "learning_rate": 1.0498885996476952e-05, + "loss": 0.2748, + "step": 6306 + }, + { + "epoch": 0.49966329966329964, + "grad_norm": 1.803346168407661, + "learning_rate": 1.0496323143597237e-05, + "loss": 0.3221, + "step": 6307 + }, + { + "epoch": 0.499742523271935, + "grad_norm": 1.3579686590364919, + "learning_rate": 1.049376025803703e-05, + "loss": 0.2781, + "step": 6308 + }, + { + "epoch": 0.4998217468805704, + "grad_norm": 1.3139149388195164, + "learning_rate": 1.0491197339965087e-05, + "loss": 0.2832, + "step": 6309 + }, + { + "epoch": 0.4999009704892058, + "grad_norm": 1.2048091184539846, + "learning_rate": 1.0488634389550166e-05, + "loss": 0.2217, + "step": 6310 + }, + { + "epoch": 0.49998019409784117, + "grad_norm": 1.3287662211919806, + "learning_rate": 1.0486071406961017e-05, + "loss": 0.238, + "step": 6311 + }, + { + "epoch": 0.5000594177064765, + "grad_norm": 1.4266026307722997, + "learning_rate": 1.0483508392366404e-05, + "loss": 0.3412, + "step": 6312 + }, + { + "epoch": 0.5001386413151119, + "grad_norm": 1.5612220547135882, + "learning_rate": 1.0480945345935094e-05, + "loss": 0.3407, + "step": 6313 + }, + { + "epoch": 0.5002178649237473, + "grad_norm": 1.5972355706250965, + "learning_rate": 1.0478382267835843e-05, + "loss": 0.3087, + "step": 6314 + }, + { + "epoch": 0.5002970885323826, + "grad_norm": 1.4969821150311746, + "learning_rate": 1.0475819158237426e-05, + "loss": 0.3352, + "step": 6315 + }, + { + "epoch": 0.5003763121410181, + "grad_norm": 1.5942406862924126, + "learning_rate": 1.0473256017308601e-05, + "loss": 0.3134, + "step": 6316 + }, + { + "epoch": 0.5004555357496534, + "grad_norm": 1.3503875305968052, + "learning_rate": 1.047069284521815e-05, + "loss": 0.264, + "step": 6317 + }, + { + "epoch": 0.5005347593582887, + "grad_norm": 1.3902456508809122, + "learning_rate": 1.0468129642134837e-05, + "loss": 0.3021, + "step": 6318 + }, + { + "epoch": 0.5006139829669242, + "grad_norm": 1.74825180207307, + "learning_rate": 1.046556640822744e-05, + "loss": 0.3069, + "step": 6319 + }, + { + "epoch": 0.5006932065755595, + "grad_norm": 1.1880302661403606, + "learning_rate": 1.0463003143664734e-05, + "loss": 0.2653, + "step": 6320 + }, + { + "epoch": 0.5007724301841949, + "grad_norm": 1.476139350780901, + "learning_rate": 1.0460439848615502e-05, + "loss": 0.2875, + "step": 6321 + }, + { + "epoch": 0.5008516537928303, + "grad_norm": 1.5435811135936066, + "learning_rate": 1.0457876523248518e-05, + "loss": 0.1652, + "step": 6322 + }, + { + "epoch": 0.5009308774014657, + "grad_norm": 1.4391808932957582, + "learning_rate": 1.0455313167732573e-05, + "loss": 0.3389, + "step": 6323 + }, + { + "epoch": 0.501010101010101, + "grad_norm": 1.2931238020920421, + "learning_rate": 1.0452749782236443e-05, + "loss": 0.2588, + "step": 6324 + }, + { + "epoch": 0.5010893246187363, + "grad_norm": 1.3202406215263744, + "learning_rate": 1.0450186366928917e-05, + "loss": 0.2223, + "step": 6325 + }, + { + "epoch": 0.5011685482273718, + "grad_norm": 1.2655135659324483, + "learning_rate": 1.044762292197879e-05, + "loss": 0.2495, + "step": 6326 + }, + { + "epoch": 0.5012477718360071, + "grad_norm": 1.4301511529458228, + "learning_rate": 1.0445059447554844e-05, + "loss": 0.3412, + "step": 6327 + }, + { + "epoch": 0.5013269954446425, + "grad_norm": 1.4949349713764832, + "learning_rate": 1.0442495943825874e-05, + "loss": 0.3055, + "step": 6328 + }, + { + "epoch": 0.5014062190532779, + "grad_norm": 1.287338881075157, + "learning_rate": 1.0439932410960678e-05, + "loss": 0.3185, + "step": 6329 + }, + { + "epoch": 0.5014854426619133, + "grad_norm": 1.3154378514895597, + "learning_rate": 1.0437368849128046e-05, + "loss": 0.2072, + "step": 6330 + }, + { + "epoch": 0.5015646662705486, + "grad_norm": 1.4571953480781072, + "learning_rate": 1.043480525849678e-05, + "loss": 0.2244, + "step": 6331 + }, + { + "epoch": 0.501643889879184, + "grad_norm": 1.3076757025975771, + "learning_rate": 1.0432241639235686e-05, + "loss": 0.2991, + "step": 6332 + }, + { + "epoch": 0.5017231134878194, + "grad_norm": 1.324348010807515, + "learning_rate": 1.0429677991513554e-05, + "loss": 0.2188, + "step": 6333 + }, + { + "epoch": 0.5018023370964547, + "grad_norm": 1.3863302514469567, + "learning_rate": 1.0427114315499196e-05, + "loss": 0.3068, + "step": 6334 + }, + { + "epoch": 0.5018815607050902, + "grad_norm": 1.8921203092021055, + "learning_rate": 1.0424550611361412e-05, + "loss": 0.3052, + "step": 6335 + }, + { + "epoch": 0.5019607843137255, + "grad_norm": 1.490879549646653, + "learning_rate": 1.0421986879269017e-05, + "loss": 0.3738, + "step": 6336 + }, + { + "epoch": 0.5020400079223608, + "grad_norm": 1.5907067481251906, + "learning_rate": 1.0419423119390815e-05, + "loss": 0.3113, + "step": 6337 + }, + { + "epoch": 0.5021192315309962, + "grad_norm": 1.278589378102372, + "learning_rate": 1.041685933189562e-05, + "loss": 0.1976, + "step": 6338 + }, + { + "epoch": 0.5021984551396316, + "grad_norm": 1.4468349983015867, + "learning_rate": 1.041429551695224e-05, + "loss": 0.2964, + "step": 6339 + }, + { + "epoch": 0.502277678748267, + "grad_norm": 1.500715182540103, + "learning_rate": 1.0411731674729497e-05, + "loss": 0.2871, + "step": 6340 + }, + { + "epoch": 0.5023569023569023, + "grad_norm": 1.0764066568969828, + "learning_rate": 1.0409167805396202e-05, + "loss": 0.1743, + "step": 6341 + }, + { + "epoch": 0.5024361259655378, + "grad_norm": 1.4355219117678222, + "learning_rate": 1.040660390912118e-05, + "loss": 0.414, + "step": 6342 + }, + { + "epoch": 0.5025153495741731, + "grad_norm": 1.564899334878142, + "learning_rate": 1.0404039986073244e-05, + "loss": 0.327, + "step": 6343 + }, + { + "epoch": 0.5025945731828084, + "grad_norm": 1.5037304857000693, + "learning_rate": 1.0401476036421219e-05, + "loss": 0.3019, + "step": 6344 + }, + { + "epoch": 0.5026737967914439, + "grad_norm": 1.3928120752202564, + "learning_rate": 1.039891206033393e-05, + "loss": 0.2134, + "step": 6345 + }, + { + "epoch": 0.5027530204000792, + "grad_norm": 1.4469178046637121, + "learning_rate": 1.0396348057980202e-05, + "loss": 0.2972, + "step": 6346 + }, + { + "epoch": 0.5028322440087146, + "grad_norm": 1.4359649333357054, + "learning_rate": 1.0393784029528858e-05, + "loss": 0.2773, + "step": 6347 + }, + { + "epoch": 0.50291146761735, + "grad_norm": 1.4255707160412432, + "learning_rate": 1.0391219975148734e-05, + "loss": 0.3843, + "step": 6348 + }, + { + "epoch": 0.5029906912259854, + "grad_norm": 1.3612589872962728, + "learning_rate": 1.0388655895008654e-05, + "loss": 0.2539, + "step": 6349 + }, + { + "epoch": 0.5030699148346207, + "grad_norm": 1.6285956201381435, + "learning_rate": 1.0386091789277458e-05, + "loss": 0.3218, + "step": 6350 + }, + { + "epoch": 0.503149138443256, + "grad_norm": 1.2423368370314876, + "learning_rate": 1.038352765812397e-05, + "loss": 0.3027, + "step": 6351 + }, + { + "epoch": 0.5032283620518915, + "grad_norm": 1.2518129229801136, + "learning_rate": 1.0380963501717034e-05, + "loss": 0.3468, + "step": 6352 + }, + { + "epoch": 0.5033075856605268, + "grad_norm": 1.459526927392641, + "learning_rate": 1.0378399320225486e-05, + "loss": 0.2918, + "step": 6353 + }, + { + "epoch": 0.5033868092691622, + "grad_norm": 1.183591304700173, + "learning_rate": 1.037583511381816e-05, + "loss": 0.2244, + "step": 6354 + }, + { + "epoch": 0.5034660328777976, + "grad_norm": 1.4313192692663943, + "learning_rate": 1.0373270882663899e-05, + "loss": 0.3969, + "step": 6355 + }, + { + "epoch": 0.503545256486433, + "grad_norm": 1.5780992396789457, + "learning_rate": 1.0370706626931553e-05, + "loss": 0.3588, + "step": 6356 + }, + { + "epoch": 0.5036244800950683, + "grad_norm": 1.1625966134437107, + "learning_rate": 1.0368142346789954e-05, + "loss": 0.2655, + "step": 6357 + }, + { + "epoch": 0.5037037037037037, + "grad_norm": 1.437105815844461, + "learning_rate": 1.0365578042407956e-05, + "loss": 0.319, + "step": 6358 + }, + { + "epoch": 0.5037829273123391, + "grad_norm": 1.2733907531753645, + "learning_rate": 1.03630137139544e-05, + "loss": 0.284, + "step": 6359 + }, + { + "epoch": 0.5038621509209744, + "grad_norm": 1.435809079003575, + "learning_rate": 1.0360449361598137e-05, + "loss": 0.2839, + "step": 6360 + }, + { + "epoch": 0.5039413745296099, + "grad_norm": 1.6249705527188003, + "learning_rate": 1.0357884985508022e-05, + "loss": 0.2991, + "step": 6361 + }, + { + "epoch": 0.5040205981382452, + "grad_norm": 1.704940301500535, + "learning_rate": 1.03553205858529e-05, + "loss": 0.4343, + "step": 6362 + }, + { + "epoch": 0.5040998217468806, + "grad_norm": 1.6877267088032806, + "learning_rate": 1.0352756162801626e-05, + "loss": 0.3925, + "step": 6363 + }, + { + "epoch": 0.5041790453555159, + "grad_norm": 1.3248634560321138, + "learning_rate": 1.035019171652306e-05, + "loss": 0.2881, + "step": 6364 + }, + { + "epoch": 0.5042582689641513, + "grad_norm": 1.1378056290900431, + "learning_rate": 1.0347627247186053e-05, + "loss": 0.221, + "step": 6365 + }, + { + "epoch": 0.5043374925727867, + "grad_norm": 1.5673366086376939, + "learning_rate": 1.0345062754959463e-05, + "loss": 0.393, + "step": 6366 + }, + { + "epoch": 0.504416716181422, + "grad_norm": 1.414995363440549, + "learning_rate": 1.0342498240012153e-05, + "loss": 0.301, + "step": 6367 + }, + { + "epoch": 0.5044959397900575, + "grad_norm": 1.2063677710576597, + "learning_rate": 1.0339933702512978e-05, + "loss": 0.2292, + "step": 6368 + }, + { + "epoch": 0.5045751633986928, + "grad_norm": 1.1707263613291337, + "learning_rate": 1.0337369142630808e-05, + "loss": 0.2108, + "step": 6369 + }, + { + "epoch": 0.5046543870073282, + "grad_norm": 1.1985221189545539, + "learning_rate": 1.0334804560534504e-05, + "loss": 0.2547, + "step": 6370 + }, + { + "epoch": 0.5047336106159636, + "grad_norm": 1.278989342246064, + "learning_rate": 1.0332239956392926e-05, + "loss": 0.2883, + "step": 6371 + }, + { + "epoch": 0.5048128342245989, + "grad_norm": 1.4100396056115252, + "learning_rate": 1.032967533037495e-05, + "loss": 0.3149, + "step": 6372 + }, + { + "epoch": 0.5048920578332343, + "grad_norm": 1.2774621508238386, + "learning_rate": 1.0327110682649436e-05, + "loss": 0.2699, + "step": 6373 + }, + { + "epoch": 0.5049712814418696, + "grad_norm": 1.3135613067580887, + "learning_rate": 1.0324546013385258e-05, + "loss": 0.2478, + "step": 6374 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 1.5670962180579926, + "learning_rate": 1.0321981322751291e-05, + "loss": 0.343, + "step": 6375 + }, + { + "epoch": 0.5051297286591404, + "grad_norm": 1.5408071416404392, + "learning_rate": 1.03194166109164e-05, + "loss": 0.3523, + "step": 6376 + }, + { + "epoch": 0.5052089522677758, + "grad_norm": 1.4833899120386194, + "learning_rate": 1.0316851878049465e-05, + "loss": 0.2376, + "step": 6377 + }, + { + "epoch": 0.5052881758764112, + "grad_norm": 1.487260079556074, + "learning_rate": 1.0314287124319353e-05, + "loss": 0.3652, + "step": 6378 + }, + { + "epoch": 0.5053673994850465, + "grad_norm": 1.3703523160461057, + "learning_rate": 1.031172234989495e-05, + "loss": 0.2771, + "step": 6379 + }, + { + "epoch": 0.5054466230936819, + "grad_norm": 1.6207356903244037, + "learning_rate": 1.030915755494513e-05, + "loss": 0.4147, + "step": 6380 + }, + { + "epoch": 0.5055258467023173, + "grad_norm": 1.5215774082448033, + "learning_rate": 1.030659273963877e-05, + "loss": 0.3735, + "step": 6381 + }, + { + "epoch": 0.5056050703109527, + "grad_norm": 1.3647215287614936, + "learning_rate": 1.0304027904144756e-05, + "loss": 0.3171, + "step": 6382 + }, + { + "epoch": 0.505684293919588, + "grad_norm": 1.5227745167813815, + "learning_rate": 1.0301463048631968e-05, + "loss": 0.3718, + "step": 6383 + }, + { + "epoch": 0.5057635175282235, + "grad_norm": 1.2902648478382346, + "learning_rate": 1.0298898173269285e-05, + "loss": 0.2381, + "step": 6384 + }, + { + "epoch": 0.5058427411368588, + "grad_norm": 1.364612823540458, + "learning_rate": 1.0296333278225599e-05, + "loss": 0.2405, + "step": 6385 + }, + { + "epoch": 0.5059219647454941, + "grad_norm": 1.2743178880217614, + "learning_rate": 1.0293768363669791e-05, + "loss": 0.2172, + "step": 6386 + }, + { + "epoch": 0.5060011883541295, + "grad_norm": 1.5114480877915564, + "learning_rate": 1.0291203429770749e-05, + "loss": 0.3032, + "step": 6387 + }, + { + "epoch": 0.5060804119627649, + "grad_norm": 1.7953323070021674, + "learning_rate": 1.0288638476697365e-05, + "loss": 0.3065, + "step": 6388 + }, + { + "epoch": 0.5061596355714003, + "grad_norm": 1.4854213913587258, + "learning_rate": 1.0286073504618524e-05, + "loss": 0.314, + "step": 6389 + }, + { + "epoch": 0.5062388591800356, + "grad_norm": 1.28258545390892, + "learning_rate": 1.0283508513703118e-05, + "loss": 0.317, + "step": 6390 + }, + { + "epoch": 0.5063180827886711, + "grad_norm": 1.664153732903344, + "learning_rate": 1.0280943504120045e-05, + "loss": 0.3049, + "step": 6391 + }, + { + "epoch": 0.5063973063973064, + "grad_norm": 1.3287969335967227, + "learning_rate": 1.027837847603819e-05, + "loss": 0.2937, + "step": 6392 + }, + { + "epoch": 0.5064765300059417, + "grad_norm": 1.3513776619080435, + "learning_rate": 1.0275813429626456e-05, + "loss": 0.2436, + "step": 6393 + }, + { + "epoch": 0.5065557536145772, + "grad_norm": 1.5177674479376446, + "learning_rate": 1.027324836505373e-05, + "loss": 0.3859, + "step": 6394 + }, + { + "epoch": 0.5066349772232125, + "grad_norm": 1.4038432760608448, + "learning_rate": 1.0270683282488913e-05, + "loss": 0.2824, + "step": 6395 + }, + { + "epoch": 0.5067142008318479, + "grad_norm": 1.3341102595463405, + "learning_rate": 1.026811818210091e-05, + "loss": 0.1889, + "step": 6396 + }, + { + "epoch": 0.5067934244404833, + "grad_norm": 1.1399310245436207, + "learning_rate": 1.0265553064058612e-05, + "loss": 0.1709, + "step": 6397 + }, + { + "epoch": 0.5068726480491187, + "grad_norm": 1.1846882525796747, + "learning_rate": 1.0262987928530921e-05, + "loss": 0.2669, + "step": 6398 + }, + { + "epoch": 0.506951871657754, + "grad_norm": 1.5292716481835442, + "learning_rate": 1.0260422775686743e-05, + "loss": 0.4058, + "step": 6399 + }, + { + "epoch": 0.5070310952663893, + "grad_norm": 1.2776054425290138, + "learning_rate": 1.0257857605694976e-05, + "loss": 0.211, + "step": 6400 + }, + { + "epoch": 0.5071103188750248, + "grad_norm": 1.3643267209202141, + "learning_rate": 1.025529241872453e-05, + "loss": 0.2445, + "step": 6401 + }, + { + "epoch": 0.5071895424836601, + "grad_norm": 1.1899612143691012, + "learning_rate": 1.0252727214944302e-05, + "loss": 0.2256, + "step": 6402 + }, + { + "epoch": 0.5072687660922955, + "grad_norm": 1.2377676761096175, + "learning_rate": 1.0250161994523205e-05, + "loss": 0.2539, + "step": 6403 + }, + { + "epoch": 0.5073479897009309, + "grad_norm": 1.1867164070738405, + "learning_rate": 1.0247596757630147e-05, + "loss": 0.2179, + "step": 6404 + }, + { + "epoch": 0.5074272133095663, + "grad_norm": 1.584171660603048, + "learning_rate": 1.0245031504434032e-05, + "loss": 0.2577, + "step": 6405 + }, + { + "epoch": 0.5075064369182016, + "grad_norm": 1.687711130361364, + "learning_rate": 1.024246623510377e-05, + "loss": 0.3636, + "step": 6406 + }, + { + "epoch": 0.507585660526837, + "grad_norm": 1.8105725693244594, + "learning_rate": 1.0239900949808274e-05, + "loss": 0.3856, + "step": 6407 + }, + { + "epoch": 0.5076648841354724, + "grad_norm": 1.3081729232220904, + "learning_rate": 1.0237335648716456e-05, + "loss": 0.2592, + "step": 6408 + }, + { + "epoch": 0.5077441077441077, + "grad_norm": 2.158287753452955, + "learning_rate": 1.0234770331997224e-05, + "loss": 0.3329, + "step": 6409 + }, + { + "epoch": 0.5078233313527432, + "grad_norm": 1.2658435504708871, + "learning_rate": 1.02322049998195e-05, + "loss": 0.3249, + "step": 6410 + }, + { + "epoch": 0.5079025549613785, + "grad_norm": 1.1497160756670872, + "learning_rate": 1.022963965235219e-05, + "loss": 0.2197, + "step": 6411 + }, + { + "epoch": 0.5079817785700138, + "grad_norm": 2.030858136092228, + "learning_rate": 1.0227074289764216e-05, + "loss": 0.3183, + "step": 6412 + }, + { + "epoch": 0.5080610021786492, + "grad_norm": 1.5608515501929952, + "learning_rate": 1.0224508912224491e-05, + "loss": 0.296, + "step": 6413 + }, + { + "epoch": 0.5081402257872846, + "grad_norm": 1.4509740689687938, + "learning_rate": 1.0221943519901935e-05, + "loss": 0.2715, + "step": 6414 + }, + { + "epoch": 0.50821944939592, + "grad_norm": 1.4275975461074069, + "learning_rate": 1.0219378112965468e-05, + "loss": 0.3238, + "step": 6415 + }, + { + "epoch": 0.5082986730045553, + "grad_norm": 1.1466407148088318, + "learning_rate": 1.0216812691584005e-05, + "loss": 0.2091, + "step": 6416 + }, + { + "epoch": 0.5083778966131908, + "grad_norm": 1.4011407964265867, + "learning_rate": 1.021424725592647e-05, + "loss": 0.2278, + "step": 6417 + }, + { + "epoch": 0.5084571202218261, + "grad_norm": 1.415432571144129, + "learning_rate": 1.0211681806161787e-05, + "loss": 0.268, + "step": 6418 + }, + { + "epoch": 0.5085363438304614, + "grad_norm": 1.5985332967556114, + "learning_rate": 1.0209116342458872e-05, + "loss": 0.3466, + "step": 6419 + }, + { + "epoch": 0.5086155674390969, + "grad_norm": 2.9851529078817767, + "learning_rate": 1.0206550864986656e-05, + "loss": 0.2604, + "step": 6420 + }, + { + "epoch": 0.5086947910477322, + "grad_norm": 1.3778038263067551, + "learning_rate": 1.0203985373914056e-05, + "loss": 0.3411, + "step": 6421 + }, + { + "epoch": 0.5087740146563676, + "grad_norm": 1.5037392162532608, + "learning_rate": 1.0201419869410001e-05, + "loss": 0.312, + "step": 6422 + }, + { + "epoch": 0.508853238265003, + "grad_norm": 1.5963366301949247, + "learning_rate": 1.0198854351643416e-05, + "loss": 0.2743, + "step": 6423 + }, + { + "epoch": 0.5089324618736384, + "grad_norm": 1.7420541310395408, + "learning_rate": 1.0196288820783232e-05, + "loss": 0.3617, + "step": 6424 + }, + { + "epoch": 0.5090116854822737, + "grad_norm": 1.3688084086079422, + "learning_rate": 1.0193723276998371e-05, + "loss": 0.2392, + "step": 6425 + }, + { + "epoch": 0.509090909090909, + "grad_norm": 1.629014141711662, + "learning_rate": 1.0191157720457765e-05, + "loss": 0.3443, + "step": 6426 + }, + { + "epoch": 0.5091701326995445, + "grad_norm": 1.2938522024794588, + "learning_rate": 1.0188592151330343e-05, + "loss": 0.2253, + "step": 6427 + }, + { + "epoch": 0.5092493563081798, + "grad_norm": 1.4008045975761283, + "learning_rate": 1.0186026569785037e-05, + "loss": 0.2422, + "step": 6428 + }, + { + "epoch": 0.5093285799168152, + "grad_norm": 1.5345817467949727, + "learning_rate": 1.0183460975990773e-05, + "loss": 0.422, + "step": 6429 + }, + { + "epoch": 0.5094078035254506, + "grad_norm": 2.109083287768109, + "learning_rate": 1.0180895370116488e-05, + "loss": 0.2911, + "step": 6430 + }, + { + "epoch": 0.509487027134086, + "grad_norm": 1.310915569201543, + "learning_rate": 1.0178329752331116e-05, + "loss": 0.2519, + "step": 6431 + }, + { + "epoch": 0.5095662507427213, + "grad_norm": 1.3821282776105328, + "learning_rate": 1.0175764122803584e-05, + "loss": 0.2736, + "step": 6432 + }, + { + "epoch": 0.5096454743513567, + "grad_norm": 1.181921339592671, + "learning_rate": 1.017319848170283e-05, + "loss": 0.2547, + "step": 6433 + }, + { + "epoch": 0.5097246979599921, + "grad_norm": 1.4343224517362072, + "learning_rate": 1.0170632829197792e-05, + "loss": 0.3133, + "step": 6434 + }, + { + "epoch": 0.5098039215686274, + "grad_norm": 1.2119076748344708, + "learning_rate": 1.0168067165457403e-05, + "loss": 0.2558, + "step": 6435 + }, + { + "epoch": 0.5098831451772629, + "grad_norm": 1.4787283873775698, + "learning_rate": 1.01655014906506e-05, + "loss": 0.3147, + "step": 6436 + }, + { + "epoch": 0.5099623687858982, + "grad_norm": 1.438045038420549, + "learning_rate": 1.016293580494632e-05, + "loss": 0.2881, + "step": 6437 + }, + { + "epoch": 0.5100415923945336, + "grad_norm": 1.3427077703758772, + "learning_rate": 1.0160370108513497e-05, + "loss": 0.3295, + "step": 6438 + }, + { + "epoch": 0.5101208160031689, + "grad_norm": 1.4109312458553513, + "learning_rate": 1.015780440152108e-05, + "loss": 0.2572, + "step": 6439 + }, + { + "epoch": 0.5102000396118043, + "grad_norm": 1.3785757613042255, + "learning_rate": 1.0155238684138e-05, + "loss": 0.2714, + "step": 6440 + }, + { + "epoch": 0.5102792632204397, + "grad_norm": 1.1241131454385036, + "learning_rate": 1.0152672956533198e-05, + "loss": 0.1971, + "step": 6441 + }, + { + "epoch": 0.510358486829075, + "grad_norm": 1.3099059174885048, + "learning_rate": 1.015010721887562e-05, + "loss": 0.2611, + "step": 6442 + }, + { + "epoch": 0.5104377104377105, + "grad_norm": 1.369051604179246, + "learning_rate": 1.0147541471334204e-05, + "loss": 0.2416, + "step": 6443 + }, + { + "epoch": 0.5105169340463458, + "grad_norm": 1.1581257484655514, + "learning_rate": 1.0144975714077889e-05, + "loss": 0.2493, + "step": 6444 + }, + { + "epoch": 0.5105961576549812, + "grad_norm": 1.6191615342743022, + "learning_rate": 1.0142409947275621e-05, + "loss": 0.3286, + "step": 6445 + }, + { + "epoch": 0.5106753812636166, + "grad_norm": 1.6108592678209457, + "learning_rate": 1.0139844171096345e-05, + "loss": 0.3333, + "step": 6446 + }, + { + "epoch": 0.5107546048722519, + "grad_norm": 1.51305686161588, + "learning_rate": 1.0137278385709004e-05, + "loss": 0.2652, + "step": 6447 + }, + { + "epoch": 0.5108338284808873, + "grad_norm": 1.2447413467014472, + "learning_rate": 1.0134712591282539e-05, + "loss": 0.2715, + "step": 6448 + }, + { + "epoch": 0.5109130520895226, + "grad_norm": 1.399517704221, + "learning_rate": 1.0132146787985898e-05, + "loss": 0.2783, + "step": 6449 + }, + { + "epoch": 0.5109922756981581, + "grad_norm": 1.3441536692328138, + "learning_rate": 1.0129580975988029e-05, + "loss": 0.2627, + "step": 6450 + }, + { + "epoch": 0.5110714993067934, + "grad_norm": 1.3401255907442646, + "learning_rate": 1.0127015155457875e-05, + "loss": 0.2593, + "step": 6451 + }, + { + "epoch": 0.5111507229154288, + "grad_norm": 1.6690276283876362, + "learning_rate": 1.0124449326564383e-05, + "loss": 0.2879, + "step": 6452 + }, + { + "epoch": 0.5112299465240642, + "grad_norm": 1.319519900289313, + "learning_rate": 1.0121883489476505e-05, + "loss": 0.2709, + "step": 6453 + }, + { + "epoch": 0.5113091701326995, + "grad_norm": 1.5736805615527079, + "learning_rate": 1.0119317644363182e-05, + "loss": 0.3142, + "step": 6454 + }, + { + "epoch": 0.5113883937413349, + "grad_norm": 1.3801318979259982, + "learning_rate": 1.0116751791393371e-05, + "loss": 0.2494, + "step": 6455 + }, + { + "epoch": 0.5114676173499703, + "grad_norm": 1.815866072065313, + "learning_rate": 1.011418593073601e-05, + "loss": 0.3711, + "step": 6456 + }, + { + "epoch": 0.5115468409586057, + "grad_norm": 1.3481706241405615, + "learning_rate": 1.0111620062560059e-05, + "loss": 0.2271, + "step": 6457 + }, + { + "epoch": 0.511626064567241, + "grad_norm": 1.3327789776784724, + "learning_rate": 1.0109054187034463e-05, + "loss": 0.3119, + "step": 6458 + }, + { + "epoch": 0.5117052881758765, + "grad_norm": 1.4790039403829909, + "learning_rate": 1.0106488304328175e-05, + "loss": 0.2823, + "step": 6459 + }, + { + "epoch": 0.5117845117845118, + "grad_norm": 1.3280615405659113, + "learning_rate": 1.010392241461014e-05, + "loss": 0.2116, + "step": 6460 + }, + { + "epoch": 0.5118637353931471, + "grad_norm": 1.7079129517671097, + "learning_rate": 1.010135651804932e-05, + "loss": 0.3061, + "step": 6461 + }, + { + "epoch": 0.5119429590017825, + "grad_norm": 1.401544507139228, + "learning_rate": 1.0098790614814658e-05, + "loss": 0.3032, + "step": 6462 + }, + { + "epoch": 0.5120221826104179, + "grad_norm": 1.4997603792331273, + "learning_rate": 1.009622470507511e-05, + "loss": 0.3821, + "step": 6463 + }, + { + "epoch": 0.5121014062190533, + "grad_norm": 1.3993050920884416, + "learning_rate": 1.0093658788999628e-05, + "loss": 0.335, + "step": 6464 + }, + { + "epoch": 0.5121806298276886, + "grad_norm": 1.5181080898013404, + "learning_rate": 1.0091092866757164e-05, + "loss": 0.2908, + "step": 6465 + }, + { + "epoch": 0.5122598534363241, + "grad_norm": 1.1064918374837822, + "learning_rate": 1.0088526938516676e-05, + "loss": 0.2043, + "step": 6466 + }, + { + "epoch": 0.5123390770449594, + "grad_norm": 1.1616319607176038, + "learning_rate": 1.0085961004447114e-05, + "loss": 0.2695, + "step": 6467 + }, + { + "epoch": 0.5124183006535947, + "grad_norm": 1.4197303822570853, + "learning_rate": 1.0083395064717429e-05, + "loss": 0.2873, + "step": 6468 + }, + { + "epoch": 0.5124975242622302, + "grad_norm": 1.1476501638597234, + "learning_rate": 1.0080829119496587e-05, + "loss": 0.2547, + "step": 6469 + }, + { + "epoch": 0.5125767478708655, + "grad_norm": 1.3036752913469873, + "learning_rate": 1.0078263168953532e-05, + "loss": 0.2903, + "step": 6470 + }, + { + "epoch": 0.5126559714795009, + "grad_norm": 1.3727559772529956, + "learning_rate": 1.0075697213257227e-05, + "loss": 0.2488, + "step": 6471 + }, + { + "epoch": 0.5127351950881363, + "grad_norm": 1.3520458464295788, + "learning_rate": 1.0073131252576622e-05, + "loss": 0.2675, + "step": 6472 + }, + { + "epoch": 0.5128144186967717, + "grad_norm": 1.3819109703755623, + "learning_rate": 1.0070565287080676e-05, + "loss": 0.3302, + "step": 6473 + }, + { + "epoch": 0.512893642305407, + "grad_norm": 1.4470920899326982, + "learning_rate": 1.0067999316938348e-05, + "loss": 0.4019, + "step": 6474 + }, + { + "epoch": 0.5129728659140423, + "grad_norm": 1.4741553189409777, + "learning_rate": 1.006543334231859e-05, + "loss": 0.3144, + "step": 6475 + }, + { + "epoch": 0.5130520895226778, + "grad_norm": 1.583537386558902, + "learning_rate": 1.0062867363390361e-05, + "loss": 0.2579, + "step": 6476 + }, + { + "epoch": 0.5131313131313131, + "grad_norm": 1.4580070357913764, + "learning_rate": 1.0060301380322622e-05, + "loss": 0.3546, + "step": 6477 + }, + { + "epoch": 0.5132105367399485, + "grad_norm": 1.5032710064645964, + "learning_rate": 1.0057735393284322e-05, + "loss": 0.2987, + "step": 6478 + }, + { + "epoch": 0.5132897603485839, + "grad_norm": 1.7188084790954914, + "learning_rate": 1.0055169402444429e-05, + "loss": 0.373, + "step": 6479 + }, + { + "epoch": 0.5133689839572193, + "grad_norm": 1.4069100480277812, + "learning_rate": 1.0052603407971892e-05, + "loss": 0.2905, + "step": 6480 + }, + { + "epoch": 0.5134482075658546, + "grad_norm": 1.2741138455825607, + "learning_rate": 1.0050037410035676e-05, + "loss": 0.2403, + "step": 6481 + }, + { + "epoch": 0.51352743117449, + "grad_norm": 1.5121683741613499, + "learning_rate": 1.004747140880474e-05, + "loss": 0.325, + "step": 6482 + }, + { + "epoch": 0.5136066547831254, + "grad_norm": 1.5632399529557859, + "learning_rate": 1.0044905404448037e-05, + "loss": 0.3244, + "step": 6483 + }, + { + "epoch": 0.5136858783917607, + "grad_norm": 1.2694155225637005, + "learning_rate": 1.0042339397134528e-05, + "loss": 0.2687, + "step": 6484 + }, + { + "epoch": 0.5137651020003962, + "grad_norm": 1.229779412965252, + "learning_rate": 1.0039773387033178e-05, + "loss": 0.2719, + "step": 6485 + }, + { + "epoch": 0.5138443256090315, + "grad_norm": 1.4716763089929525, + "learning_rate": 1.0037207374312936e-05, + "loss": 0.2882, + "step": 6486 + }, + { + "epoch": 0.5139235492176669, + "grad_norm": 1.7992606118391934, + "learning_rate": 1.003464135914277e-05, + "loss": 0.3414, + "step": 6487 + }, + { + "epoch": 0.5140027728263022, + "grad_norm": 1.5190185780278507, + "learning_rate": 1.0032075341691639e-05, + "loss": 0.3217, + "step": 6488 + }, + { + "epoch": 0.5140819964349376, + "grad_norm": 1.0155358847355467, + "learning_rate": 1.0029509322128499e-05, + "loss": 0.2007, + "step": 6489 + }, + { + "epoch": 0.514161220043573, + "grad_norm": 1.322341655238348, + "learning_rate": 1.0026943300622313e-05, + "loss": 0.2863, + "step": 6490 + }, + { + "epoch": 0.5142404436522083, + "grad_norm": 1.109876830347495, + "learning_rate": 1.0024377277342038e-05, + "loss": 0.2151, + "step": 6491 + }, + { + "epoch": 0.5143196672608438, + "grad_norm": 1.609858442176041, + "learning_rate": 1.002181125245664e-05, + "loss": 0.3997, + "step": 6492 + }, + { + "epoch": 0.5143988908694791, + "grad_norm": 1.6931970810371015, + "learning_rate": 1.0019245226135075e-05, + "loss": 0.4202, + "step": 6493 + }, + { + "epoch": 0.5144781144781144, + "grad_norm": 1.4407113823007818, + "learning_rate": 1.0016679198546304e-05, + "loss": 0.3357, + "step": 6494 + }, + { + "epoch": 0.5145573380867499, + "grad_norm": 1.3950102431679114, + "learning_rate": 1.0014113169859285e-05, + "loss": 0.3026, + "step": 6495 + }, + { + "epoch": 0.5146365616953852, + "grad_norm": 1.4614967907400644, + "learning_rate": 1.0011547140242987e-05, + "loss": 0.2687, + "step": 6496 + }, + { + "epoch": 0.5147157853040206, + "grad_norm": 1.3616784523328638, + "learning_rate": 1.0008981109866363e-05, + "loss": 0.2079, + "step": 6497 + }, + { + "epoch": 0.514795008912656, + "grad_norm": 1.0006941442171828, + "learning_rate": 1.0006415078898377e-05, + "loss": 0.1817, + "step": 6498 + }, + { + "epoch": 0.5148742325212914, + "grad_norm": 1.3198203608938563, + "learning_rate": 1.0003849047507987e-05, + "loss": 0.2822, + "step": 6499 + }, + { + "epoch": 0.5149534561299267, + "grad_norm": 1.245552027001811, + "learning_rate": 1.0001283015864157e-05, + "loss": 0.2904, + "step": 6500 + }, + { + "epoch": 0.515032679738562, + "grad_norm": 1.6016362034165224, + "learning_rate": 9.998716984135847e-06, + "loss": 0.4507, + "step": 6501 + }, + { + "epoch": 0.5151119033471975, + "grad_norm": 1.4845955851877697, + "learning_rate": 9.996150952492018e-06, + "loss": 0.3836, + "step": 6502 + }, + { + "epoch": 0.5151911269558328, + "grad_norm": 1.338854729799261, + "learning_rate": 9.993584921101628e-06, + "loss": 0.3241, + "step": 6503 + }, + { + "epoch": 0.5152703505644682, + "grad_norm": 1.2501295576155476, + "learning_rate": 9.991018890133642e-06, + "loss": 0.2338, + "step": 6504 + }, + { + "epoch": 0.5153495741731036, + "grad_norm": 1.367821724046173, + "learning_rate": 9.988452859757017e-06, + "loss": 0.3418, + "step": 6505 + }, + { + "epoch": 0.515428797781739, + "grad_norm": 1.1045534234665813, + "learning_rate": 9.985886830140717e-06, + "loss": 0.1483, + "step": 6506 + }, + { + "epoch": 0.5155080213903743, + "grad_norm": 1.6183770616315782, + "learning_rate": 9.983320801453702e-06, + "loss": 0.2698, + "step": 6507 + }, + { + "epoch": 0.5155872449990097, + "grad_norm": 1.432076627891755, + "learning_rate": 9.98075477386493e-06, + "loss": 0.2755, + "step": 6508 + }, + { + "epoch": 0.5156664686076451, + "grad_norm": 1.3340431764865326, + "learning_rate": 9.978188747543364e-06, + "loss": 0.2588, + "step": 6509 + }, + { + "epoch": 0.5157456922162804, + "grad_norm": 1.581196240868058, + "learning_rate": 9.975622722657965e-06, + "loss": 0.3198, + "step": 6510 + }, + { + "epoch": 0.5158249158249159, + "grad_norm": 1.2679945471071485, + "learning_rate": 9.973056699377692e-06, + "loss": 0.2705, + "step": 6511 + }, + { + "epoch": 0.5159041394335512, + "grad_norm": 1.2261073996585534, + "learning_rate": 9.970490677871506e-06, + "loss": 0.2446, + "step": 6512 + }, + { + "epoch": 0.5159833630421866, + "grad_norm": 1.891342414662232, + "learning_rate": 9.967924658308366e-06, + "loss": 0.3654, + "step": 6513 + }, + { + "epoch": 0.5160625866508219, + "grad_norm": 1.2628602047310538, + "learning_rate": 9.965358640857231e-06, + "loss": 0.1947, + "step": 6514 + }, + { + "epoch": 0.5161418102594573, + "grad_norm": 1.4322366128247757, + "learning_rate": 9.962792625687067e-06, + "loss": 0.3222, + "step": 6515 + }, + { + "epoch": 0.5162210338680927, + "grad_norm": 1.336805850325114, + "learning_rate": 9.960226612966828e-06, + "loss": 0.3178, + "step": 6516 + }, + { + "epoch": 0.516300257476728, + "grad_norm": 1.3832001781071752, + "learning_rate": 9.957660602865477e-06, + "loss": 0.2969, + "step": 6517 + }, + { + "epoch": 0.5163794810853635, + "grad_norm": 1.2823249710214122, + "learning_rate": 9.955094595551968e-06, + "loss": 0.2846, + "step": 6518 + }, + { + "epoch": 0.5164587046939988, + "grad_norm": 1.4275951606358972, + "learning_rate": 9.952528591195265e-06, + "loss": 0.237, + "step": 6519 + }, + { + "epoch": 0.5165379283026342, + "grad_norm": 1.2665494870212624, + "learning_rate": 9.949962589964327e-06, + "loss": 0.265, + "step": 6520 + }, + { + "epoch": 0.5166171519112696, + "grad_norm": 1.427985504438624, + "learning_rate": 9.94739659202811e-06, + "loss": 0.2594, + "step": 6521 + }, + { + "epoch": 0.5166963755199049, + "grad_norm": 1.3770262666103272, + "learning_rate": 9.944830597555573e-06, + "loss": 0.2566, + "step": 6522 + }, + { + "epoch": 0.5167755991285403, + "grad_norm": 1.1104599415081753, + "learning_rate": 9.94226460671568e-06, + "loss": 0.1907, + "step": 6523 + }, + { + "epoch": 0.5168548227371756, + "grad_norm": 1.4519819827409062, + "learning_rate": 9.939698619677383e-06, + "loss": 0.3019, + "step": 6524 + }, + { + "epoch": 0.5169340463458111, + "grad_norm": 1.2591534360748693, + "learning_rate": 9.937132636609642e-06, + "loss": 0.2439, + "step": 6525 + }, + { + "epoch": 0.5170132699544464, + "grad_norm": 1.3530875061296936, + "learning_rate": 9.934566657681412e-06, + "loss": 0.2697, + "step": 6526 + }, + { + "epoch": 0.5170924935630818, + "grad_norm": 1.2949836348716584, + "learning_rate": 9.932000683061654e-06, + "loss": 0.336, + "step": 6527 + }, + { + "epoch": 0.5171717171717172, + "grad_norm": 1.5396352721555138, + "learning_rate": 9.929434712919327e-06, + "loss": 0.3644, + "step": 6528 + }, + { + "epoch": 0.5172509407803525, + "grad_norm": 1.4718538655183226, + "learning_rate": 9.926868747423381e-06, + "loss": 0.3595, + "step": 6529 + }, + { + "epoch": 0.5173301643889879, + "grad_norm": 1.3030805697611099, + "learning_rate": 9.924302786742775e-06, + "loss": 0.2359, + "step": 6530 + }, + { + "epoch": 0.5174093879976233, + "grad_norm": 1.6778384483392856, + "learning_rate": 9.92173683104647e-06, + "loss": 0.333, + "step": 6531 + }, + { + "epoch": 0.5174886116062587, + "grad_norm": 1.2860284615745243, + "learning_rate": 9.919170880503416e-06, + "loss": 0.1844, + "step": 6532 + }, + { + "epoch": 0.517567835214894, + "grad_norm": 1.3396535564695895, + "learning_rate": 9.916604935282573e-06, + "loss": 0.3616, + "step": 6533 + }, + { + "epoch": 0.5176470588235295, + "grad_norm": 1.5839666192931645, + "learning_rate": 9.914038995552891e-06, + "loss": 0.3028, + "step": 6534 + }, + { + "epoch": 0.5177262824321648, + "grad_norm": 1.2158117892305256, + "learning_rate": 9.911473061483326e-06, + "loss": 0.2278, + "step": 6535 + }, + { + "epoch": 0.5178055060408001, + "grad_norm": 1.1860243331133984, + "learning_rate": 9.908907133242838e-06, + "loss": 0.217, + "step": 6536 + }, + { + "epoch": 0.5178847296494355, + "grad_norm": 1.571307756562921, + "learning_rate": 9.906341211000375e-06, + "loss": 0.3241, + "step": 6537 + }, + { + "epoch": 0.5179639532580709, + "grad_norm": 1.352736758711321, + "learning_rate": 9.903775294924892e-06, + "loss": 0.2471, + "step": 6538 + }, + { + "epoch": 0.5180431768667063, + "grad_norm": 1.1912345151225152, + "learning_rate": 9.901209385185345e-06, + "loss": 0.2186, + "step": 6539 + }, + { + "epoch": 0.5181224004753416, + "grad_norm": 1.6722376099949459, + "learning_rate": 9.898643481950683e-06, + "loss": 0.3487, + "step": 6540 + }, + { + "epoch": 0.5182016240839771, + "grad_norm": 1.34099016289351, + "learning_rate": 9.89607758538986e-06, + "loss": 0.273, + "step": 6541 + }, + { + "epoch": 0.5182808476926124, + "grad_norm": 1.337450027613429, + "learning_rate": 9.893511695671828e-06, + "loss": 0.2893, + "step": 6542 + }, + { + "epoch": 0.5183600713012477, + "grad_norm": 1.3192789259086608, + "learning_rate": 9.890945812965538e-06, + "loss": 0.2486, + "step": 6543 + }, + { + "epoch": 0.5184392949098832, + "grad_norm": 1.5135675585355135, + "learning_rate": 9.888379937439944e-06, + "loss": 0.2595, + "step": 6544 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 1.1337436770634528, + "learning_rate": 9.885814069263991e-06, + "loss": 0.1995, + "step": 6545 + }, + { + "epoch": 0.5185977421271539, + "grad_norm": 1.5365587138434578, + "learning_rate": 9.883248208606632e-06, + "loss": 0.3141, + "step": 6546 + }, + { + "epoch": 0.5186769657357893, + "grad_norm": 1.422701746607019, + "learning_rate": 9.880682355636821e-06, + "loss": 0.2765, + "step": 6547 + }, + { + "epoch": 0.5187561893444247, + "grad_norm": 1.529237671876428, + "learning_rate": 9.878116510523498e-06, + "loss": 0.3059, + "step": 6548 + }, + { + "epoch": 0.51883541295306, + "grad_norm": 1.2404879896734196, + "learning_rate": 9.87555067343562e-06, + "loss": 0.2403, + "step": 6549 + }, + { + "epoch": 0.5189146365616953, + "grad_norm": 1.6405916649431471, + "learning_rate": 9.872984844542128e-06, + "loss": 0.2724, + "step": 6550 + }, + { + "epoch": 0.5189938601703308, + "grad_norm": 1.6431578600880408, + "learning_rate": 9.870419024011973e-06, + "loss": 0.3528, + "step": 6551 + }, + { + "epoch": 0.5190730837789661, + "grad_norm": 1.4649941004090359, + "learning_rate": 9.867853212014104e-06, + "loss": 0.2248, + "step": 6552 + }, + { + "epoch": 0.5191523073876015, + "grad_norm": 1.4467380048203815, + "learning_rate": 9.865287408717464e-06, + "loss": 0.2586, + "step": 6553 + }, + { + "epoch": 0.5192315309962369, + "grad_norm": 1.3687632080923724, + "learning_rate": 9.862721614291e-06, + "loss": 0.2637, + "step": 6554 + }, + { + "epoch": 0.5193107546048723, + "grad_norm": 1.5348495927913135, + "learning_rate": 9.860155828903658e-06, + "loss": 0.282, + "step": 6555 + }, + { + "epoch": 0.5193899782135076, + "grad_norm": 1.3104011170070438, + "learning_rate": 9.85759005272438e-06, + "loss": 0.2535, + "step": 6556 + }, + { + "epoch": 0.519469201822143, + "grad_norm": 1.3679464498838536, + "learning_rate": 9.855024285922114e-06, + "loss": 0.3364, + "step": 6557 + }, + { + "epoch": 0.5195484254307784, + "grad_norm": 1.3888039704372102, + "learning_rate": 9.8524585286658e-06, + "loss": 0.3388, + "step": 6558 + }, + { + "epoch": 0.5196276490394137, + "grad_norm": 1.2903769663712974, + "learning_rate": 9.84989278112438e-06, + "loss": 0.2027, + "step": 6559 + }, + { + "epoch": 0.5197068726480492, + "grad_norm": 1.5477846072334485, + "learning_rate": 9.847327043466802e-06, + "loss": 0.2101, + "step": 6560 + }, + { + "epoch": 0.5197860962566845, + "grad_norm": 1.6518751885131344, + "learning_rate": 9.844761315862002e-06, + "loss": 0.2742, + "step": 6561 + }, + { + "epoch": 0.5198653198653199, + "grad_norm": 1.4493261938639335, + "learning_rate": 9.842195598478922e-06, + "loss": 0.2671, + "step": 6562 + }, + { + "epoch": 0.5199445434739552, + "grad_norm": 1.3792945002120043, + "learning_rate": 9.839629891486503e-06, + "loss": 0.226, + "step": 6563 + }, + { + "epoch": 0.5200237670825906, + "grad_norm": 1.2222546090097701, + "learning_rate": 9.83706419505368e-06, + "loss": 0.306, + "step": 6564 + }, + { + "epoch": 0.520102990691226, + "grad_norm": 1.4426633279180927, + "learning_rate": 9.834498509349402e-06, + "loss": 0.2855, + "step": 6565 + }, + { + "epoch": 0.5201822142998613, + "grad_norm": 1.209579870158141, + "learning_rate": 9.831932834542598e-06, + "loss": 0.3127, + "step": 6566 + }, + { + "epoch": 0.5202614379084968, + "grad_norm": 1.8676181339001057, + "learning_rate": 9.829367170802208e-06, + "loss": 0.3819, + "step": 6567 + }, + { + "epoch": 0.5203406615171321, + "grad_norm": 1.3864076170216155, + "learning_rate": 9.82680151829717e-06, + "loss": 0.3071, + "step": 6568 + }, + { + "epoch": 0.5204198851257675, + "grad_norm": 1.651451752149105, + "learning_rate": 9.824235877196418e-06, + "loss": 0.2857, + "step": 6569 + }, + { + "epoch": 0.5204991087344029, + "grad_norm": 1.2776725528422195, + "learning_rate": 9.821670247668887e-06, + "loss": 0.282, + "step": 6570 + }, + { + "epoch": 0.5205783323430382, + "grad_norm": 1.318574722385798, + "learning_rate": 9.819104629883513e-06, + "loss": 0.2155, + "step": 6571 + }, + { + "epoch": 0.5206575559516736, + "grad_norm": 1.312429714055526, + "learning_rate": 9.816539024009227e-06, + "loss": 0.3126, + "step": 6572 + }, + { + "epoch": 0.520736779560309, + "grad_norm": 1.4528507004088553, + "learning_rate": 9.813973430214965e-06, + "loss": 0.2408, + "step": 6573 + }, + { + "epoch": 0.5208160031689444, + "grad_norm": 1.5859479582591554, + "learning_rate": 9.811407848669657e-06, + "loss": 0.4102, + "step": 6574 + }, + { + "epoch": 0.5208952267775797, + "grad_norm": 1.5629767648893005, + "learning_rate": 9.808842279542235e-06, + "loss": 0.2933, + "step": 6575 + }, + { + "epoch": 0.520974450386215, + "grad_norm": 1.2234931913568277, + "learning_rate": 9.80627672300163e-06, + "loss": 0.2267, + "step": 6576 + }, + { + "epoch": 0.5210536739948505, + "grad_norm": 1.5754881580614246, + "learning_rate": 9.80371117921677e-06, + "loss": 0.3487, + "step": 6577 + }, + { + "epoch": 0.5211328976034858, + "grad_norm": 1.130792574641297, + "learning_rate": 9.801145648356585e-06, + "loss": 0.2175, + "step": 6578 + }, + { + "epoch": 0.5212121212121212, + "grad_norm": 1.2418300256654666, + "learning_rate": 9.798580130590004e-06, + "loss": 0.2637, + "step": 6579 + }, + { + "epoch": 0.5212913448207566, + "grad_norm": 1.5691711928683074, + "learning_rate": 9.79601462608595e-06, + "loss": 0.3014, + "step": 6580 + }, + { + "epoch": 0.521370568429392, + "grad_norm": 1.374551434834054, + "learning_rate": 9.79344913501335e-06, + "loss": 0.3059, + "step": 6581 + }, + { + "epoch": 0.5214497920380273, + "grad_norm": 1.3503790526494002, + "learning_rate": 9.790883657541133e-06, + "loss": 0.2988, + "step": 6582 + }, + { + "epoch": 0.5215290156466627, + "grad_norm": 1.7853621542948634, + "learning_rate": 9.788318193838218e-06, + "loss": 0.3092, + "step": 6583 + }, + { + "epoch": 0.5216082392552981, + "grad_norm": 1.1868051561832857, + "learning_rate": 9.785752744073534e-06, + "loss": 0.1813, + "step": 6584 + }, + { + "epoch": 0.5216874628639334, + "grad_norm": 1.7817175877858948, + "learning_rate": 9.783187308416e-06, + "loss": 0.4138, + "step": 6585 + }, + { + "epoch": 0.5217666864725689, + "grad_norm": 1.2703527499319964, + "learning_rate": 9.780621887034537e-06, + "loss": 0.2491, + "step": 6586 + }, + { + "epoch": 0.5218459100812042, + "grad_norm": 1.3236493636259552, + "learning_rate": 9.778056480098068e-06, + "loss": 0.2776, + "step": 6587 + }, + { + "epoch": 0.5219251336898396, + "grad_norm": 1.3979214929633739, + "learning_rate": 9.775491087775514e-06, + "loss": 0.3045, + "step": 6588 + }, + { + "epoch": 0.5220043572984749, + "grad_norm": 1.405423445149963, + "learning_rate": 9.772925710235789e-06, + "loss": 0.306, + "step": 6589 + }, + { + "epoch": 0.5220835809071103, + "grad_norm": 1.681826081372409, + "learning_rate": 9.770360347647817e-06, + "loss": 0.3646, + "step": 6590 + }, + { + "epoch": 0.5221628045157457, + "grad_norm": 1.098210308335219, + "learning_rate": 9.767795000180507e-06, + "loss": 0.1607, + "step": 6591 + }, + { + "epoch": 0.522242028124381, + "grad_norm": 1.4567167802857144, + "learning_rate": 9.76522966800278e-06, + "loss": 0.2895, + "step": 6592 + }, + { + "epoch": 0.5223212517330165, + "grad_norm": 1.7222077801597855, + "learning_rate": 9.76266435128355e-06, + "loss": 0.4386, + "step": 6593 + }, + { + "epoch": 0.5224004753416518, + "grad_norm": 1.4542394656245397, + "learning_rate": 9.76009905019173e-06, + "loss": 0.3062, + "step": 6594 + }, + { + "epoch": 0.5224796989502872, + "grad_norm": 1.3319879514273967, + "learning_rate": 9.757533764896235e-06, + "loss": 0.234, + "step": 6595 + }, + { + "epoch": 0.5225589225589226, + "grad_norm": 1.102652204594049, + "learning_rate": 9.754968495565973e-06, + "loss": 0.2139, + "step": 6596 + }, + { + "epoch": 0.5226381461675579, + "grad_norm": 1.5008788814630107, + "learning_rate": 9.752403242369857e-06, + "loss": 0.2938, + "step": 6597 + }, + { + "epoch": 0.5227173697761933, + "grad_norm": 1.3042299229602294, + "learning_rate": 9.749838005476798e-06, + "loss": 0.2318, + "step": 6598 + }, + { + "epoch": 0.5227965933848286, + "grad_norm": 1.4384754707650653, + "learning_rate": 9.7472727850557e-06, + "loss": 0.2833, + "step": 6599 + }, + { + "epoch": 0.5228758169934641, + "grad_norm": 1.5659159084691918, + "learning_rate": 9.744707581275473e-06, + "loss": 0.3028, + "step": 6600 + }, + { + "epoch": 0.5229550406020994, + "grad_norm": 1.499965713046398, + "learning_rate": 9.742142394305026e-06, + "loss": 0.3074, + "step": 6601 + }, + { + "epoch": 0.5230342642107348, + "grad_norm": 1.5575060580220301, + "learning_rate": 9.739577224313258e-06, + "loss": 0.2783, + "step": 6602 + }, + { + "epoch": 0.5231134878193702, + "grad_norm": 1.1197961427045093, + "learning_rate": 9.737012071469082e-06, + "loss": 0.2314, + "step": 6603 + }, + { + "epoch": 0.5231927114280055, + "grad_norm": 1.2926415876245017, + "learning_rate": 9.734446935941392e-06, + "loss": 0.2241, + "step": 6604 + }, + { + "epoch": 0.5232719350366409, + "grad_norm": 1.2255991018938845, + "learning_rate": 9.731881817899092e-06, + "loss": 0.2239, + "step": 6605 + }, + { + "epoch": 0.5233511586452763, + "grad_norm": 1.3570637346160763, + "learning_rate": 9.729316717511088e-06, + "loss": 0.2831, + "step": 6606 + }, + { + "epoch": 0.5234303822539117, + "grad_norm": 1.6452875735783177, + "learning_rate": 9.726751634946272e-06, + "loss": 0.2834, + "step": 6607 + }, + { + "epoch": 0.523509605862547, + "grad_norm": 1.4815357622229162, + "learning_rate": 9.724186570373548e-06, + "loss": 0.3417, + "step": 6608 + }, + { + "epoch": 0.5235888294711825, + "grad_norm": 1.1741083309154061, + "learning_rate": 9.721621523961812e-06, + "loss": 0.2388, + "step": 6609 + }, + { + "epoch": 0.5236680530798178, + "grad_norm": 1.364266079115737, + "learning_rate": 9.719056495879958e-06, + "loss": 0.2487, + "step": 6610 + }, + { + "epoch": 0.5237472766884531, + "grad_norm": 1.4003965569316312, + "learning_rate": 9.716491486296883e-06, + "loss": 0.3278, + "step": 6611 + }, + { + "epoch": 0.5238265002970885, + "grad_norm": 1.2783877658279088, + "learning_rate": 9.71392649538148e-06, + "loss": 0.2696, + "step": 6612 + }, + { + "epoch": 0.5239057239057239, + "grad_norm": 1.4169665245215126, + "learning_rate": 9.711361523302638e-06, + "loss": 0.2434, + "step": 6613 + }, + { + "epoch": 0.5239849475143593, + "grad_norm": 1.300044334645034, + "learning_rate": 9.708796570229253e-06, + "loss": 0.2377, + "step": 6614 + }, + { + "epoch": 0.5240641711229946, + "grad_norm": 1.0962981986510534, + "learning_rate": 9.706231636330212e-06, + "loss": 0.2179, + "step": 6615 + }, + { + "epoch": 0.5241433947316301, + "grad_norm": 1.4706952055186984, + "learning_rate": 9.703666721774403e-06, + "loss": 0.2953, + "step": 6616 + }, + { + "epoch": 0.5242226183402654, + "grad_norm": 1.2421938929607341, + "learning_rate": 9.701101826730718e-06, + "loss": 0.2397, + "step": 6617 + }, + { + "epoch": 0.5243018419489007, + "grad_norm": 1.3832978359905337, + "learning_rate": 9.698536951368035e-06, + "loss": 0.283, + "step": 6618 + }, + { + "epoch": 0.5243810655575362, + "grad_norm": 1.1344165727794133, + "learning_rate": 9.695972095855248e-06, + "loss": 0.2076, + "step": 6619 + }, + { + "epoch": 0.5244602891661715, + "grad_norm": 1.5036956250666393, + "learning_rate": 9.693407260361231e-06, + "loss": 0.2379, + "step": 6620 + }, + { + "epoch": 0.5245395127748069, + "grad_norm": 1.55834823894501, + "learning_rate": 9.690842445054873e-06, + "loss": 0.3146, + "step": 6621 + }, + { + "epoch": 0.5246187363834423, + "grad_norm": 1.4085930128086739, + "learning_rate": 9.688277650105053e-06, + "loss": 0.3089, + "step": 6622 + }, + { + "epoch": 0.5246979599920777, + "grad_norm": 1.3070274339398167, + "learning_rate": 9.685712875680649e-06, + "loss": 0.2517, + "step": 6623 + }, + { + "epoch": 0.524777183600713, + "grad_norm": 1.4591815620715252, + "learning_rate": 9.683148121950539e-06, + "loss": 0.2746, + "step": 6624 + }, + { + "epoch": 0.5248564072093483, + "grad_norm": 1.0482277801340272, + "learning_rate": 9.680583389083602e-06, + "loss": 0.2525, + "step": 6625 + }, + { + "epoch": 0.5249356308179838, + "grad_norm": 1.4777370681230158, + "learning_rate": 9.67801867724871e-06, + "loss": 0.3394, + "step": 6626 + }, + { + "epoch": 0.5250148544266191, + "grad_norm": 1.276657979632384, + "learning_rate": 9.675453986614743e-06, + "loss": 0.2875, + "step": 6627 + }, + { + "epoch": 0.5250940780352545, + "grad_norm": 1.36954365882781, + "learning_rate": 9.672889317350565e-06, + "loss": 0.2801, + "step": 6628 + }, + { + "epoch": 0.5251733016438899, + "grad_norm": 1.2029572516345086, + "learning_rate": 9.670324669625053e-06, + "loss": 0.2057, + "step": 6629 + }, + { + "epoch": 0.5252525252525253, + "grad_norm": 1.3923040479544802, + "learning_rate": 9.667760043607077e-06, + "loss": 0.2774, + "step": 6630 + }, + { + "epoch": 0.5253317488611606, + "grad_norm": 1.1785338348851206, + "learning_rate": 9.6651954394655e-06, + "loss": 0.2519, + "step": 6631 + }, + { + "epoch": 0.525410972469796, + "grad_norm": 1.6707379241602642, + "learning_rate": 9.662630857369194e-06, + "loss": 0.3525, + "step": 6632 + }, + { + "epoch": 0.5254901960784314, + "grad_norm": 1.296803773395515, + "learning_rate": 9.660066297487024e-06, + "loss": 0.2062, + "step": 6633 + }, + { + "epoch": 0.5255694196870667, + "grad_norm": 1.1430938427299344, + "learning_rate": 9.65750175998785e-06, + "loss": 0.2355, + "step": 6634 + }, + { + "epoch": 0.5256486432957022, + "grad_norm": 1.497283095255509, + "learning_rate": 9.65493724504054e-06, + "loss": 0.2755, + "step": 6635 + }, + { + "epoch": 0.5257278669043375, + "grad_norm": 1.4440990083113452, + "learning_rate": 9.65237275281395e-06, + "loss": 0.2768, + "step": 6636 + }, + { + "epoch": 0.5258070905129729, + "grad_norm": 1.4136335895172514, + "learning_rate": 9.64980828347694e-06, + "loss": 0.299, + "step": 6637 + }, + { + "epoch": 0.5258863141216082, + "grad_norm": 1.3887299551900778, + "learning_rate": 9.647243837198375e-06, + "loss": 0.3214, + "step": 6638 + }, + { + "epoch": 0.5259655377302436, + "grad_norm": 1.4807982323286617, + "learning_rate": 9.644679414147102e-06, + "loss": 0.2779, + "step": 6639 + }, + { + "epoch": 0.526044761338879, + "grad_norm": 1.5786787197774497, + "learning_rate": 9.64211501449198e-06, + "loss": 0.2313, + "step": 6640 + }, + { + "epoch": 0.5261239849475143, + "grad_norm": 1.6330978321519143, + "learning_rate": 9.639550638401863e-06, + "loss": 0.3444, + "step": 6641 + }, + { + "epoch": 0.5262032085561498, + "grad_norm": 1.5450908475369736, + "learning_rate": 9.6369862860456e-06, + "loss": 0.3393, + "step": 6642 + }, + { + "epoch": 0.5262824321647851, + "grad_norm": 1.2307514643855018, + "learning_rate": 9.634421957592048e-06, + "loss": 0.2625, + "step": 6643 + }, + { + "epoch": 0.5263616557734205, + "grad_norm": 1.6330040943187798, + "learning_rate": 9.631857653210048e-06, + "loss": 0.2997, + "step": 6644 + }, + { + "epoch": 0.5264408793820559, + "grad_norm": 1.377117676519559, + "learning_rate": 9.629293373068449e-06, + "loss": 0.3138, + "step": 6645 + }, + { + "epoch": 0.5265201029906912, + "grad_norm": 1.3755934024613092, + "learning_rate": 9.626729117336101e-06, + "loss": 0.3118, + "step": 6646 + }, + { + "epoch": 0.5265993265993266, + "grad_norm": 1.1537747893257073, + "learning_rate": 9.624164886181841e-06, + "loss": 0.2376, + "step": 6647 + }, + { + "epoch": 0.526678550207962, + "grad_norm": 1.1797568424380762, + "learning_rate": 9.621600679774516e-06, + "loss": 0.2726, + "step": 6648 + }, + { + "epoch": 0.5267577738165974, + "grad_norm": 1.6083939902514486, + "learning_rate": 9.619036498282968e-06, + "loss": 0.3965, + "step": 6649 + }, + { + "epoch": 0.5268369974252327, + "grad_norm": 1.195518192767305, + "learning_rate": 9.61647234187603e-06, + "loss": 0.2494, + "step": 6650 + }, + { + "epoch": 0.526916221033868, + "grad_norm": 1.4300413343079406, + "learning_rate": 9.613908210722546e-06, + "loss": 0.2458, + "step": 6651 + }, + { + "epoch": 0.5269954446425035, + "grad_norm": 1.3963632808219681, + "learning_rate": 9.611344104991346e-06, + "loss": 0.2901, + "step": 6652 + }, + { + "epoch": 0.5270746682511388, + "grad_norm": 1.2955989774072305, + "learning_rate": 9.608780024851266e-06, + "loss": 0.254, + "step": 6653 + }, + { + "epoch": 0.5271538918597742, + "grad_norm": 1.1469459577731393, + "learning_rate": 9.606215970471142e-06, + "loss": 0.1615, + "step": 6654 + }, + { + "epoch": 0.5272331154684096, + "grad_norm": 1.3524400286821538, + "learning_rate": 9.6036519420198e-06, + "loss": 0.2531, + "step": 6655 + }, + { + "epoch": 0.527312339077045, + "grad_norm": 1.288490618518072, + "learning_rate": 9.601087939666071e-06, + "loss": 0.2012, + "step": 6656 + }, + { + "epoch": 0.5273915626856803, + "grad_norm": 1.4052405040406672, + "learning_rate": 9.598523963578785e-06, + "loss": 0.326, + "step": 6657 + }, + { + "epoch": 0.5274707862943157, + "grad_norm": 1.3148652952852977, + "learning_rate": 9.595960013926761e-06, + "loss": 0.2566, + "step": 6658 + }, + { + "epoch": 0.5275500099029511, + "grad_norm": 1.3082786870586396, + "learning_rate": 9.593396090878823e-06, + "loss": 0.2181, + "step": 6659 + }, + { + "epoch": 0.5276292335115864, + "grad_norm": 1.291568255789929, + "learning_rate": 9.590832194603801e-06, + "loss": 0.2321, + "step": 6660 + }, + { + "epoch": 0.5277084571202219, + "grad_norm": 2.031491509892924, + "learning_rate": 9.588268325270506e-06, + "loss": 0.4026, + "step": 6661 + }, + { + "epoch": 0.5277876807288572, + "grad_norm": 1.2305936223155234, + "learning_rate": 9.585704483047761e-06, + "loss": 0.1739, + "step": 6662 + }, + { + "epoch": 0.5278669043374926, + "grad_norm": 1.2636170185223636, + "learning_rate": 9.583140668104387e-06, + "loss": 0.2811, + "step": 6663 + }, + { + "epoch": 0.5279461279461279, + "grad_norm": 1.521316684136875, + "learning_rate": 9.58057688060919e-06, + "loss": 0.3333, + "step": 6664 + }, + { + "epoch": 0.5280253515547633, + "grad_norm": 1.541617709181252, + "learning_rate": 9.578013120730987e-06, + "loss": 0.3052, + "step": 6665 + }, + { + "epoch": 0.5281045751633987, + "grad_norm": 1.1608552202052271, + "learning_rate": 9.575449388638592e-06, + "loss": 0.2429, + "step": 6666 + }, + { + "epoch": 0.528183798772034, + "grad_norm": 1.8317600087279482, + "learning_rate": 9.57288568450081e-06, + "loss": 0.3713, + "step": 6667 + }, + { + "epoch": 0.5282630223806695, + "grad_norm": 1.6563670838843565, + "learning_rate": 9.570322008486453e-06, + "loss": 0.3075, + "step": 6668 + }, + { + "epoch": 0.5283422459893048, + "grad_norm": 1.4928566141724477, + "learning_rate": 9.567758360764321e-06, + "loss": 0.2808, + "step": 6669 + }, + { + "epoch": 0.5284214695979402, + "grad_norm": 1.5001467356875635, + "learning_rate": 9.565194741503221e-06, + "loss": 0.3164, + "step": 6670 + }, + { + "epoch": 0.5285006932065756, + "grad_norm": 1.3537649281897015, + "learning_rate": 9.562631150871959e-06, + "loss": 0.3156, + "step": 6671 + }, + { + "epoch": 0.5285799168152109, + "grad_norm": 1.2675423099131446, + "learning_rate": 9.560067589039327e-06, + "loss": 0.2407, + "step": 6672 + }, + { + "epoch": 0.5286591404238463, + "grad_norm": 1.2573851830312253, + "learning_rate": 9.55750405617413e-06, + "loss": 0.2175, + "step": 6673 + }, + { + "epoch": 0.5287383640324816, + "grad_norm": 1.5588520506002632, + "learning_rate": 9.554940552445161e-06, + "loss": 0.2615, + "step": 6674 + }, + { + "epoch": 0.5288175876411171, + "grad_norm": 1.4332324400791656, + "learning_rate": 9.552377078021215e-06, + "loss": 0.3435, + "step": 6675 + }, + { + "epoch": 0.5288968112497524, + "grad_norm": 1.1730884861729056, + "learning_rate": 9.549813633071085e-06, + "loss": 0.2783, + "step": 6676 + }, + { + "epoch": 0.5289760348583878, + "grad_norm": 1.4439941691093812, + "learning_rate": 9.54725021776356e-06, + "loss": 0.3119, + "step": 6677 + }, + { + "epoch": 0.5290552584670232, + "grad_norm": 1.7370101088080452, + "learning_rate": 9.54468683226743e-06, + "loss": 0.3228, + "step": 6678 + }, + { + "epoch": 0.5291344820756585, + "grad_norm": 1.117600230125303, + "learning_rate": 9.542123476751484e-06, + "loss": 0.1909, + "step": 6679 + }, + { + "epoch": 0.5292137056842939, + "grad_norm": 1.1515903249132564, + "learning_rate": 9.5395601513845e-06, + "loss": 0.2222, + "step": 6680 + }, + { + "epoch": 0.5292929292929293, + "grad_norm": 1.8867139243750468, + "learning_rate": 9.536996856335269e-06, + "loss": 0.3558, + "step": 6681 + }, + { + "epoch": 0.5293721529015647, + "grad_norm": 1.6752764695045361, + "learning_rate": 9.534433591772562e-06, + "loss": 0.3092, + "step": 6682 + }, + { + "epoch": 0.5294513765102, + "grad_norm": 1.6033925630978305, + "learning_rate": 9.531870357865165e-06, + "loss": 0.2985, + "step": 6683 + }, + { + "epoch": 0.5295306001188355, + "grad_norm": 1.2077827297526118, + "learning_rate": 9.529307154781855e-06, + "loss": 0.2444, + "step": 6684 + }, + { + "epoch": 0.5296098237274708, + "grad_norm": 1.194335800866445, + "learning_rate": 9.5267439826914e-06, + "loss": 0.2148, + "step": 6685 + }, + { + "epoch": 0.5296890473361061, + "grad_norm": 1.4376889664760628, + "learning_rate": 9.524180841762577e-06, + "loss": 0.289, + "step": 6686 + }, + { + "epoch": 0.5297682709447415, + "grad_norm": 1.8093120720204678, + "learning_rate": 9.52161773216416e-06, + "loss": 0.3367, + "step": 6687 + }, + { + "epoch": 0.5298474945533769, + "grad_norm": 1.333271514764325, + "learning_rate": 9.519054654064909e-06, + "loss": 0.2511, + "step": 6688 + }, + { + "epoch": 0.5299267181620123, + "grad_norm": 1.3692322595912834, + "learning_rate": 9.5164916076336e-06, + "loss": 0.2392, + "step": 6689 + }, + { + "epoch": 0.5300059417706476, + "grad_norm": 1.43380209821574, + "learning_rate": 9.513928593038987e-06, + "loss": 0.3539, + "step": 6690 + }, + { + "epoch": 0.5300851653792831, + "grad_norm": 1.5233257978255093, + "learning_rate": 9.51136561044984e-06, + "loss": 0.2776, + "step": 6691 + }, + { + "epoch": 0.5301643889879184, + "grad_norm": 1.4027923408473233, + "learning_rate": 9.508802660034915e-06, + "loss": 0.2725, + "step": 6692 + }, + { + "epoch": 0.5302436125965537, + "grad_norm": 1.7700973357251542, + "learning_rate": 9.506239741962971e-06, + "loss": 0.4102, + "step": 6693 + }, + { + "epoch": 0.5303228362051892, + "grad_norm": 1.7352410949755996, + "learning_rate": 9.503676856402764e-06, + "loss": 0.3326, + "step": 6694 + }, + { + "epoch": 0.5304020598138245, + "grad_norm": 1.145217628127054, + "learning_rate": 9.50111400352305e-06, + "loss": 0.2194, + "step": 6695 + }, + { + "epoch": 0.5304812834224599, + "grad_norm": 1.4212117018478085, + "learning_rate": 9.498551183492578e-06, + "loss": 0.2609, + "step": 6696 + }, + { + "epoch": 0.5305605070310953, + "grad_norm": 1.3006076346746003, + "learning_rate": 9.495988396480097e-06, + "loss": 0.2996, + "step": 6697 + }, + { + "epoch": 0.5306397306397307, + "grad_norm": 1.5917593329743487, + "learning_rate": 9.493425642654356e-06, + "loss": 0.3809, + "step": 6698 + }, + { + "epoch": 0.530718954248366, + "grad_norm": 1.653836267235538, + "learning_rate": 9.490862922184096e-06, + "loss": 0.3099, + "step": 6699 + }, + { + "epoch": 0.5307981778570013, + "grad_norm": 1.6133510340217554, + "learning_rate": 9.488300235238067e-06, + "loss": 0.3062, + "step": 6700 + }, + { + "epoch": 0.5308774014656368, + "grad_norm": 1.302576952429008, + "learning_rate": 9.485737581985002e-06, + "loss": 0.2653, + "step": 6701 + }, + { + "epoch": 0.5309566250742721, + "grad_norm": 1.0055575979666276, + "learning_rate": 9.483174962593644e-06, + "loss": 0.1029, + "step": 6702 + }, + { + "epoch": 0.5310358486829075, + "grad_norm": 1.1682445847740925, + "learning_rate": 9.480612377232728e-06, + "loss": 0.227, + "step": 6703 + }, + { + "epoch": 0.5311150722915429, + "grad_norm": 1.1652027436505905, + "learning_rate": 9.478049826070988e-06, + "loss": 0.2405, + "step": 6704 + }, + { + "epoch": 0.5311942959001783, + "grad_norm": 1.4147847379924263, + "learning_rate": 9.475487309277156e-06, + "loss": 0.2778, + "step": 6705 + }, + { + "epoch": 0.5312735195088136, + "grad_norm": 1.1377388074760229, + "learning_rate": 9.472924827019959e-06, + "loss": 0.2499, + "step": 6706 + }, + { + "epoch": 0.531352743117449, + "grad_norm": 1.1330378433656962, + "learning_rate": 9.470362379468125e-06, + "loss": 0.238, + "step": 6707 + }, + { + "epoch": 0.5314319667260844, + "grad_norm": 1.5014673745498222, + "learning_rate": 9.467799966790384e-06, + "loss": 0.2448, + "step": 6708 + }, + { + "epoch": 0.5315111903347197, + "grad_norm": 1.3099047654609604, + "learning_rate": 9.465237589155452e-06, + "loss": 0.2557, + "step": 6709 + }, + { + "epoch": 0.5315904139433552, + "grad_norm": 1.3190927619450001, + "learning_rate": 9.462675246732051e-06, + "loss": 0.2355, + "step": 6710 + }, + { + "epoch": 0.5316696375519905, + "grad_norm": 1.379249246333291, + "learning_rate": 9.460112939688901e-06, + "loss": 0.3392, + "step": 6711 + }, + { + "epoch": 0.5317488611606259, + "grad_norm": 1.345709745976777, + "learning_rate": 9.457550668194714e-06, + "loss": 0.2665, + "step": 6712 + }, + { + "epoch": 0.5318280847692612, + "grad_norm": 1.4863562892142994, + "learning_rate": 9.45498843241821e-06, + "loss": 0.3, + "step": 6713 + }, + { + "epoch": 0.5319073083778966, + "grad_norm": 1.3036253857552202, + "learning_rate": 9.452426232528092e-06, + "loss": 0.2613, + "step": 6714 + }, + { + "epoch": 0.531986531986532, + "grad_norm": 1.2974735397661186, + "learning_rate": 9.449864068693072e-06, + "loss": 0.2025, + "step": 6715 + }, + { + "epoch": 0.5320657555951673, + "grad_norm": 1.4594821946395855, + "learning_rate": 9.447301941081856e-06, + "loss": 0.3271, + "step": 6716 + }, + { + "epoch": 0.5321449792038028, + "grad_norm": 1.4438487748657014, + "learning_rate": 9.444739849863146e-06, + "loss": 0.3451, + "step": 6717 + }, + { + "epoch": 0.5322242028124381, + "grad_norm": 1.1641043139213456, + "learning_rate": 9.442177795205647e-06, + "loss": 0.2383, + "step": 6718 + }, + { + "epoch": 0.5323034264210735, + "grad_norm": 1.3098938833785447, + "learning_rate": 9.439615777278059e-06, + "loss": 0.2169, + "step": 6719 + }, + { + "epoch": 0.5323826500297089, + "grad_norm": 1.1825511914267253, + "learning_rate": 9.437053796249071e-06, + "loss": 0.2354, + "step": 6720 + }, + { + "epoch": 0.5324618736383442, + "grad_norm": 1.28364833820355, + "learning_rate": 9.434491852287385e-06, + "loss": 0.2251, + "step": 6721 + }, + { + "epoch": 0.5325410972469796, + "grad_norm": 1.4253957329378735, + "learning_rate": 9.431929945561688e-06, + "loss": 0.2498, + "step": 6722 + }, + { + "epoch": 0.532620320855615, + "grad_norm": 1.426455864989217, + "learning_rate": 9.429368076240669e-06, + "loss": 0.2302, + "step": 6723 + }, + { + "epoch": 0.5326995444642504, + "grad_norm": 1.679541425187411, + "learning_rate": 9.42680624449302e-06, + "loss": 0.3232, + "step": 6724 + }, + { + "epoch": 0.5327787680728857, + "grad_norm": 1.7291603177588906, + "learning_rate": 9.42424445048742e-06, + "loss": 0.3309, + "step": 6725 + }, + { + "epoch": 0.5328579916815211, + "grad_norm": 1.4696512636751686, + "learning_rate": 9.42168269439255e-06, + "loss": 0.2258, + "step": 6726 + }, + { + "epoch": 0.5329372152901565, + "grad_norm": 1.259514963648564, + "learning_rate": 9.419120976377098e-06, + "loss": 0.237, + "step": 6727 + }, + { + "epoch": 0.5330164388987918, + "grad_norm": 1.4603238518593502, + "learning_rate": 9.41655929660973e-06, + "loss": 0.3005, + "step": 6728 + }, + { + "epoch": 0.5330956625074272, + "grad_norm": 1.2798535373106266, + "learning_rate": 9.413997655259126e-06, + "loss": 0.277, + "step": 6729 + }, + { + "epoch": 0.5331748861160626, + "grad_norm": 1.390691093683762, + "learning_rate": 9.411436052493957e-06, + "loss": 0.283, + "step": 6730 + }, + { + "epoch": 0.533254109724698, + "grad_norm": 1.4693044352301712, + "learning_rate": 9.40887448848289e-06, + "loss": 0.2664, + "step": 6731 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.2935168541655746, + "learning_rate": 9.406312963394598e-06, + "loss": 0.2298, + "step": 6732 + }, + { + "epoch": 0.5334125569419687, + "grad_norm": 1.568669002572422, + "learning_rate": 9.403751477397738e-06, + "loss": 0.387, + "step": 6733 + }, + { + "epoch": 0.5334917805506041, + "grad_norm": 1.4895470401983604, + "learning_rate": 9.401190030660975e-06, + "loss": 0.317, + "step": 6734 + }, + { + "epoch": 0.5335710041592394, + "grad_norm": 1.2190220465897612, + "learning_rate": 9.398628623352969e-06, + "loss": 0.1942, + "step": 6735 + }, + { + "epoch": 0.5336502277678749, + "grad_norm": 1.1690231128980542, + "learning_rate": 9.396067255642373e-06, + "loss": 0.2146, + "step": 6736 + }, + { + "epoch": 0.5337294513765102, + "grad_norm": 1.3757233061086984, + "learning_rate": 9.39350592769784e-06, + "loss": 0.306, + "step": 6737 + }, + { + "epoch": 0.5338086749851456, + "grad_norm": 0.9724917279421184, + "learning_rate": 9.390944639688027e-06, + "loss": 0.2174, + "step": 6738 + }, + { + "epoch": 0.5338878985937809, + "grad_norm": 1.272320542768353, + "learning_rate": 9.388383391781576e-06, + "loss": 0.2596, + "step": 6739 + }, + { + "epoch": 0.5339671222024163, + "grad_norm": 1.411424576761223, + "learning_rate": 9.385822184147136e-06, + "loss": 0.2908, + "step": 6740 + }, + { + "epoch": 0.5340463458110517, + "grad_norm": 1.6186305961905039, + "learning_rate": 9.383261016953351e-06, + "loss": 0.2893, + "step": 6741 + }, + { + "epoch": 0.534125569419687, + "grad_norm": 1.5198495930305322, + "learning_rate": 9.38069989036886e-06, + "loss": 0.3601, + "step": 6742 + }, + { + "epoch": 0.5342047930283225, + "grad_norm": 1.1213343399907656, + "learning_rate": 9.3781388045623e-06, + "loss": 0.1846, + "step": 6743 + }, + { + "epoch": 0.5342840166369578, + "grad_norm": 1.5926127864248008, + "learning_rate": 9.37557775970231e-06, + "loss": 0.3638, + "step": 6744 + }, + { + "epoch": 0.5343632402455932, + "grad_norm": 1.582921130618739, + "learning_rate": 9.373016755957519e-06, + "loss": 0.3654, + "step": 6745 + }, + { + "epoch": 0.5344424638542286, + "grad_norm": 1.7729412170555228, + "learning_rate": 9.370455793496558e-06, + "loss": 0.348, + "step": 6746 + }, + { + "epoch": 0.5345216874628639, + "grad_norm": 1.5847045242356583, + "learning_rate": 9.367894872488053e-06, + "loss": 0.3971, + "step": 6747 + }, + { + "epoch": 0.5346009110714993, + "grad_norm": 1.3307159193732085, + "learning_rate": 9.365333993100628e-06, + "loss": 0.2686, + "step": 6748 + }, + { + "epoch": 0.5346801346801346, + "grad_norm": 1.5361109802613226, + "learning_rate": 9.362773155502909e-06, + "loss": 0.3109, + "step": 6749 + }, + { + "epoch": 0.5347593582887701, + "grad_norm": 1.8976565026585643, + "learning_rate": 9.360212359863508e-06, + "loss": 0.3811, + "step": 6750 + }, + { + "epoch": 0.5348385818974054, + "grad_norm": 1.0991247901775503, + "learning_rate": 9.357651606351047e-06, + "loss": 0.1772, + "step": 6751 + }, + { + "epoch": 0.5349178055060408, + "grad_norm": 1.233848250402271, + "learning_rate": 9.355090895134138e-06, + "loss": 0.2636, + "step": 6752 + }, + { + "epoch": 0.5349970291146762, + "grad_norm": 1.4341967560828675, + "learning_rate": 9.352530226381388e-06, + "loss": 0.3195, + "step": 6753 + }, + { + "epoch": 0.5350762527233115, + "grad_norm": 1.2225326727225299, + "learning_rate": 9.349969600261408e-06, + "loss": 0.2383, + "step": 6754 + }, + { + "epoch": 0.5351554763319469, + "grad_norm": 1.324038486267805, + "learning_rate": 9.347409016942803e-06, + "loss": 0.3049, + "step": 6755 + }, + { + "epoch": 0.5352346999405823, + "grad_norm": 1.7870197223119775, + "learning_rate": 9.344848476594172e-06, + "loss": 0.3703, + "step": 6756 + }, + { + "epoch": 0.5353139235492177, + "grad_norm": 1.253002325033724, + "learning_rate": 9.342287979384118e-06, + "loss": 0.2114, + "step": 6757 + }, + { + "epoch": 0.535393147157853, + "grad_norm": 1.340280887441604, + "learning_rate": 9.339727525481234e-06, + "loss": 0.319, + "step": 6758 + }, + { + "epoch": 0.5354723707664885, + "grad_norm": 1.5554091223712008, + "learning_rate": 9.33716711505412e-06, + "loss": 0.2711, + "step": 6759 + }, + { + "epoch": 0.5355515943751238, + "grad_norm": 1.4827656168956844, + "learning_rate": 9.334606748271357e-06, + "loss": 0.248, + "step": 6760 + }, + { + "epoch": 0.5356308179837591, + "grad_norm": 1.6205721083917508, + "learning_rate": 9.33204642530154e-06, + "loss": 0.2556, + "step": 6761 + }, + { + "epoch": 0.5357100415923945, + "grad_norm": 1.186600864266174, + "learning_rate": 9.329486146313254e-06, + "loss": 0.2268, + "step": 6762 + }, + { + "epoch": 0.5357892652010299, + "grad_norm": 1.5137481023407515, + "learning_rate": 9.326925911475075e-06, + "loss": 0.2565, + "step": 6763 + }, + { + "epoch": 0.5358684888096653, + "grad_norm": 1.5513750237024808, + "learning_rate": 9.324365720955589e-06, + "loss": 0.3307, + "step": 6764 + }, + { + "epoch": 0.5359477124183006, + "grad_norm": 1.7423238094637552, + "learning_rate": 9.321805574923369e-06, + "loss": 0.2504, + "step": 6765 + }, + { + "epoch": 0.5360269360269361, + "grad_norm": 1.1898912719658348, + "learning_rate": 9.319245473546987e-06, + "loss": 0.1922, + "step": 6766 + }, + { + "epoch": 0.5361061596355714, + "grad_norm": 1.2772269424767866, + "learning_rate": 9.316685416995017e-06, + "loss": 0.2592, + "step": 6767 + }, + { + "epoch": 0.5361853832442067, + "grad_norm": 1.2448787430388477, + "learning_rate": 9.314125405436023e-06, + "loss": 0.2139, + "step": 6768 + }, + { + "epoch": 0.5362646068528422, + "grad_norm": 1.3890908283455774, + "learning_rate": 9.311565439038571e-06, + "loss": 0.2366, + "step": 6769 + }, + { + "epoch": 0.5363438304614775, + "grad_norm": 1.317728037985118, + "learning_rate": 9.309005517971222e-06, + "loss": 0.3225, + "step": 6770 + }, + { + "epoch": 0.5364230540701129, + "grad_norm": 1.6431510979972728, + "learning_rate": 9.306445642402534e-06, + "loss": 0.2214, + "step": 6771 + }, + { + "epoch": 0.5365022776787483, + "grad_norm": 1.5371766928566697, + "learning_rate": 9.303885812501064e-06, + "loss": 0.2985, + "step": 6772 + }, + { + "epoch": 0.5365815012873837, + "grad_norm": 1.1854091490886594, + "learning_rate": 9.301326028435367e-06, + "loss": 0.2215, + "step": 6773 + }, + { + "epoch": 0.536660724896019, + "grad_norm": 1.6841690095207198, + "learning_rate": 9.298766290373986e-06, + "loss": 0.2578, + "step": 6774 + }, + { + "epoch": 0.5367399485046543, + "grad_norm": 1.3203426464633086, + "learning_rate": 9.296206598485471e-06, + "loss": 0.304, + "step": 6775 + }, + { + "epoch": 0.5368191721132898, + "grad_norm": 1.5355814729158108, + "learning_rate": 9.293646952938365e-06, + "loss": 0.3116, + "step": 6776 + }, + { + "epoch": 0.5368983957219251, + "grad_norm": 1.2267660463598158, + "learning_rate": 9.291087353901208e-06, + "loss": 0.2722, + "step": 6777 + }, + { + "epoch": 0.5369776193305605, + "grad_norm": 1.237091427193348, + "learning_rate": 9.28852780154254e-06, + "loss": 0.2164, + "step": 6778 + }, + { + "epoch": 0.5370568429391959, + "grad_norm": 1.363068516064216, + "learning_rate": 9.285968296030891e-06, + "loss": 0.2532, + "step": 6779 + }, + { + "epoch": 0.5371360665478313, + "grad_norm": 1.5448556293682676, + "learning_rate": 9.283408837534793e-06, + "loss": 0.2658, + "step": 6780 + }, + { + "epoch": 0.5372152901564666, + "grad_norm": 1.273167080040899, + "learning_rate": 9.280849426222778e-06, + "loss": 0.2862, + "step": 6781 + }, + { + "epoch": 0.537294513765102, + "grad_norm": 1.251335087647091, + "learning_rate": 9.278290062263364e-06, + "loss": 0.2154, + "step": 6782 + }, + { + "epoch": 0.5373737373737374, + "grad_norm": 1.2054285202032164, + "learning_rate": 9.27573074582508e-06, + "loss": 0.2095, + "step": 6783 + }, + { + "epoch": 0.5374529609823727, + "grad_norm": 1.1427525509151666, + "learning_rate": 9.27317147707644e-06, + "loss": 0.1762, + "step": 6784 + }, + { + "epoch": 0.5375321845910082, + "grad_norm": 1.2848903398800302, + "learning_rate": 9.270612256185962e-06, + "loss": 0.2461, + "step": 6785 + }, + { + "epoch": 0.5376114081996435, + "grad_norm": 1.5186389469273869, + "learning_rate": 9.268053083322157e-06, + "loss": 0.3463, + "step": 6786 + }, + { + "epoch": 0.5376906318082789, + "grad_norm": 1.2931255254029264, + "learning_rate": 9.265493958653533e-06, + "loss": 0.2371, + "step": 6787 + }, + { + "epoch": 0.5377698554169142, + "grad_norm": 1.518304574791514, + "learning_rate": 9.262934882348599e-06, + "loss": 0.2528, + "step": 6788 + }, + { + "epoch": 0.5378490790255496, + "grad_norm": 1.2644482639293273, + "learning_rate": 9.260375854575857e-06, + "loss": 0.2101, + "step": 6789 + }, + { + "epoch": 0.537928302634185, + "grad_norm": 1.1753587189304724, + "learning_rate": 9.257816875503805e-06, + "loss": 0.2433, + "step": 6790 + }, + { + "epoch": 0.5380075262428203, + "grad_norm": 1.4517862240575137, + "learning_rate": 9.255257945300941e-06, + "loss": 0.2491, + "step": 6791 + }, + { + "epoch": 0.5380867498514558, + "grad_norm": 1.2343677282261236, + "learning_rate": 9.252699064135759e-06, + "loss": 0.1576, + "step": 6792 + }, + { + "epoch": 0.5381659734600911, + "grad_norm": 1.3733817026100017, + "learning_rate": 9.250140232176746e-06, + "loss": 0.2631, + "step": 6793 + }, + { + "epoch": 0.5382451970687265, + "grad_norm": 1.303021594187281, + "learning_rate": 9.247581449592392e-06, + "loss": 0.2661, + "step": 6794 + }, + { + "epoch": 0.5383244206773619, + "grad_norm": 1.4113178441734109, + "learning_rate": 9.245022716551178e-06, + "loss": 0.3083, + "step": 6795 + }, + { + "epoch": 0.5384036442859972, + "grad_norm": 1.2252960972557936, + "learning_rate": 9.242464033221584e-06, + "loss": 0.2834, + "step": 6796 + }, + { + "epoch": 0.5384828678946326, + "grad_norm": 1.6811551424151405, + "learning_rate": 9.239905399772092e-06, + "loss": 0.3737, + "step": 6797 + }, + { + "epoch": 0.538562091503268, + "grad_norm": 1.668810809116935, + "learning_rate": 9.237346816371169e-06, + "loss": 0.3503, + "step": 6798 + }, + { + "epoch": 0.5386413151119034, + "grad_norm": 1.7557551341166222, + "learning_rate": 9.234788283187291e-06, + "loss": 0.3591, + "step": 6799 + }, + { + "epoch": 0.5387205387205387, + "grad_norm": 1.2810845849598966, + "learning_rate": 9.23222980038892e-06, + "loss": 0.2647, + "step": 6800 + }, + { + "epoch": 0.5387997623291741, + "grad_norm": 1.6291312883364715, + "learning_rate": 9.229671368144524e-06, + "loss": 0.3946, + "step": 6801 + }, + { + "epoch": 0.5388789859378095, + "grad_norm": 1.5480873934523631, + "learning_rate": 9.227112986622562e-06, + "loss": 0.3245, + "step": 6802 + }, + { + "epoch": 0.5389582095464448, + "grad_norm": 1.2804502791430201, + "learning_rate": 9.224554655991492e-06, + "loss": 0.2534, + "step": 6803 + }, + { + "epoch": 0.5390374331550802, + "grad_norm": 1.7414359878820775, + "learning_rate": 9.221996376419763e-06, + "loss": 0.3345, + "step": 6804 + }, + { + "epoch": 0.5391166567637156, + "grad_norm": 1.3947994486707425, + "learning_rate": 9.219438148075834e-06, + "loss": 0.2538, + "step": 6805 + }, + { + "epoch": 0.539195880372351, + "grad_norm": 1.9177309540266438, + "learning_rate": 9.216879971128142e-06, + "loss": 0.3725, + "step": 6806 + }, + { + "epoch": 0.5392751039809863, + "grad_norm": 1.3845811015587115, + "learning_rate": 9.21432184574514e-06, + "loss": 0.2476, + "step": 6807 + }, + { + "epoch": 0.5393543275896218, + "grad_norm": 1.469212269016225, + "learning_rate": 9.21176377209526e-06, + "loss": 0.2908, + "step": 6808 + }, + { + "epoch": 0.5394335511982571, + "grad_norm": 1.4807062791577446, + "learning_rate": 9.209205750346945e-06, + "loss": 0.3041, + "step": 6809 + }, + { + "epoch": 0.5395127748068924, + "grad_norm": 1.563196760277186, + "learning_rate": 9.206647780668629e-06, + "loss": 0.3051, + "step": 6810 + }, + { + "epoch": 0.5395919984155279, + "grad_norm": 1.2496748411728456, + "learning_rate": 9.204089863228736e-06, + "loss": 0.2645, + "step": 6811 + }, + { + "epoch": 0.5396712220241632, + "grad_norm": 1.209543218466997, + "learning_rate": 9.201531998195697e-06, + "loss": 0.2135, + "step": 6812 + }, + { + "epoch": 0.5397504456327986, + "grad_norm": 1.1895299618052124, + "learning_rate": 9.198974185737934e-06, + "loss": 0.2273, + "step": 6813 + }, + { + "epoch": 0.5398296692414339, + "grad_norm": 1.3328687189871387, + "learning_rate": 9.196416426023868e-06, + "loss": 0.2711, + "step": 6814 + }, + { + "epoch": 0.5399088928500693, + "grad_norm": 1.3232598161038789, + "learning_rate": 9.193858719221912e-06, + "loss": 0.2928, + "step": 6815 + }, + { + "epoch": 0.5399881164587047, + "grad_norm": 1.4064068710974462, + "learning_rate": 9.19130106550048e-06, + "loss": 0.2485, + "step": 6816 + }, + { + "epoch": 0.54006734006734, + "grad_norm": 1.0767575670440195, + "learning_rate": 9.188743465027981e-06, + "loss": 0.1788, + "step": 6817 + }, + { + "epoch": 0.5401465636759755, + "grad_norm": 1.2195140124731305, + "learning_rate": 9.186185917972821e-06, + "loss": 0.2083, + "step": 6818 + }, + { + "epoch": 0.5402257872846108, + "grad_norm": 1.4999438641182867, + "learning_rate": 9.183628424503405e-06, + "loss": 0.2912, + "step": 6819 + }, + { + "epoch": 0.5403050108932462, + "grad_norm": 1.2406112321295946, + "learning_rate": 9.181070984788127e-06, + "loss": 0.2556, + "step": 6820 + }, + { + "epoch": 0.5403842345018816, + "grad_norm": 1.4732235368618307, + "learning_rate": 9.178513598995384e-06, + "loss": 0.2402, + "step": 6821 + }, + { + "epoch": 0.5404634581105169, + "grad_norm": 1.2827621289604894, + "learning_rate": 9.17595626729357e-06, + "loss": 0.2637, + "step": 6822 + }, + { + "epoch": 0.5405426817191523, + "grad_norm": 1.3969848391487683, + "learning_rate": 9.17339898985107e-06, + "loss": 0.2759, + "step": 6823 + }, + { + "epoch": 0.5406219053277876, + "grad_norm": 1.5036426892098076, + "learning_rate": 9.170841766836268e-06, + "loss": 0.3368, + "step": 6824 + }, + { + "epoch": 0.5407011289364231, + "grad_norm": 1.7783090548963019, + "learning_rate": 9.168284598417547e-06, + "loss": 0.3611, + "step": 6825 + }, + { + "epoch": 0.5407803525450584, + "grad_norm": 1.144402790331783, + "learning_rate": 9.165727484763283e-06, + "loss": 0.2365, + "step": 6826 + }, + { + "epoch": 0.5408595761536938, + "grad_norm": 1.6916031090588752, + "learning_rate": 9.16317042604185e-06, + "loss": 0.3724, + "step": 6827 + }, + { + "epoch": 0.5409387997623292, + "grad_norm": 1.3850273636397326, + "learning_rate": 9.160613422421616e-06, + "loss": 0.275, + "step": 6828 + }, + { + "epoch": 0.5410180233709645, + "grad_norm": 1.3759927025814456, + "learning_rate": 9.158056474070952e-06, + "loss": 0.235, + "step": 6829 + }, + { + "epoch": 0.5410972469795999, + "grad_norm": 1.1254229791677905, + "learning_rate": 9.155499581158217e-06, + "loss": 0.2024, + "step": 6830 + }, + { + "epoch": 0.5411764705882353, + "grad_norm": 1.0948319037781131, + "learning_rate": 9.152942743851771e-06, + "loss": 0.1867, + "step": 6831 + }, + { + "epoch": 0.5412556941968707, + "grad_norm": 1.3607300192864529, + "learning_rate": 9.15038596231997e-06, + "loss": 0.2031, + "step": 6832 + }, + { + "epoch": 0.541334917805506, + "grad_norm": 1.3352107419750603, + "learning_rate": 9.147829236731164e-06, + "loss": 0.2802, + "step": 6833 + }, + { + "epoch": 0.5414141414141415, + "grad_norm": 1.6159081358685348, + "learning_rate": 9.145272567253703e-06, + "loss": 0.304, + "step": 6834 + }, + { + "epoch": 0.5414933650227768, + "grad_norm": 1.3659771126025533, + "learning_rate": 9.142715954055932e-06, + "loss": 0.2851, + "step": 6835 + }, + { + "epoch": 0.5415725886314121, + "grad_norm": 1.2891282544941869, + "learning_rate": 9.140159397306188e-06, + "loss": 0.1974, + "step": 6836 + }, + { + "epoch": 0.5416518122400475, + "grad_norm": 1.1340869499362822, + "learning_rate": 9.137602897172814e-06, + "loss": 0.1817, + "step": 6837 + }, + { + "epoch": 0.5417310358486829, + "grad_norm": 1.1750921698952466, + "learning_rate": 9.135046453824136e-06, + "loss": 0.2026, + "step": 6838 + }, + { + "epoch": 0.5418102594573183, + "grad_norm": 1.411146416343359, + "learning_rate": 9.132490067428488e-06, + "loss": 0.3022, + "step": 6839 + }, + { + "epoch": 0.5418894830659536, + "grad_norm": 1.5103095912909645, + "learning_rate": 9.129933738154196e-06, + "loss": 0.2528, + "step": 6840 + }, + { + "epoch": 0.5419687066745891, + "grad_norm": 1.5544842207029321, + "learning_rate": 9.12737746616958e-06, + "loss": 0.3342, + "step": 6841 + }, + { + "epoch": 0.5420479302832244, + "grad_norm": 1.302758909001281, + "learning_rate": 9.124821251642959e-06, + "loss": 0.2255, + "step": 6842 + }, + { + "epoch": 0.5421271538918597, + "grad_norm": 1.4149142708560833, + "learning_rate": 9.122265094742648e-06, + "loss": 0.2909, + "step": 6843 + }, + { + "epoch": 0.5422063775004952, + "grad_norm": 1.2476464510760097, + "learning_rate": 9.119708995636957e-06, + "loss": 0.2584, + "step": 6844 + }, + { + "epoch": 0.5422856011091305, + "grad_norm": 1.4439780239021418, + "learning_rate": 9.117152954494195e-06, + "loss": 0.2923, + "step": 6845 + }, + { + "epoch": 0.5423648247177659, + "grad_norm": 1.1680376514327657, + "learning_rate": 9.114596971482658e-06, + "loss": 0.2736, + "step": 6846 + }, + { + "epoch": 0.5424440483264013, + "grad_norm": 1.4547670965966417, + "learning_rate": 9.112041046770653e-06, + "loss": 0.2747, + "step": 6847 + }, + { + "epoch": 0.5425232719350367, + "grad_norm": 1.3594226843958468, + "learning_rate": 9.109485180526474e-06, + "loss": 0.2747, + "step": 6848 + }, + { + "epoch": 0.542602495543672, + "grad_norm": 1.4465127403555254, + "learning_rate": 9.106929372918408e-06, + "loss": 0.305, + "step": 6849 + }, + { + "epoch": 0.5426817191523073, + "grad_norm": 1.48632715850047, + "learning_rate": 9.104373624114746e-06, + "loss": 0.2501, + "step": 6850 + }, + { + "epoch": 0.5427609427609428, + "grad_norm": 1.2631158319662426, + "learning_rate": 9.101817934283775e-06, + "loss": 0.2577, + "step": 6851 + }, + { + "epoch": 0.5428401663695781, + "grad_norm": 1.567934137488143, + "learning_rate": 9.099262303593768e-06, + "loss": 0.3261, + "step": 6852 + }, + { + "epoch": 0.5429193899782135, + "grad_norm": 1.4867969867070314, + "learning_rate": 9.096706732213005e-06, + "loss": 0.3252, + "step": 6853 + }, + { + "epoch": 0.5429986135868489, + "grad_norm": 1.339329468674581, + "learning_rate": 9.094151220309757e-06, + "loss": 0.3003, + "step": 6854 + }, + { + "epoch": 0.5430778371954843, + "grad_norm": 1.681644925970197, + "learning_rate": 9.091595768052291e-06, + "loss": 0.437, + "step": 6855 + }, + { + "epoch": 0.5431570608041196, + "grad_norm": 1.1846300182515337, + "learning_rate": 9.089040375608876e-06, + "loss": 0.2448, + "step": 6856 + }, + { + "epoch": 0.543236284412755, + "grad_norm": 1.7565882770308017, + "learning_rate": 9.086485043147768e-06, + "loss": 0.4709, + "step": 6857 + }, + { + "epoch": 0.5433155080213904, + "grad_norm": 1.3354437476459637, + "learning_rate": 9.083929770837222e-06, + "loss": 0.26, + "step": 6858 + }, + { + "epoch": 0.5433947316300257, + "grad_norm": 1.159526254455517, + "learning_rate": 9.081374558845496e-06, + "loss": 0.2099, + "step": 6859 + }, + { + "epoch": 0.5434739552386612, + "grad_norm": 1.0995538827632811, + "learning_rate": 9.078819407340833e-06, + "loss": 0.2467, + "step": 6860 + }, + { + "epoch": 0.5435531788472965, + "grad_norm": 1.4389166382750294, + "learning_rate": 9.07626431649148e-06, + "loss": 0.3022, + "step": 6861 + }, + { + "epoch": 0.5436324024559319, + "grad_norm": 1.476952265784712, + "learning_rate": 9.073709286465678e-06, + "loss": 0.3213, + "step": 6862 + }, + { + "epoch": 0.5437116260645672, + "grad_norm": 1.6442495749699753, + "learning_rate": 9.071154317431661e-06, + "loss": 0.2802, + "step": 6863 + }, + { + "epoch": 0.5437908496732026, + "grad_norm": 1.408720907713713, + "learning_rate": 9.068599409557664e-06, + "loss": 0.2776, + "step": 6864 + }, + { + "epoch": 0.543870073281838, + "grad_norm": 1.153009535378826, + "learning_rate": 9.066044563011914e-06, + "loss": 0.2973, + "step": 6865 + }, + { + "epoch": 0.5439492968904733, + "grad_norm": 1.5159670829023433, + "learning_rate": 9.063489777962634e-06, + "loss": 0.3822, + "step": 6866 + }, + { + "epoch": 0.5440285204991088, + "grad_norm": 1.06561274879749, + "learning_rate": 9.06093505457805e-06, + "loss": 0.1977, + "step": 6867 + }, + { + "epoch": 0.5441077441077441, + "grad_norm": 1.3917428484823267, + "learning_rate": 9.058380393026369e-06, + "loss": 0.2851, + "step": 6868 + }, + { + "epoch": 0.5441869677163795, + "grad_norm": 1.6141982997119226, + "learning_rate": 9.055825793475814e-06, + "loss": 0.3156, + "step": 6869 + }, + { + "epoch": 0.5442661913250149, + "grad_norm": 1.4909132465592059, + "learning_rate": 9.053271256094582e-06, + "loss": 0.301, + "step": 6870 + }, + { + "epoch": 0.5443454149336502, + "grad_norm": 1.6480605354528746, + "learning_rate": 9.050716781050885e-06, + "loss": 0.3054, + "step": 6871 + }, + { + "epoch": 0.5444246385422856, + "grad_norm": 1.1320509743061042, + "learning_rate": 9.04816236851292e-06, + "loss": 0.253, + "step": 6872 + }, + { + "epoch": 0.544503862150921, + "grad_norm": 1.3349196762230269, + "learning_rate": 9.045608018648884e-06, + "loss": 0.2674, + "step": 6873 + }, + { + "epoch": 0.5445830857595564, + "grad_norm": 1.2001703711965126, + "learning_rate": 9.043053731626964e-06, + "loss": 0.22, + "step": 6874 + }, + { + "epoch": 0.5446623093681917, + "grad_norm": 1.482132402507571, + "learning_rate": 9.040499507615356e-06, + "loss": 0.305, + "step": 6875 + }, + { + "epoch": 0.5447415329768271, + "grad_norm": 1.4862296537832334, + "learning_rate": 9.037945346782236e-06, + "loss": 0.222, + "step": 6876 + }, + { + "epoch": 0.5448207565854625, + "grad_norm": 1.3081674056000887, + "learning_rate": 9.035391249295788e-06, + "loss": 0.2588, + "step": 6877 + }, + { + "epoch": 0.5448999801940978, + "grad_norm": 1.5066008134959405, + "learning_rate": 9.032837215324183e-06, + "loss": 0.279, + "step": 6878 + }, + { + "epoch": 0.5449792038027332, + "grad_norm": 1.4267615465247832, + "learning_rate": 9.030283245035594e-06, + "loss": 0.3288, + "step": 6879 + }, + { + "epoch": 0.5450584274113686, + "grad_norm": 1.3099286772753327, + "learning_rate": 9.027729338598188e-06, + "loss": 0.2788, + "step": 6880 + }, + { + "epoch": 0.545137651020004, + "grad_norm": 1.0935041609545688, + "learning_rate": 9.025175496180125e-06, + "loss": 0.1947, + "step": 6881 + }, + { + "epoch": 0.5452168746286393, + "grad_norm": 1.195419440790593, + "learning_rate": 9.022621717949566e-06, + "loss": 0.2279, + "step": 6882 + }, + { + "epoch": 0.5452960982372748, + "grad_norm": 1.3532331676248126, + "learning_rate": 9.020068004074665e-06, + "loss": 0.2427, + "step": 6883 + }, + { + "epoch": 0.5453753218459101, + "grad_norm": 1.6185665690603852, + "learning_rate": 9.01751435472357e-06, + "loss": 0.3573, + "step": 6884 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 1.5977971488027076, + "learning_rate": 9.014960770064429e-06, + "loss": 0.3424, + "step": 6885 + }, + { + "epoch": 0.5455337690631809, + "grad_norm": 1.5083925112010113, + "learning_rate": 9.012407250265377e-06, + "loss": 0.2995, + "step": 6886 + }, + { + "epoch": 0.5456129926718162, + "grad_norm": 1.623482216406467, + "learning_rate": 9.009853795494558e-06, + "loss": 0.2871, + "step": 6887 + }, + { + "epoch": 0.5456922162804516, + "grad_norm": 1.1919516117091917, + "learning_rate": 9.007300405920105e-06, + "loss": 0.2815, + "step": 6888 + }, + { + "epoch": 0.5457714398890869, + "grad_norm": 1.4866016171699366, + "learning_rate": 9.00474708171014e-06, + "loss": 0.2689, + "step": 6889 + }, + { + "epoch": 0.5458506634977223, + "grad_norm": 1.3101945873144103, + "learning_rate": 9.002193823032791e-06, + "loss": 0.2582, + "step": 6890 + }, + { + "epoch": 0.5459298871063577, + "grad_norm": 1.3014135712972907, + "learning_rate": 8.999640630056183e-06, + "loss": 0.2487, + "step": 6891 + }, + { + "epoch": 0.546009110714993, + "grad_norm": 1.2856734768354838, + "learning_rate": 8.997087502948423e-06, + "loss": 0.2811, + "step": 6892 + }, + { + "epoch": 0.5460883343236285, + "grad_norm": 1.240251168182242, + "learning_rate": 8.994534441877625e-06, + "loss": 0.2886, + "step": 6893 + }, + { + "epoch": 0.5461675579322638, + "grad_norm": 1.5874360894265425, + "learning_rate": 8.991981447011896e-06, + "loss": 0.3011, + "step": 6894 + }, + { + "epoch": 0.5462467815408992, + "grad_norm": 1.4112277136789018, + "learning_rate": 8.989428518519336e-06, + "loss": 0.285, + "step": 6895 + }, + { + "epoch": 0.5463260051495346, + "grad_norm": 1.4425651170508136, + "learning_rate": 8.986875656568047e-06, + "loss": 0.3021, + "step": 6896 + }, + { + "epoch": 0.5464052287581699, + "grad_norm": 1.4728773843795968, + "learning_rate": 8.984322861326122e-06, + "loss": 0.3289, + "step": 6897 + }, + { + "epoch": 0.5464844523668053, + "grad_norm": 1.263702375281725, + "learning_rate": 8.981770132961649e-06, + "loss": 0.2609, + "step": 6898 + }, + { + "epoch": 0.5465636759754406, + "grad_norm": 1.2971754602789756, + "learning_rate": 8.979217471642712e-06, + "loss": 0.29, + "step": 6899 + }, + { + "epoch": 0.5466428995840761, + "grad_norm": 1.3128668393061134, + "learning_rate": 8.976664877537395e-06, + "loss": 0.2386, + "step": 6900 + }, + { + "epoch": 0.5467221231927114, + "grad_norm": 1.5249679057038013, + "learning_rate": 8.974112350813771e-06, + "loss": 0.3373, + "step": 6901 + }, + { + "epoch": 0.5468013468013468, + "grad_norm": 1.2538269437771639, + "learning_rate": 8.971559891639913e-06, + "loss": 0.2874, + "step": 6902 + }, + { + "epoch": 0.5468805704099822, + "grad_norm": 1.1265054139187154, + "learning_rate": 8.969007500183886e-06, + "loss": 0.1964, + "step": 6903 + }, + { + "epoch": 0.5469597940186175, + "grad_norm": 1.8250436705146966, + "learning_rate": 8.966455176613754e-06, + "loss": 0.3165, + "step": 6904 + }, + { + "epoch": 0.5470390176272529, + "grad_norm": 1.1358280277227444, + "learning_rate": 8.963902921097579e-06, + "loss": 0.3061, + "step": 6905 + }, + { + "epoch": 0.5471182412358883, + "grad_norm": 1.422213027748279, + "learning_rate": 8.961350733803406e-06, + "loss": 0.2477, + "step": 6906 + }, + { + "epoch": 0.5471974648445237, + "grad_norm": 1.3136773494120368, + "learning_rate": 8.958798614899291e-06, + "loss": 0.2944, + "step": 6907 + }, + { + "epoch": 0.547276688453159, + "grad_norm": 1.2763049861047902, + "learning_rate": 8.956246564553282e-06, + "loss": 0.2564, + "step": 6908 + }, + { + "epoch": 0.5473559120617945, + "grad_norm": 1.4557728586838161, + "learning_rate": 8.95369458293341e-06, + "loss": 0.1645, + "step": 6909 + }, + { + "epoch": 0.5474351356704298, + "grad_norm": 1.5100263834540892, + "learning_rate": 8.951142670207718e-06, + "loss": 0.2934, + "step": 6910 + }, + { + "epoch": 0.5475143592790651, + "grad_norm": 1.5802000441624515, + "learning_rate": 8.948590826544232e-06, + "loss": 0.3578, + "step": 6911 + }, + { + "epoch": 0.5475935828877005, + "grad_norm": 1.3998515076163713, + "learning_rate": 8.94603905211098e-06, + "loss": 0.2114, + "step": 6912 + }, + { + "epoch": 0.5476728064963359, + "grad_norm": 1.7569387517582804, + "learning_rate": 8.943487347075988e-06, + "loss": 0.3745, + "step": 6913 + }, + { + "epoch": 0.5477520301049713, + "grad_norm": 1.3209345165525932, + "learning_rate": 8.94093571160727e-06, + "loss": 0.2668, + "step": 6914 + }, + { + "epoch": 0.5478312537136066, + "grad_norm": 1.2811728368418742, + "learning_rate": 8.938384145872838e-06, + "loss": 0.2379, + "step": 6915 + }, + { + "epoch": 0.5479104773222421, + "grad_norm": 1.2708678179518775, + "learning_rate": 8.935832650040703e-06, + "loss": 0.2316, + "step": 6916 + }, + { + "epoch": 0.5479897009308774, + "grad_norm": 1.4508650297979424, + "learning_rate": 8.933281224278867e-06, + "loss": 0.2966, + "step": 6917 + }, + { + "epoch": 0.5480689245395127, + "grad_norm": 1.4211166759044775, + "learning_rate": 8.930729868755333e-06, + "loss": 0.2614, + "step": 6918 + }, + { + "epoch": 0.5481481481481482, + "grad_norm": 1.2284427625263326, + "learning_rate": 8.928178583638088e-06, + "loss": 0.2387, + "step": 6919 + }, + { + "epoch": 0.5482273717567835, + "grad_norm": 1.504665184186934, + "learning_rate": 8.925627369095125e-06, + "loss": 0.2822, + "step": 6920 + }, + { + "epoch": 0.5483065953654189, + "grad_norm": 1.473054408741151, + "learning_rate": 8.923076225294434e-06, + "loss": 0.2943, + "step": 6921 + }, + { + "epoch": 0.5483858189740543, + "grad_norm": 1.2828874915525819, + "learning_rate": 8.920525152403989e-06, + "loss": 0.212, + "step": 6922 + }, + { + "epoch": 0.5484650425826897, + "grad_norm": 1.8004652671248182, + "learning_rate": 8.917974150591772e-06, + "loss": 0.3856, + "step": 6923 + }, + { + "epoch": 0.548544266191325, + "grad_norm": 1.2126109056910321, + "learning_rate": 8.915423220025747e-06, + "loss": 0.1608, + "step": 6924 + }, + { + "epoch": 0.5486234897999603, + "grad_norm": 1.3678070054734122, + "learning_rate": 8.912872360873885e-06, + "loss": 0.2497, + "step": 6925 + }, + { + "epoch": 0.5487027134085958, + "grad_norm": 1.1358820890931334, + "learning_rate": 8.91032157330415e-06, + "loss": 0.2253, + "step": 6926 + }, + { + "epoch": 0.5487819370172311, + "grad_norm": 1.4626639327172861, + "learning_rate": 8.907770857484493e-06, + "loss": 0.2999, + "step": 6927 + }, + { + "epoch": 0.5488611606258665, + "grad_norm": 1.4087092746931544, + "learning_rate": 8.90522021358287e-06, + "loss": 0.2805, + "step": 6928 + }, + { + "epoch": 0.5489403842345019, + "grad_norm": 1.7609525402754054, + "learning_rate": 8.90266964176723e-06, + "loss": 0.3395, + "step": 6929 + }, + { + "epoch": 0.5490196078431373, + "grad_norm": 1.1523568283379255, + "learning_rate": 8.90011914220551e-06, + "loss": 0.2809, + "step": 6930 + }, + { + "epoch": 0.5490988314517726, + "grad_norm": 1.2190853828788084, + "learning_rate": 8.897568715065658e-06, + "loss": 0.1968, + "step": 6931 + }, + { + "epoch": 0.549178055060408, + "grad_norm": 1.6419662613464567, + "learning_rate": 8.895018360515597e-06, + "loss": 0.3171, + "step": 6932 + }, + { + "epoch": 0.5492572786690434, + "grad_norm": 1.448724775478952, + "learning_rate": 8.892468078723262e-06, + "loss": 0.2071, + "step": 6933 + }, + { + "epoch": 0.5493365022776787, + "grad_norm": 1.8891208577997054, + "learning_rate": 8.889917869856576e-06, + "loss": 0.3859, + "step": 6934 + }, + { + "epoch": 0.5494157258863142, + "grad_norm": 1.040836904582798, + "learning_rate": 8.887367734083454e-06, + "loss": 0.1659, + "step": 6935 + }, + { + "epoch": 0.5494949494949495, + "grad_norm": 1.258803127289834, + "learning_rate": 8.884817671571815e-06, + "loss": 0.2696, + "step": 6936 + }, + { + "epoch": 0.5495741731035849, + "grad_norm": 1.2084857038180408, + "learning_rate": 8.882267682489566e-06, + "loss": 0.249, + "step": 6937 + }, + { + "epoch": 0.5496533967122202, + "grad_norm": 1.2568175200065401, + "learning_rate": 8.879717767004613e-06, + "loss": 0.2094, + "step": 6938 + }, + { + "epoch": 0.5497326203208556, + "grad_norm": 1.23275359100953, + "learning_rate": 8.877167925284855e-06, + "loss": 0.2281, + "step": 6939 + }, + { + "epoch": 0.549811843929491, + "grad_norm": 1.263428836065982, + "learning_rate": 8.874618157498183e-06, + "loss": 0.2671, + "step": 6940 + }, + { + "epoch": 0.5498910675381263, + "grad_norm": 1.3501202071721725, + "learning_rate": 8.872068463812492e-06, + "loss": 0.288, + "step": 6941 + }, + { + "epoch": 0.5499702911467618, + "grad_norm": 1.3474008197919445, + "learning_rate": 8.869518844395667e-06, + "loss": 0.28, + "step": 6942 + }, + { + "epoch": 0.5500495147553971, + "grad_norm": 1.411981560808872, + "learning_rate": 8.866969299415585e-06, + "loss": 0.3008, + "step": 6943 + }, + { + "epoch": 0.5501287383640325, + "grad_norm": 1.204544116518048, + "learning_rate": 8.864419829040122e-06, + "loss": 0.207, + "step": 6944 + }, + { + "epoch": 0.5502079619726679, + "grad_norm": 1.0562696657413404, + "learning_rate": 8.86187043343715e-06, + "loss": 0.1735, + "step": 6945 + }, + { + "epoch": 0.5502871855813032, + "grad_norm": 1.5935942836737471, + "learning_rate": 8.859321112774535e-06, + "loss": 0.2379, + "step": 6946 + }, + { + "epoch": 0.5503664091899386, + "grad_norm": 1.5108345229367142, + "learning_rate": 8.856771867220135e-06, + "loss": 0.2932, + "step": 6947 + }, + { + "epoch": 0.550445632798574, + "grad_norm": 1.2633926974575818, + "learning_rate": 8.854222696941807e-06, + "loss": 0.2405, + "step": 6948 + }, + { + "epoch": 0.5505248564072094, + "grad_norm": 1.3106072137752853, + "learning_rate": 8.8516736021074e-06, + "loss": 0.2203, + "step": 6949 + }, + { + "epoch": 0.5506040800158447, + "grad_norm": 1.526450625325032, + "learning_rate": 8.849124582884762e-06, + "loss": 0.2552, + "step": 6950 + }, + { + "epoch": 0.5506833036244801, + "grad_norm": 1.5382908631266865, + "learning_rate": 8.846575639441732e-06, + "loss": 0.366, + "step": 6951 + }, + { + "epoch": 0.5507625272331155, + "grad_norm": 1.7432571829080619, + "learning_rate": 8.844026771946148e-06, + "loss": 0.431, + "step": 6952 + }, + { + "epoch": 0.5508417508417508, + "grad_norm": 1.199965665640729, + "learning_rate": 8.841477980565838e-06, + "loss": 0.2772, + "step": 6953 + }, + { + "epoch": 0.5509209744503862, + "grad_norm": 1.9614952545636384, + "learning_rate": 8.838929265468627e-06, + "loss": 0.325, + "step": 6954 + }, + { + "epoch": 0.5510001980590216, + "grad_norm": 1.6247552178977018, + "learning_rate": 8.836380626822339e-06, + "loss": 0.2707, + "step": 6955 + }, + { + "epoch": 0.551079421667657, + "grad_norm": 1.756983936814514, + "learning_rate": 8.833832064794787e-06, + "loss": 0.4087, + "step": 6956 + }, + { + "epoch": 0.5511586452762923, + "grad_norm": 1.1659653018660858, + "learning_rate": 8.831283579553781e-06, + "loss": 0.1787, + "step": 6957 + }, + { + "epoch": 0.5512378688849278, + "grad_norm": 1.344745218583525, + "learning_rate": 8.828735171267131e-06, + "loss": 0.2938, + "step": 6958 + }, + { + "epoch": 0.5513170924935631, + "grad_norm": 1.1213942986982695, + "learning_rate": 8.82618684010263e-06, + "loss": 0.2263, + "step": 6959 + }, + { + "epoch": 0.5513963161021984, + "grad_norm": 1.456644804639871, + "learning_rate": 8.823638586228081e-06, + "loss": 0.3442, + "step": 6960 + }, + { + "epoch": 0.5514755397108339, + "grad_norm": 1.443779846323453, + "learning_rate": 8.82109040981127e-06, + "loss": 0.2701, + "step": 6961 + }, + { + "epoch": 0.5515547633194692, + "grad_norm": 1.3833112925058488, + "learning_rate": 8.818542311019982e-06, + "loss": 0.2554, + "step": 6962 + }, + { + "epoch": 0.5516339869281046, + "grad_norm": 1.4962457347240226, + "learning_rate": 8.815994290022e-06, + "loss": 0.325, + "step": 6963 + }, + { + "epoch": 0.5517132105367399, + "grad_norm": 1.2654105139461969, + "learning_rate": 8.813446346985095e-06, + "loss": 0.1987, + "step": 6964 + }, + { + "epoch": 0.5517924341453754, + "grad_norm": 1.662239387594392, + "learning_rate": 8.810898482077038e-06, + "loss": 0.3357, + "step": 6965 + }, + { + "epoch": 0.5518716577540107, + "grad_norm": 1.3287944208256632, + "learning_rate": 8.808350695465597e-06, + "loss": 0.2602, + "step": 6966 + }, + { + "epoch": 0.551950881362646, + "grad_norm": 1.4833147334650705, + "learning_rate": 8.805802987318527e-06, + "loss": 0.293, + "step": 6967 + }, + { + "epoch": 0.5520301049712815, + "grad_norm": 1.161244069502438, + "learning_rate": 8.803255357803584e-06, + "loss": 0.2576, + "step": 6968 + }, + { + "epoch": 0.5521093285799168, + "grad_norm": 1.4312068122365944, + "learning_rate": 8.800707807088521e-06, + "loss": 0.2397, + "step": 6969 + }, + { + "epoch": 0.5521885521885522, + "grad_norm": 1.5833591166841852, + "learning_rate": 8.798160335341078e-06, + "loss": 0.3399, + "step": 6970 + }, + { + "epoch": 0.5522677757971876, + "grad_norm": 1.6048424508706616, + "learning_rate": 8.795612942728989e-06, + "loss": 0.3027, + "step": 6971 + }, + { + "epoch": 0.5523469994058229, + "grad_norm": 1.286220152449889, + "learning_rate": 8.793065629419996e-06, + "loss": 0.2849, + "step": 6972 + }, + { + "epoch": 0.5524262230144583, + "grad_norm": 1.212823327888148, + "learning_rate": 8.790518395581823e-06, + "loss": 0.2762, + "step": 6973 + }, + { + "epoch": 0.5525054466230936, + "grad_norm": 1.431174705529821, + "learning_rate": 8.787971241382193e-06, + "loss": 0.3096, + "step": 6974 + }, + { + "epoch": 0.5525846702317291, + "grad_norm": 1.3257544534779628, + "learning_rate": 8.785424166988827e-06, + "loss": 0.25, + "step": 6975 + }, + { + "epoch": 0.5526638938403644, + "grad_norm": 1.151711056263223, + "learning_rate": 8.782877172569433e-06, + "loss": 0.2076, + "step": 6976 + }, + { + "epoch": 0.5527431174489998, + "grad_norm": 1.3300492256841259, + "learning_rate": 8.78033025829172e-06, + "loss": 0.2822, + "step": 6977 + }, + { + "epoch": 0.5528223410576352, + "grad_norm": 1.3911495408390682, + "learning_rate": 8.777783424323396e-06, + "loss": 0.253, + "step": 6978 + }, + { + "epoch": 0.5529015646662705, + "grad_norm": 1.121397823657544, + "learning_rate": 8.775236670832146e-06, + "loss": 0.2275, + "step": 6979 + }, + { + "epoch": 0.5529807882749059, + "grad_norm": 1.4714846754801767, + "learning_rate": 8.772689997985674e-06, + "loss": 0.2749, + "step": 6980 + }, + { + "epoch": 0.5530600118835413, + "grad_norm": 1.6130036809663622, + "learning_rate": 8.770143405951657e-06, + "loss": 0.3361, + "step": 6981 + }, + { + "epoch": 0.5531392354921767, + "grad_norm": 1.1722354047997954, + "learning_rate": 8.76759689489778e-06, + "loss": 0.2264, + "step": 6982 + }, + { + "epoch": 0.553218459100812, + "grad_norm": 1.2197168007108004, + "learning_rate": 8.765050464991716e-06, + "loss": 0.2576, + "step": 6983 + }, + { + "epoch": 0.5532976827094475, + "grad_norm": 1.363851834758067, + "learning_rate": 8.762504116401137e-06, + "loss": 0.2786, + "step": 6984 + }, + { + "epoch": 0.5533769063180828, + "grad_norm": 1.3580455369340627, + "learning_rate": 8.759957849293707e-06, + "loss": 0.2213, + "step": 6985 + }, + { + "epoch": 0.5534561299267181, + "grad_norm": 1.6290460759122434, + "learning_rate": 8.75741166383709e-06, + "loss": 0.3579, + "step": 6986 + }, + { + "epoch": 0.5535353535353535, + "grad_norm": 1.4451655439647537, + "learning_rate": 8.754865560198932e-06, + "loss": 0.2448, + "step": 6987 + }, + { + "epoch": 0.5536145771439889, + "grad_norm": 1.5497196325318428, + "learning_rate": 8.752319538546888e-06, + "loss": 0.3126, + "step": 6988 + }, + { + "epoch": 0.5536938007526243, + "grad_norm": 1.630666797480555, + "learning_rate": 8.749773599048597e-06, + "loss": 0.3639, + "step": 6989 + }, + { + "epoch": 0.5537730243612596, + "grad_norm": 1.0604334692389863, + "learning_rate": 8.747227741871698e-06, + "loss": 0.1621, + "step": 6990 + }, + { + "epoch": 0.5538522479698951, + "grad_norm": 1.381439156081259, + "learning_rate": 8.744681967183826e-06, + "loss": 0.2841, + "step": 6991 + }, + { + "epoch": 0.5539314715785304, + "grad_norm": 1.5024721739927875, + "learning_rate": 8.742136275152606e-06, + "loss": 0.3204, + "step": 6992 + }, + { + "epoch": 0.5540106951871657, + "grad_norm": 1.4006014004183762, + "learning_rate": 8.73959066594566e-06, + "loss": 0.2688, + "step": 6993 + }, + { + "epoch": 0.5540899187958012, + "grad_norm": 1.3557176773863153, + "learning_rate": 8.737045139730605e-06, + "loss": 0.2556, + "step": 6994 + }, + { + "epoch": 0.5541691424044365, + "grad_norm": 1.0664299028816309, + "learning_rate": 8.734499696675048e-06, + "loss": 0.2105, + "step": 6995 + }, + { + "epoch": 0.5542483660130719, + "grad_norm": 1.2018334964232642, + "learning_rate": 8.731954336946599e-06, + "loss": 0.1992, + "step": 6996 + }, + { + "epoch": 0.5543275896217073, + "grad_norm": 1.3688962400292068, + "learning_rate": 8.729409060712855e-06, + "loss": 0.3049, + "step": 6997 + }, + { + "epoch": 0.5544068132303427, + "grad_norm": 1.4765175368660743, + "learning_rate": 8.726863868141408e-06, + "loss": 0.2436, + "step": 6998 + }, + { + "epoch": 0.554486036838978, + "grad_norm": 1.627207577853215, + "learning_rate": 8.724318759399853e-06, + "loss": 0.3119, + "step": 6999 + }, + { + "epoch": 0.5545652604476133, + "grad_norm": 1.3067980713195708, + "learning_rate": 8.721773734655768e-06, + "loss": 0.2407, + "step": 7000 + }, + { + "epoch": 0.5546444840562488, + "grad_norm": 1.2555764831978828, + "learning_rate": 8.719228794076733e-06, + "loss": 0.1954, + "step": 7001 + }, + { + "epoch": 0.5547237076648841, + "grad_norm": 1.2926151986598575, + "learning_rate": 8.716683937830318e-06, + "loss": 0.2673, + "step": 7002 + }, + { + "epoch": 0.5548029312735195, + "grad_norm": 1.3538085249155307, + "learning_rate": 8.71413916608409e-06, + "loss": 0.2841, + "step": 7003 + }, + { + "epoch": 0.5548821548821549, + "grad_norm": 1.2621199109801466, + "learning_rate": 8.711594479005614e-06, + "loss": 0.2291, + "step": 7004 + }, + { + "epoch": 0.5549613784907903, + "grad_norm": 1.1922708851372084, + "learning_rate": 8.709049876762438e-06, + "loss": 0.1892, + "step": 7005 + }, + { + "epoch": 0.5550406020994256, + "grad_norm": 1.575841138124968, + "learning_rate": 8.706505359522119e-06, + "loss": 0.3524, + "step": 7006 + }, + { + "epoch": 0.555119825708061, + "grad_norm": 1.6571406227612384, + "learning_rate": 8.703960927452197e-06, + "loss": 0.2215, + "step": 7007 + }, + { + "epoch": 0.5551990493166964, + "grad_norm": 1.6755694622196637, + "learning_rate": 8.701416580720212e-06, + "loss": 0.3882, + "step": 7008 + }, + { + "epoch": 0.5552782729253317, + "grad_norm": 1.5195066673599789, + "learning_rate": 8.698872319493698e-06, + "loss": 0.3143, + "step": 7009 + }, + { + "epoch": 0.5553574965339672, + "grad_norm": 1.2318127103165126, + "learning_rate": 8.69632814394018e-06, + "loss": 0.2163, + "step": 7010 + }, + { + "epoch": 0.5554367201426025, + "grad_norm": 1.1943752928933378, + "learning_rate": 8.693784054227179e-06, + "loss": 0.2625, + "step": 7011 + }, + { + "epoch": 0.5555159437512379, + "grad_norm": 2.2911132376557894, + "learning_rate": 8.691240050522215e-06, + "loss": 0.4775, + "step": 7012 + }, + { + "epoch": 0.5555951673598732, + "grad_norm": 1.496897603650919, + "learning_rate": 8.688696132992797e-06, + "loss": 0.2984, + "step": 7013 + }, + { + "epoch": 0.5556743909685086, + "grad_norm": 1.4058859323284971, + "learning_rate": 8.686152301806427e-06, + "loss": 0.3652, + "step": 7014 + }, + { + "epoch": 0.555753614577144, + "grad_norm": 1.2391362714854297, + "learning_rate": 8.683608557130608e-06, + "loss": 0.2354, + "step": 7015 + }, + { + "epoch": 0.5558328381857793, + "grad_norm": 1.494118678644517, + "learning_rate": 8.681064899132831e-06, + "loss": 0.3046, + "step": 7016 + }, + { + "epoch": 0.5559120617944148, + "grad_norm": 0.9862226319404613, + "learning_rate": 8.678521327980585e-06, + "loss": 0.2155, + "step": 7017 + }, + { + "epoch": 0.5559912854030501, + "grad_norm": 1.1660430842016558, + "learning_rate": 8.675977843841347e-06, + "loss": 0.2464, + "step": 7018 + }, + { + "epoch": 0.5560705090116855, + "grad_norm": 1.4253487223919135, + "learning_rate": 8.673434446882601e-06, + "loss": 0.2247, + "step": 7019 + }, + { + "epoch": 0.5561497326203209, + "grad_norm": 1.2574712667766956, + "learning_rate": 8.670891137271814e-06, + "loss": 0.2092, + "step": 7020 + }, + { + "epoch": 0.5562289562289562, + "grad_norm": 1.339563100725818, + "learning_rate": 8.668347915176448e-06, + "loss": 0.2391, + "step": 7021 + }, + { + "epoch": 0.5563081798375916, + "grad_norm": 1.466115126696012, + "learning_rate": 8.665804780763963e-06, + "loss": 0.3085, + "step": 7022 + }, + { + "epoch": 0.556387403446227, + "grad_norm": 1.128684611561647, + "learning_rate": 8.663261734201818e-06, + "loss": 0.2409, + "step": 7023 + }, + { + "epoch": 0.5564666270548624, + "grad_norm": 1.3268103481091216, + "learning_rate": 8.660718775657453e-06, + "loss": 0.2964, + "step": 7024 + }, + { + "epoch": 0.5565458506634977, + "grad_norm": 1.384930534251322, + "learning_rate": 8.658175905298314e-06, + "loss": 0.3002, + "step": 7025 + }, + { + "epoch": 0.5566250742721331, + "grad_norm": 1.2477565063461078, + "learning_rate": 8.655633123291833e-06, + "loss": 0.2268, + "step": 7026 + }, + { + "epoch": 0.5567042978807685, + "grad_norm": 1.317272146662216, + "learning_rate": 8.653090429805442e-06, + "loss": 0.2726, + "step": 7027 + }, + { + "epoch": 0.5567835214894038, + "grad_norm": 1.1613292359285394, + "learning_rate": 8.650547825006568e-06, + "loss": 0.2131, + "step": 7028 + }, + { + "epoch": 0.5568627450980392, + "grad_norm": 1.2324465875388209, + "learning_rate": 8.648005309062623e-06, + "loss": 0.2526, + "step": 7029 + }, + { + "epoch": 0.5569419687066746, + "grad_norm": 1.1495201235057504, + "learning_rate": 8.645462882141026e-06, + "loss": 0.2579, + "step": 7030 + }, + { + "epoch": 0.55702119231531, + "grad_norm": 1.3929608438990937, + "learning_rate": 8.64292054440918e-06, + "loss": 0.2303, + "step": 7031 + }, + { + "epoch": 0.5571004159239453, + "grad_norm": 1.2622642989531188, + "learning_rate": 8.640378296034486e-06, + "loss": 0.1915, + "step": 7032 + }, + { + "epoch": 0.5571796395325808, + "grad_norm": 1.5058902247650723, + "learning_rate": 8.63783613718434e-06, + "loss": 0.3413, + "step": 7033 + }, + { + "epoch": 0.5572588631412161, + "grad_norm": 1.6088861569034487, + "learning_rate": 8.63529406802613e-06, + "loss": 0.2887, + "step": 7034 + }, + { + "epoch": 0.5573380867498514, + "grad_norm": 1.5210899799877822, + "learning_rate": 8.632752088727237e-06, + "loss": 0.2557, + "step": 7035 + }, + { + "epoch": 0.5574173103584869, + "grad_norm": 1.1025287746717471, + "learning_rate": 8.63021019945504e-06, + "loss": 0.2294, + "step": 7036 + }, + { + "epoch": 0.5574965339671222, + "grad_norm": 1.3529702188346702, + "learning_rate": 8.627668400376914e-06, + "loss": 0.2395, + "step": 7037 + }, + { + "epoch": 0.5575757575757576, + "grad_norm": 1.5041655223487767, + "learning_rate": 8.625126691660216e-06, + "loss": 0.3156, + "step": 7038 + }, + { + "epoch": 0.5576549811843929, + "grad_norm": 1.502161549165598, + "learning_rate": 8.622585073472314e-06, + "loss": 0.2776, + "step": 7039 + }, + { + "epoch": 0.5577342047930284, + "grad_norm": 1.578697230875441, + "learning_rate": 8.620043545980554e-06, + "loss": 0.3452, + "step": 7040 + }, + { + "epoch": 0.5578134284016637, + "grad_norm": 1.5495976089780932, + "learning_rate": 8.61750210935229e-06, + "loss": 0.2586, + "step": 7041 + }, + { + "epoch": 0.557892652010299, + "grad_norm": 1.5769415575724814, + "learning_rate": 8.614960763754857e-06, + "loss": 0.3021, + "step": 7042 + }, + { + "epoch": 0.5579718756189345, + "grad_norm": 1.1811405929681862, + "learning_rate": 8.612419509355593e-06, + "loss": 0.2002, + "step": 7043 + }, + { + "epoch": 0.5580510992275698, + "grad_norm": 1.3489136868126388, + "learning_rate": 8.60987834632183e-06, + "loss": 0.2774, + "step": 7044 + }, + { + "epoch": 0.5581303228362052, + "grad_norm": 1.5170222762852672, + "learning_rate": 8.607337274820888e-06, + "loss": 0.3278, + "step": 7045 + }, + { + "epoch": 0.5582095464448406, + "grad_norm": 1.3484185129395854, + "learning_rate": 8.604796295020085e-06, + "loss": 0.3032, + "step": 7046 + }, + { + "epoch": 0.558288770053476, + "grad_norm": 1.492528151391354, + "learning_rate": 8.602255407086736e-06, + "loss": 0.2857, + "step": 7047 + }, + { + "epoch": 0.5583679936621113, + "grad_norm": 1.5481470859715287, + "learning_rate": 8.599714611188141e-06, + "loss": 0.3363, + "step": 7048 + }, + { + "epoch": 0.5584472172707466, + "grad_norm": 1.1986021562974838, + "learning_rate": 8.5971739074916e-06, + "loss": 0.215, + "step": 7049 + }, + { + "epoch": 0.5585264408793821, + "grad_norm": 1.310784351988188, + "learning_rate": 8.594633296164409e-06, + "loss": 0.2689, + "step": 7050 + }, + { + "epoch": 0.5586056644880174, + "grad_norm": 1.3452235131316717, + "learning_rate": 8.59209277737385e-06, + "loss": 0.2468, + "step": 7051 + }, + { + "epoch": 0.5586848880966528, + "grad_norm": 1.4714512244632645, + "learning_rate": 8.58955235128721e-06, + "loss": 0.33, + "step": 7052 + }, + { + "epoch": 0.5587641117052882, + "grad_norm": 1.5061138538914511, + "learning_rate": 8.58701201807176e-06, + "loss": 0.3156, + "step": 7053 + }, + { + "epoch": 0.5588433353139235, + "grad_norm": 1.2522530177300015, + "learning_rate": 8.584471777894768e-06, + "loss": 0.1907, + "step": 7054 + }, + { + "epoch": 0.5589225589225589, + "grad_norm": 1.3235660810518188, + "learning_rate": 8.581931630923499e-06, + "loss": 0.2377, + "step": 7055 + }, + { + "epoch": 0.5590017825311943, + "grad_norm": 1.4326187422574033, + "learning_rate": 8.57939157732521e-06, + "loss": 0.2429, + "step": 7056 + }, + { + "epoch": 0.5590810061398297, + "grad_norm": 2.109050067093894, + "learning_rate": 8.576851617267151e-06, + "loss": 0.2673, + "step": 7057 + }, + { + "epoch": 0.559160229748465, + "grad_norm": 1.5605114478778377, + "learning_rate": 8.574311750916565e-06, + "loss": 0.2436, + "step": 7058 + }, + { + "epoch": 0.5592394533571005, + "grad_norm": 1.584420623030401, + "learning_rate": 8.571771978440689e-06, + "loss": 0.3708, + "step": 7059 + }, + { + "epoch": 0.5593186769657358, + "grad_norm": 1.2531242869135015, + "learning_rate": 8.569232300006756e-06, + "loss": 0.2125, + "step": 7060 + }, + { + "epoch": 0.5593979005743711, + "grad_norm": 1.3675348283572362, + "learning_rate": 8.566692715781992e-06, + "loss": 0.2191, + "step": 7061 + }, + { + "epoch": 0.5594771241830065, + "grad_norm": 1.4878703623488974, + "learning_rate": 8.564153225933616e-06, + "loss": 0.3401, + "step": 7062 + }, + { + "epoch": 0.5595563477916419, + "grad_norm": 1.2066271452322694, + "learning_rate": 8.56161383062884e-06, + "loss": 0.2523, + "step": 7063 + }, + { + "epoch": 0.5596355714002773, + "grad_norm": 1.1984780159012218, + "learning_rate": 8.559074530034875e-06, + "loss": 0.2059, + "step": 7064 + }, + { + "epoch": 0.5597147950089126, + "grad_norm": 1.1447762422016563, + "learning_rate": 8.556535324318916e-06, + "loss": 0.1965, + "step": 7065 + }, + { + "epoch": 0.5597940186175481, + "grad_norm": 1.3860454828574094, + "learning_rate": 8.553996213648164e-06, + "loss": 0.2709, + "step": 7066 + }, + { + "epoch": 0.5598732422261834, + "grad_norm": 1.3432315884063428, + "learning_rate": 8.551457198189799e-06, + "loss": 0.2254, + "step": 7067 + }, + { + "epoch": 0.5599524658348187, + "grad_norm": 1.0012160317613514, + "learning_rate": 8.54891827811101e-06, + "loss": 0.132, + "step": 7068 + }, + { + "epoch": 0.5600316894434542, + "grad_norm": 2.055167057385489, + "learning_rate": 8.546379453578972e-06, + "loss": 0.2837, + "step": 7069 + }, + { + "epoch": 0.5601109130520895, + "grad_norm": 1.7372481377277484, + "learning_rate": 8.543840724760848e-06, + "loss": 0.3502, + "step": 7070 + }, + { + "epoch": 0.5601901366607249, + "grad_norm": 1.292778420881115, + "learning_rate": 8.541302091823809e-06, + "loss": 0.2589, + "step": 7071 + }, + { + "epoch": 0.5602693602693603, + "grad_norm": 1.380709257864959, + "learning_rate": 8.538763554935008e-06, + "loss": 0.2285, + "step": 7072 + }, + { + "epoch": 0.5603485838779957, + "grad_norm": 1.1594019575687378, + "learning_rate": 8.536225114261597e-06, + "loss": 0.2189, + "step": 7073 + }, + { + "epoch": 0.560427807486631, + "grad_norm": 1.530342225639762, + "learning_rate": 8.533686769970717e-06, + "loss": 0.3002, + "step": 7074 + }, + { + "epoch": 0.5605070310952663, + "grad_norm": 1.269648241788725, + "learning_rate": 8.531148522229509e-06, + "loss": 0.1908, + "step": 7075 + }, + { + "epoch": 0.5605862547039018, + "grad_norm": 1.4729501156002238, + "learning_rate": 8.528610371205102e-06, + "loss": 0.2881, + "step": 7076 + }, + { + "epoch": 0.5606654783125371, + "grad_norm": 1.719618071760003, + "learning_rate": 8.526072317064623e-06, + "loss": 0.3844, + "step": 7077 + }, + { + "epoch": 0.5607447019211725, + "grad_norm": 1.3779970254440097, + "learning_rate": 8.52353435997519e-06, + "loss": 0.3204, + "step": 7078 + }, + { + "epoch": 0.5608239255298079, + "grad_norm": 1.2054239120729657, + "learning_rate": 8.520996500103915e-06, + "loss": 0.2207, + "step": 7079 + }, + { + "epoch": 0.5609031491384433, + "grad_norm": 1.671092167969114, + "learning_rate": 8.518458737617903e-06, + "loss": 0.3179, + "step": 7080 + }, + { + "epoch": 0.5609823727470786, + "grad_norm": 1.3908866856142577, + "learning_rate": 8.515921072684255e-06, + "loss": 0.2189, + "step": 7081 + }, + { + "epoch": 0.561061596355714, + "grad_norm": 1.236534212257595, + "learning_rate": 8.513383505470065e-06, + "loss": 0.2744, + "step": 7082 + }, + { + "epoch": 0.5611408199643494, + "grad_norm": 1.4982742546327068, + "learning_rate": 8.510846036142415e-06, + "loss": 0.3714, + "step": 7083 + }, + { + "epoch": 0.5612200435729847, + "grad_norm": 1.1582012330895561, + "learning_rate": 8.50830866486839e-06, + "loss": 0.2065, + "step": 7084 + }, + { + "epoch": 0.5612992671816202, + "grad_norm": 1.4457075553700014, + "learning_rate": 8.505771391815061e-06, + "loss": 0.3379, + "step": 7085 + }, + { + "epoch": 0.5613784907902555, + "grad_norm": 1.4555233570329484, + "learning_rate": 8.503234217149496e-06, + "loss": 0.2955, + "step": 7086 + }, + { + "epoch": 0.5614577143988909, + "grad_norm": 1.3949290716945484, + "learning_rate": 8.500697141038758e-06, + "loss": 0.2615, + "step": 7087 + }, + { + "epoch": 0.5615369380075262, + "grad_norm": 1.1355941681619464, + "learning_rate": 8.498160163649896e-06, + "loss": 0.2227, + "step": 7088 + }, + { + "epoch": 0.5616161616161616, + "grad_norm": 1.2371204767851705, + "learning_rate": 8.495623285149962e-06, + "loss": 0.2551, + "step": 7089 + }, + { + "epoch": 0.561695385224797, + "grad_norm": 1.187872217152119, + "learning_rate": 8.493086505705998e-06, + "loss": 0.2334, + "step": 7090 + }, + { + "epoch": 0.5617746088334323, + "grad_norm": 1.441722024434417, + "learning_rate": 8.490549825485036e-06, + "loss": 0.3307, + "step": 7091 + }, + { + "epoch": 0.5618538324420678, + "grad_norm": 1.4849448881758496, + "learning_rate": 8.488013244654103e-06, + "loss": 0.2316, + "step": 7092 + }, + { + "epoch": 0.5619330560507031, + "grad_norm": 1.4975482818674597, + "learning_rate": 8.485476763380224e-06, + "loss": 0.3159, + "step": 7093 + }, + { + "epoch": 0.5620122796593385, + "grad_norm": 1.2180345404673107, + "learning_rate": 8.482940381830412e-06, + "loss": 0.2262, + "step": 7094 + }, + { + "epoch": 0.5620915032679739, + "grad_norm": 1.555724455160656, + "learning_rate": 8.480404100171677e-06, + "loss": 0.3019, + "step": 7095 + }, + { + "epoch": 0.5621707268766092, + "grad_norm": 1.3002829453316416, + "learning_rate": 8.47786791857102e-06, + "loss": 0.2415, + "step": 7096 + }, + { + "epoch": 0.5622499504852446, + "grad_norm": 1.4261023194091624, + "learning_rate": 8.475331837195435e-06, + "loss": 0.2855, + "step": 7097 + }, + { + "epoch": 0.56232917409388, + "grad_norm": 1.9370986011708122, + "learning_rate": 8.472795856211916e-06, + "loss": 0.3187, + "step": 7098 + }, + { + "epoch": 0.5624083977025154, + "grad_norm": 1.3045949246779922, + "learning_rate": 8.470259975787438e-06, + "loss": 0.2766, + "step": 7099 + }, + { + "epoch": 0.5624876213111507, + "grad_norm": 1.4013713986669258, + "learning_rate": 8.46772419608898e-06, + "loss": 0.2362, + "step": 7100 + }, + { + "epoch": 0.5625668449197861, + "grad_norm": 1.5170180070855386, + "learning_rate": 8.465188517283514e-06, + "loss": 0.2804, + "step": 7101 + }, + { + "epoch": 0.5626460685284215, + "grad_norm": 1.4147934229144523, + "learning_rate": 8.462652939537996e-06, + "loss": 0.2835, + "step": 7102 + }, + { + "epoch": 0.5627252921370568, + "grad_norm": 1.4312492575172449, + "learning_rate": 8.460117463019387e-06, + "loss": 0.2496, + "step": 7103 + }, + { + "epoch": 0.5628045157456922, + "grad_norm": 1.2889102522545566, + "learning_rate": 8.457582087894631e-06, + "loss": 0.2368, + "step": 7104 + }, + { + "epoch": 0.5628837393543276, + "grad_norm": 1.4258407587054147, + "learning_rate": 8.455046814330674e-06, + "loss": 0.336, + "step": 7105 + }, + { + "epoch": 0.562962962962963, + "grad_norm": 1.3449280204438927, + "learning_rate": 8.452511642494453e-06, + "loss": 0.2886, + "step": 7106 + }, + { + "epoch": 0.5630421865715983, + "grad_norm": 1.4418812595000428, + "learning_rate": 8.449976572552891e-06, + "loss": 0.3594, + "step": 7107 + }, + { + "epoch": 0.5631214101802338, + "grad_norm": 1.1109709699757517, + "learning_rate": 8.447441604672913e-06, + "loss": 0.1827, + "step": 7108 + }, + { + "epoch": 0.5632006337888691, + "grad_norm": 1.1345466575351015, + "learning_rate": 8.444906739021438e-06, + "loss": 0.242, + "step": 7109 + }, + { + "epoch": 0.5632798573975044, + "grad_norm": 1.5383389449091234, + "learning_rate": 8.442371975765368e-06, + "loss": 0.2311, + "step": 7110 + }, + { + "epoch": 0.5633590810061399, + "grad_norm": 1.2962654047524944, + "learning_rate": 8.439837315071612e-06, + "loss": 0.2936, + "step": 7111 + }, + { + "epoch": 0.5634383046147752, + "grad_norm": 1.2769144612709133, + "learning_rate": 8.43730275710706e-06, + "loss": 0.3124, + "step": 7112 + }, + { + "epoch": 0.5635175282234106, + "grad_norm": 1.2440169016945453, + "learning_rate": 8.434768302038602e-06, + "loss": 0.1866, + "step": 7113 + }, + { + "epoch": 0.5635967518320459, + "grad_norm": 1.319345246272552, + "learning_rate": 8.432233950033122e-06, + "loss": 0.2753, + "step": 7114 + }, + { + "epoch": 0.5636759754406814, + "grad_norm": 1.6019990992716826, + "learning_rate": 8.42969970125749e-06, + "loss": 0.3592, + "step": 7115 + }, + { + "epoch": 0.5637551990493167, + "grad_norm": 1.2272841558527292, + "learning_rate": 8.427165555878577e-06, + "loss": 0.198, + "step": 7116 + }, + { + "epoch": 0.563834422657952, + "grad_norm": 1.3109970490466731, + "learning_rate": 8.424631514063247e-06, + "loss": 0.2084, + "step": 7117 + }, + { + "epoch": 0.5639136462665875, + "grad_norm": 1.482911107116727, + "learning_rate": 8.422097575978349e-06, + "loss": 0.3294, + "step": 7118 + }, + { + "epoch": 0.5639928698752228, + "grad_norm": 1.2879923661507742, + "learning_rate": 8.419563741790735e-06, + "loss": 0.1595, + "step": 7119 + }, + { + "epoch": 0.5640720934838582, + "grad_norm": 1.302712408204975, + "learning_rate": 8.417030011667241e-06, + "loss": 0.2373, + "step": 7120 + }, + { + "epoch": 0.5641513170924936, + "grad_norm": 1.776325427910239, + "learning_rate": 8.414496385774706e-06, + "loss": 0.3503, + "step": 7121 + }, + { + "epoch": 0.564230540701129, + "grad_norm": 1.1671649423199169, + "learning_rate": 8.411962864279957e-06, + "loss": 0.1674, + "step": 7122 + }, + { + "epoch": 0.5643097643097643, + "grad_norm": 1.246239001050154, + "learning_rate": 8.409429447349811e-06, + "loss": 0.2581, + "step": 7123 + }, + { + "epoch": 0.5643889879183996, + "grad_norm": 1.4502591271514103, + "learning_rate": 8.406896135151081e-06, + "loss": 0.2861, + "step": 7124 + }, + { + "epoch": 0.5644682115270351, + "grad_norm": 1.3177987693624105, + "learning_rate": 8.40436292785058e-06, + "loss": 0.2516, + "step": 7125 + }, + { + "epoch": 0.5645474351356704, + "grad_norm": 1.4517774883527377, + "learning_rate": 8.401829825615098e-06, + "loss": 0.305, + "step": 7126 + }, + { + "epoch": 0.5646266587443058, + "grad_norm": 1.3020506327010801, + "learning_rate": 8.399296828611433e-06, + "loss": 0.2396, + "step": 7127 + }, + { + "epoch": 0.5647058823529412, + "grad_norm": 1.147628002492846, + "learning_rate": 8.396763937006369e-06, + "loss": 0.2475, + "step": 7128 + }, + { + "epoch": 0.5647851059615765, + "grad_norm": 1.4342675699159348, + "learning_rate": 8.394231150966685e-06, + "loss": 0.3366, + "step": 7129 + }, + { + "epoch": 0.5648643295702119, + "grad_norm": 1.410060897074877, + "learning_rate": 8.391698470659154e-06, + "loss": 0.2977, + "step": 7130 + }, + { + "epoch": 0.5649435531788473, + "grad_norm": 1.2769640257127037, + "learning_rate": 8.38916589625054e-06, + "loss": 0.2648, + "step": 7131 + }, + { + "epoch": 0.5650227767874827, + "grad_norm": 1.4588174200269972, + "learning_rate": 8.3866334279076e-06, + "loss": 0.3581, + "step": 7132 + }, + { + "epoch": 0.565102000396118, + "grad_norm": 1.5954168370806279, + "learning_rate": 8.384101065797087e-06, + "loss": 0.3426, + "step": 7133 + }, + { + "epoch": 0.5651812240047535, + "grad_norm": 1.321265250035742, + "learning_rate": 8.381568810085745e-06, + "loss": 0.192, + "step": 7134 + }, + { + "epoch": 0.5652604476133888, + "grad_norm": 1.5972391729437088, + "learning_rate": 8.379036660940306e-06, + "loss": 0.3712, + "step": 7135 + }, + { + "epoch": 0.5653396712220241, + "grad_norm": 1.2563572724512664, + "learning_rate": 8.376504618527505e-06, + "loss": 0.2066, + "step": 7136 + }, + { + "epoch": 0.5654188948306595, + "grad_norm": 1.4855726786431513, + "learning_rate": 8.373972683014063e-06, + "loss": 0.3124, + "step": 7137 + }, + { + "epoch": 0.5654981184392949, + "grad_norm": 1.3577756788371746, + "learning_rate": 8.371440854566696e-06, + "loss": 0.2428, + "step": 7138 + }, + { + "epoch": 0.5655773420479303, + "grad_norm": 1.2510886976573299, + "learning_rate": 8.368909133352114e-06, + "loss": 0.2725, + "step": 7139 + }, + { + "epoch": 0.5656565656565656, + "grad_norm": 1.4216449113950804, + "learning_rate": 8.366377519537015e-06, + "loss": 0.2716, + "step": 7140 + }, + { + "epoch": 0.5657357892652011, + "grad_norm": 1.4204817175435025, + "learning_rate": 8.363846013288096e-06, + "loss": 0.2205, + "step": 7141 + }, + { + "epoch": 0.5658150128738364, + "grad_norm": 1.453708105307023, + "learning_rate": 8.361314614772047e-06, + "loss": 0.3083, + "step": 7142 + }, + { + "epoch": 0.5658942364824717, + "grad_norm": 1.4690151780580532, + "learning_rate": 8.358783324155542e-06, + "loss": 0.2427, + "step": 7143 + }, + { + "epoch": 0.5659734600911072, + "grad_norm": 1.6338805263042093, + "learning_rate": 8.35625214160526e-06, + "loss": 0.2832, + "step": 7144 + }, + { + "epoch": 0.5660526836997425, + "grad_norm": 1.6266711394477922, + "learning_rate": 8.353721067287865e-06, + "loss": 0.244, + "step": 7145 + }, + { + "epoch": 0.5661319073083779, + "grad_norm": 1.2933271179787933, + "learning_rate": 8.351190101370016e-06, + "loss": 0.2426, + "step": 7146 + }, + { + "epoch": 0.5662111309170133, + "grad_norm": 1.414727881963241, + "learning_rate": 8.348659244018367e-06, + "loss": 0.2928, + "step": 7147 + }, + { + "epoch": 0.5662903545256487, + "grad_norm": 1.71388820430358, + "learning_rate": 8.34612849539956e-06, + "loss": 0.3315, + "step": 7148 + }, + { + "epoch": 0.566369578134284, + "grad_norm": 1.537900409905728, + "learning_rate": 8.343597855680231e-06, + "loss": 0.2701, + "step": 7149 + }, + { + "epoch": 0.5664488017429193, + "grad_norm": 1.1662149283403267, + "learning_rate": 8.341067325027017e-06, + "loss": 0.2017, + "step": 7150 + }, + { + "epoch": 0.5665280253515548, + "grad_norm": 1.293059542700422, + "learning_rate": 8.338536903606535e-06, + "loss": 0.224, + "step": 7151 + }, + { + "epoch": 0.5666072489601901, + "grad_norm": 1.1617754791487092, + "learning_rate": 8.336006591585406e-06, + "loss": 0.1951, + "step": 7152 + }, + { + "epoch": 0.5666864725688255, + "grad_norm": 1.2810911116043564, + "learning_rate": 8.333476389130234e-06, + "loss": 0.2979, + "step": 7153 + }, + { + "epoch": 0.5667656961774609, + "grad_norm": 1.1212942231610585, + "learning_rate": 8.330946296407622e-06, + "loss": 0.2234, + "step": 7154 + }, + { + "epoch": 0.5668449197860963, + "grad_norm": 1.063026537010679, + "learning_rate": 8.328416313584169e-06, + "loss": 0.1765, + "step": 7155 + }, + { + "epoch": 0.5669241433947316, + "grad_norm": 1.3412520990949484, + "learning_rate": 8.325886440826457e-06, + "loss": 0.218, + "step": 7156 + }, + { + "epoch": 0.567003367003367, + "grad_norm": 1.3402338232670137, + "learning_rate": 8.323356678301067e-06, + "loss": 0.2222, + "step": 7157 + }, + { + "epoch": 0.5670825906120024, + "grad_norm": 1.6127387694985609, + "learning_rate": 8.320827026174572e-06, + "loss": 0.2746, + "step": 7158 + }, + { + "epoch": 0.5671618142206377, + "grad_norm": 1.5460391460996852, + "learning_rate": 8.318297484613538e-06, + "loss": 0.2691, + "step": 7159 + }, + { + "epoch": 0.5672410378292732, + "grad_norm": 1.418653938323995, + "learning_rate": 8.315768053784524e-06, + "loss": 0.2663, + "step": 7160 + }, + { + "epoch": 0.5673202614379085, + "grad_norm": 1.444854532476563, + "learning_rate": 8.313238733854076e-06, + "loss": 0.326, + "step": 7161 + }, + { + "epoch": 0.5673994850465439, + "grad_norm": 1.325639453001486, + "learning_rate": 8.310709524988743e-06, + "loss": 0.2246, + "step": 7162 + }, + { + "epoch": 0.5674787086551792, + "grad_norm": 1.822703768895834, + "learning_rate": 8.308180427355062e-06, + "loss": 0.4237, + "step": 7163 + }, + { + "epoch": 0.5675579322638146, + "grad_norm": 1.4074230301700068, + "learning_rate": 8.305651441119558e-06, + "loss": 0.2216, + "step": 7164 + }, + { + "epoch": 0.56763715587245, + "grad_norm": 1.2876411145605942, + "learning_rate": 8.303122566448754e-06, + "loss": 0.2105, + "step": 7165 + }, + { + "epoch": 0.5677163794810853, + "grad_norm": 1.4467519075522708, + "learning_rate": 8.300593803509163e-06, + "loss": 0.2845, + "step": 7166 + }, + { + "epoch": 0.5677956030897208, + "grad_norm": 1.6318526228488663, + "learning_rate": 8.298065152467293e-06, + "loss": 0.3196, + "step": 7167 + }, + { + "epoch": 0.5678748266983561, + "grad_norm": 1.2155098910124864, + "learning_rate": 8.295536613489645e-06, + "loss": 0.231, + "step": 7168 + }, + { + "epoch": 0.5679540503069915, + "grad_norm": 1.241556015195051, + "learning_rate": 8.293008186742708e-06, + "loss": 0.2237, + "step": 7169 + }, + { + "epoch": 0.5680332739156269, + "grad_norm": 1.196469630551233, + "learning_rate": 8.290479872392969e-06, + "loss": 0.2088, + "step": 7170 + }, + { + "epoch": 0.5681124975242622, + "grad_norm": 1.4211634554594965, + "learning_rate": 8.287951670606905e-06, + "loss": 0.2663, + "step": 7171 + }, + { + "epoch": 0.5681917211328976, + "grad_norm": 1.464335679270111, + "learning_rate": 8.285423581550985e-06, + "loss": 0.2598, + "step": 7172 + }, + { + "epoch": 0.568270944741533, + "grad_norm": 1.1856924588774251, + "learning_rate": 8.282895605391674e-06, + "loss": 0.2235, + "step": 7173 + }, + { + "epoch": 0.5683501683501684, + "grad_norm": 1.297086944415625, + "learning_rate": 8.280367742295424e-06, + "loss": 0.2838, + "step": 7174 + }, + { + "epoch": 0.5684293919588037, + "grad_norm": 1.6851109871436516, + "learning_rate": 8.277839992428683e-06, + "loss": 0.3499, + "step": 7175 + }, + { + "epoch": 0.5685086155674391, + "grad_norm": 1.326205667278145, + "learning_rate": 8.275312355957893e-06, + "loss": 0.2485, + "step": 7176 + }, + { + "epoch": 0.5685878391760745, + "grad_norm": 1.5782902163343802, + "learning_rate": 8.272784833049485e-06, + "loss": 0.2873, + "step": 7177 + }, + { + "epoch": 0.5686670627847098, + "grad_norm": 1.4622161714162363, + "learning_rate": 8.270257423869885e-06, + "loss": 0.2728, + "step": 7178 + }, + { + "epoch": 0.5687462863933452, + "grad_norm": 1.247821175670185, + "learning_rate": 8.267730128585511e-06, + "loss": 0.2187, + "step": 7179 + }, + { + "epoch": 0.5688255100019806, + "grad_norm": 1.5908534638055083, + "learning_rate": 8.265202947362772e-06, + "loss": 0.3639, + "step": 7180 + }, + { + "epoch": 0.568904733610616, + "grad_norm": 1.4814417374129667, + "learning_rate": 8.262675880368074e-06, + "loss": 0.2928, + "step": 7181 + }, + { + "epoch": 0.5689839572192513, + "grad_norm": 1.4497222458343686, + "learning_rate": 8.260148927767807e-06, + "loss": 0.3371, + "step": 7182 + }, + { + "epoch": 0.5690631808278868, + "grad_norm": 1.559701533795049, + "learning_rate": 8.257622089728362e-06, + "loss": 0.3846, + "step": 7183 + }, + { + "epoch": 0.5691424044365221, + "grad_norm": 1.6780123835529779, + "learning_rate": 8.255095366416122e-06, + "loss": 0.3641, + "step": 7184 + }, + { + "epoch": 0.5692216280451574, + "grad_norm": 1.4272228181886968, + "learning_rate": 8.25256875799745e-06, + "loss": 0.2371, + "step": 7185 + }, + { + "epoch": 0.5693008516537928, + "grad_norm": 1.3055538620491638, + "learning_rate": 8.250042264638721e-06, + "loss": 0.2882, + "step": 7186 + }, + { + "epoch": 0.5693800752624282, + "grad_norm": 1.3411328009937957, + "learning_rate": 8.24751588650629e-06, + "loss": 0.3031, + "step": 7187 + }, + { + "epoch": 0.5694592988710636, + "grad_norm": 1.208254297411466, + "learning_rate": 8.244989623766502e-06, + "loss": 0.2666, + "step": 7188 + }, + { + "epoch": 0.5695385224796989, + "grad_norm": 1.6953329772989298, + "learning_rate": 8.242463476585707e-06, + "loss": 0.2908, + "step": 7189 + }, + { + "epoch": 0.5696177460883344, + "grad_norm": 1.2408710405995895, + "learning_rate": 8.239937445130232e-06, + "loss": 0.2431, + "step": 7190 + }, + { + "epoch": 0.5696969696969697, + "grad_norm": 1.3685469666742551, + "learning_rate": 8.237411529566407e-06, + "loss": 0.269, + "step": 7191 + }, + { + "epoch": 0.569776193305605, + "grad_norm": 1.4202059129376712, + "learning_rate": 8.234885730060554e-06, + "loss": 0.2933, + "step": 7192 + }, + { + "epoch": 0.5698554169142405, + "grad_norm": 1.2637095296332983, + "learning_rate": 8.232360046778982e-06, + "loss": 0.244, + "step": 7193 + }, + { + "epoch": 0.5699346405228758, + "grad_norm": 1.3178977720511544, + "learning_rate": 8.229834479887992e-06, + "loss": 0.2497, + "step": 7194 + }, + { + "epoch": 0.5700138641315112, + "grad_norm": 0.99659848784542, + "learning_rate": 8.227309029553889e-06, + "loss": 0.146, + "step": 7195 + }, + { + "epoch": 0.5700930877401466, + "grad_norm": 1.443904702111047, + "learning_rate": 8.224783695942954e-06, + "loss": 0.2321, + "step": 7196 + }, + { + "epoch": 0.570172311348782, + "grad_norm": 1.2633576251709218, + "learning_rate": 8.222258479221473e-06, + "loss": 0.2736, + "step": 7197 + }, + { + "epoch": 0.5702515349574173, + "grad_norm": 1.4130574257940338, + "learning_rate": 8.219733379555715e-06, + "loss": 0.2393, + "step": 7198 + }, + { + "epoch": 0.5703307585660526, + "grad_norm": 1.3170891729454701, + "learning_rate": 8.217208397111948e-06, + "loss": 0.2423, + "step": 7199 + }, + { + "epoch": 0.5704099821746881, + "grad_norm": 1.4416706987205348, + "learning_rate": 8.21468353205643e-06, + "loss": 0.3085, + "step": 7200 + }, + { + "epoch": 0.5704892057833234, + "grad_norm": 1.4266503880870656, + "learning_rate": 8.212158784555412e-06, + "loss": 0.3428, + "step": 7201 + }, + { + "epoch": 0.5705684293919588, + "grad_norm": 1.3676641336902051, + "learning_rate": 8.209634154775134e-06, + "loss": 0.2048, + "step": 7202 + }, + { + "epoch": 0.5706476530005942, + "grad_norm": 1.2522747428937528, + "learning_rate": 8.207109642881836e-06, + "loss": 0.2733, + "step": 7203 + }, + { + "epoch": 0.5707268766092296, + "grad_norm": 1.5419887458613233, + "learning_rate": 8.20458524904174e-06, + "loss": 0.3115, + "step": 7204 + }, + { + "epoch": 0.5708061002178649, + "grad_norm": 1.3122436431156401, + "learning_rate": 8.202060973421064e-06, + "loss": 0.242, + "step": 7205 + }, + { + "epoch": 0.5708853238265003, + "grad_norm": 2.0007015289307954, + "learning_rate": 8.199536816186025e-06, + "loss": 0.2695, + "step": 7206 + }, + { + "epoch": 0.5709645474351357, + "grad_norm": 1.6146837054048737, + "learning_rate": 8.197012777502819e-06, + "loss": 0.3655, + "step": 7207 + }, + { + "epoch": 0.571043771043771, + "grad_norm": 1.4620019476941544, + "learning_rate": 8.194488857537646e-06, + "loss": 0.2243, + "step": 7208 + }, + { + "epoch": 0.5711229946524065, + "grad_norm": 1.565684945572946, + "learning_rate": 8.191965056456699e-06, + "loss": 0.2865, + "step": 7209 + }, + { + "epoch": 0.5712022182610418, + "grad_norm": 1.5097716303231314, + "learning_rate": 8.18944137442615e-06, + "loss": 0.3214, + "step": 7210 + }, + { + "epoch": 0.5712814418696771, + "grad_norm": 1.6280593313768643, + "learning_rate": 8.186917811612173e-06, + "loss": 0.3614, + "step": 7211 + }, + { + "epoch": 0.5713606654783125, + "grad_norm": 1.5799601981222253, + "learning_rate": 8.184394368180937e-06, + "loss": 0.2193, + "step": 7212 + }, + { + "epoch": 0.5714398890869479, + "grad_norm": 1.2025324196971896, + "learning_rate": 8.181871044298594e-06, + "loss": 0.2427, + "step": 7213 + }, + { + "epoch": 0.5715191126955833, + "grad_norm": 1.1725811005798168, + "learning_rate": 8.179347840131297e-06, + "loss": 0.1896, + "step": 7214 + }, + { + "epoch": 0.5715983363042186, + "grad_norm": 1.3815789900598912, + "learning_rate": 8.176824755845183e-06, + "loss": 0.3027, + "step": 7215 + }, + { + "epoch": 0.5716775599128541, + "grad_norm": 1.4111195877702958, + "learning_rate": 8.174301791606384e-06, + "loss": 0.2699, + "step": 7216 + }, + { + "epoch": 0.5717567835214894, + "grad_norm": 1.4074945015660592, + "learning_rate": 8.171778947581032e-06, + "loss": 0.2813, + "step": 7217 + }, + { + "epoch": 0.5718360071301247, + "grad_norm": 1.3231085750695826, + "learning_rate": 8.169256223935236e-06, + "loss": 0.2581, + "step": 7218 + }, + { + "epoch": 0.5719152307387602, + "grad_norm": 1.3957289223898761, + "learning_rate": 8.166733620835107e-06, + "loss": 0.3323, + "step": 7219 + }, + { + "epoch": 0.5719944543473955, + "grad_norm": 1.2571696708555646, + "learning_rate": 8.164211138446753e-06, + "loss": 0.3602, + "step": 7220 + }, + { + "epoch": 0.5720736779560309, + "grad_norm": 1.4226328827209267, + "learning_rate": 8.161688776936259e-06, + "loss": 0.2949, + "step": 7221 + }, + { + "epoch": 0.5721529015646662, + "grad_norm": 1.4221200719212066, + "learning_rate": 8.159166536469717e-06, + "loss": 0.2628, + "step": 7222 + }, + { + "epoch": 0.5722321251733017, + "grad_norm": 1.530727459920596, + "learning_rate": 8.156644417213196e-06, + "loss": 0.284, + "step": 7223 + }, + { + "epoch": 0.572311348781937, + "grad_norm": 1.0356838349467306, + "learning_rate": 8.154122419332772e-06, + "loss": 0.1774, + "step": 7224 + }, + { + "epoch": 0.5723905723905723, + "grad_norm": 1.4099866280482791, + "learning_rate": 8.151600542994506e-06, + "loss": 0.2923, + "step": 7225 + }, + { + "epoch": 0.5724697959992078, + "grad_norm": 1.2734690809589717, + "learning_rate": 8.149078788364451e-06, + "loss": 0.2387, + "step": 7226 + }, + { + "epoch": 0.5725490196078431, + "grad_norm": 1.8625116504696313, + "learning_rate": 8.14655715560865e-06, + "loss": 0.2665, + "step": 7227 + }, + { + "epoch": 0.5726282432164785, + "grad_norm": 1.3887418279282464, + "learning_rate": 8.144035644893143e-06, + "loss": 0.3241, + "step": 7228 + }, + { + "epoch": 0.5727074668251139, + "grad_norm": 1.120719755707284, + "learning_rate": 8.141514256383957e-06, + "loss": 0.234, + "step": 7229 + }, + { + "epoch": 0.5727866904337493, + "grad_norm": 1.2241228828981099, + "learning_rate": 8.138992990247119e-06, + "loss": 0.248, + "step": 7230 + }, + { + "epoch": 0.5728659140423846, + "grad_norm": 1.5606421941372555, + "learning_rate": 8.136471846648633e-06, + "loss": 0.3318, + "step": 7231 + }, + { + "epoch": 0.57294513765102, + "grad_norm": 1.3763982346812798, + "learning_rate": 8.133950825754511e-06, + "loss": 0.3385, + "step": 7232 + }, + { + "epoch": 0.5730243612596554, + "grad_norm": 1.1966355672843383, + "learning_rate": 8.13142992773075e-06, + "loss": 0.263, + "step": 7233 + }, + { + "epoch": 0.5731035848682907, + "grad_norm": 1.4915149852159055, + "learning_rate": 8.128909152743334e-06, + "loss": 0.336, + "step": 7234 + }, + { + "epoch": 0.5731828084769262, + "grad_norm": 1.4870137648498836, + "learning_rate": 8.12638850095825e-06, + "loss": 0.2419, + "step": 7235 + }, + { + "epoch": 0.5732620320855615, + "grad_norm": 1.4582273836264925, + "learning_rate": 8.123867972541466e-06, + "loss": 0.2685, + "step": 7236 + }, + { + "epoch": 0.5733412556941969, + "grad_norm": 1.8251157920250018, + "learning_rate": 8.12134756765895e-06, + "loss": 0.3216, + "step": 7237 + }, + { + "epoch": 0.5734204793028322, + "grad_norm": 1.320568846679935, + "learning_rate": 8.118827286476658e-06, + "loss": 0.3292, + "step": 7238 + }, + { + "epoch": 0.5734997029114676, + "grad_norm": 1.574098794814502, + "learning_rate": 8.116307129160535e-06, + "loss": 0.3553, + "step": 7239 + }, + { + "epoch": 0.573578926520103, + "grad_norm": 1.4276291328495054, + "learning_rate": 8.113787095876525e-06, + "loss": 0.272, + "step": 7240 + }, + { + "epoch": 0.5736581501287383, + "grad_norm": 1.2988499104065252, + "learning_rate": 8.11126718679056e-06, + "loss": 0.2592, + "step": 7241 + }, + { + "epoch": 0.5737373737373738, + "grad_norm": 1.374199117147126, + "learning_rate": 8.10874740206856e-06, + "loss": 0.3144, + "step": 7242 + }, + { + "epoch": 0.5738165973460091, + "grad_norm": 1.5806091196897165, + "learning_rate": 8.106227741876447e-06, + "loss": 0.3672, + "step": 7243 + }, + { + "epoch": 0.5738958209546445, + "grad_norm": 1.480670373717225, + "learning_rate": 8.103708206380123e-06, + "loss": 0.3067, + "step": 7244 + }, + { + "epoch": 0.5739750445632799, + "grad_norm": 1.124948587926902, + "learning_rate": 8.101188795745489e-06, + "loss": 0.2215, + "step": 7245 + }, + { + "epoch": 0.5740542681719152, + "grad_norm": 1.3452714486215582, + "learning_rate": 8.098669510138438e-06, + "loss": 0.2509, + "step": 7246 + }, + { + "epoch": 0.5741334917805506, + "grad_norm": 1.3241386500206564, + "learning_rate": 8.09615034972485e-06, + "loss": 0.3039, + "step": 7247 + }, + { + "epoch": 0.574212715389186, + "grad_norm": 1.6940291424289489, + "learning_rate": 8.093631314670598e-06, + "loss": 0.3379, + "step": 7248 + }, + { + "epoch": 0.5742919389978214, + "grad_norm": 1.3042791043100272, + "learning_rate": 8.091112405141555e-06, + "loss": 0.2406, + "step": 7249 + }, + { + "epoch": 0.5743711626064567, + "grad_norm": 1.4138797091392086, + "learning_rate": 8.088593621303573e-06, + "loss": 0.2819, + "step": 7250 + }, + { + "epoch": 0.5744503862150921, + "grad_norm": 1.1609871664453206, + "learning_rate": 8.086074963322505e-06, + "loss": 0.2709, + "step": 7251 + }, + { + "epoch": 0.5745296098237275, + "grad_norm": 1.417221535180172, + "learning_rate": 8.083556431364191e-06, + "loss": 0.3196, + "step": 7252 + }, + { + "epoch": 0.5746088334323628, + "grad_norm": 1.2280668767312273, + "learning_rate": 8.081038025594464e-06, + "loss": 0.2553, + "step": 7253 + }, + { + "epoch": 0.5746880570409982, + "grad_norm": 1.4324194840009161, + "learning_rate": 8.078519746179153e-06, + "loss": 0.2554, + "step": 7254 + }, + { + "epoch": 0.5747672806496336, + "grad_norm": 1.206722546459733, + "learning_rate": 8.076001593284066e-06, + "loss": 0.2186, + "step": 7255 + }, + { + "epoch": 0.574846504258269, + "grad_norm": 1.2200824157767334, + "learning_rate": 8.073483567075018e-06, + "loss": 0.2474, + "step": 7256 + }, + { + "epoch": 0.5749257278669043, + "grad_norm": 1.1086685057097463, + "learning_rate": 8.070965667717809e-06, + "loss": 0.1861, + "step": 7257 + }, + { + "epoch": 0.5750049514755398, + "grad_norm": 1.4588593881686898, + "learning_rate": 8.06844789537823e-06, + "loss": 0.3247, + "step": 7258 + }, + { + "epoch": 0.5750841750841751, + "grad_norm": 1.6119223480152143, + "learning_rate": 8.065930250222061e-06, + "loss": 0.2647, + "step": 7259 + }, + { + "epoch": 0.5751633986928104, + "grad_norm": 1.3166410465710123, + "learning_rate": 8.063412732415077e-06, + "loss": 0.2338, + "step": 7260 + }, + { + "epoch": 0.5752426223014458, + "grad_norm": 1.6071503680308619, + "learning_rate": 8.060895342123049e-06, + "loss": 0.2933, + "step": 7261 + }, + { + "epoch": 0.5753218459100812, + "grad_norm": 1.5054721075611828, + "learning_rate": 8.058378079511732e-06, + "loss": 0.3032, + "step": 7262 + }, + { + "epoch": 0.5754010695187166, + "grad_norm": 1.2524042612027355, + "learning_rate": 8.055860944746876e-06, + "loss": 0.252, + "step": 7263 + }, + { + "epoch": 0.5754802931273519, + "grad_norm": 1.4458466798033494, + "learning_rate": 8.05334393799422e-06, + "loss": 0.3023, + "step": 7264 + }, + { + "epoch": 0.5755595167359874, + "grad_norm": 1.165159191844685, + "learning_rate": 8.050827059419502e-06, + "loss": 0.2117, + "step": 7265 + }, + { + "epoch": 0.5756387403446227, + "grad_norm": 1.9643178697548813, + "learning_rate": 8.04831030918844e-06, + "loss": 0.4498, + "step": 7266 + }, + { + "epoch": 0.575717963953258, + "grad_norm": 1.3179042212133283, + "learning_rate": 8.045793687466757e-06, + "loss": 0.2585, + "step": 7267 + }, + { + "epoch": 0.5757971875618935, + "grad_norm": 1.216473730912966, + "learning_rate": 8.043277194420155e-06, + "loss": 0.2093, + "step": 7268 + }, + { + "epoch": 0.5758764111705288, + "grad_norm": 1.497106052931288, + "learning_rate": 8.040760830214334e-06, + "loss": 0.272, + "step": 7269 + }, + { + "epoch": 0.5759556347791642, + "grad_norm": 1.1778245759364987, + "learning_rate": 8.038244595014986e-06, + "loss": 0.2396, + "step": 7270 + }, + { + "epoch": 0.5760348583877996, + "grad_norm": 1.183938231519052, + "learning_rate": 8.03572848898779e-06, + "loss": 0.2436, + "step": 7271 + }, + { + "epoch": 0.576114081996435, + "grad_norm": 1.2965263685314983, + "learning_rate": 8.033212512298422e-06, + "loss": 0.2648, + "step": 7272 + }, + { + "epoch": 0.5761933056050703, + "grad_norm": 1.5701755646348086, + "learning_rate": 8.03069666511255e-06, + "loss": 0.3472, + "step": 7273 + }, + { + "epoch": 0.5762725292137056, + "grad_norm": 1.3982646831566523, + "learning_rate": 8.028180947595823e-06, + "loss": 0.2603, + "step": 7274 + }, + { + "epoch": 0.5763517528223411, + "grad_norm": 1.2916567721489904, + "learning_rate": 8.025665359913897e-06, + "loss": 0.2129, + "step": 7275 + }, + { + "epoch": 0.5764309764309764, + "grad_norm": 1.4876092167672121, + "learning_rate": 8.023149902232404e-06, + "loss": 0.3081, + "step": 7276 + }, + { + "epoch": 0.5765102000396118, + "grad_norm": 1.2315801610526786, + "learning_rate": 8.020634574716976e-06, + "loss": 0.1866, + "step": 7277 + }, + { + "epoch": 0.5765894236482472, + "grad_norm": 1.3507424955428773, + "learning_rate": 8.018119377533243e-06, + "loss": 0.2636, + "step": 7278 + }, + { + "epoch": 0.5766686472568826, + "grad_norm": 1.2268210331017644, + "learning_rate": 8.015604310846807e-06, + "loss": 0.2522, + "step": 7279 + }, + { + "epoch": 0.5767478708655179, + "grad_norm": 1.3586737812127547, + "learning_rate": 8.013089374823281e-06, + "loss": 0.2634, + "step": 7280 + }, + { + "epoch": 0.5768270944741533, + "grad_norm": 1.4984117980761424, + "learning_rate": 8.010574569628263e-06, + "loss": 0.2553, + "step": 7281 + }, + { + "epoch": 0.5769063180827887, + "grad_norm": 1.131431007506256, + "learning_rate": 8.008059895427334e-06, + "loss": 0.2147, + "step": 7282 + }, + { + "epoch": 0.576985541691424, + "grad_norm": 1.3008946211435684, + "learning_rate": 8.005545352386077e-06, + "loss": 0.2217, + "step": 7283 + }, + { + "epoch": 0.5770647653000595, + "grad_norm": 1.2098053682786098, + "learning_rate": 8.003030940670061e-06, + "loss": 0.1826, + "step": 7284 + }, + { + "epoch": 0.5771439889086948, + "grad_norm": 1.6230686913290742, + "learning_rate": 8.000516660444848e-06, + "loss": 0.3414, + "step": 7285 + }, + { + "epoch": 0.5772232125173302, + "grad_norm": 1.4830237338734515, + "learning_rate": 7.99800251187599e-06, + "loss": 0.2118, + "step": 7286 + }, + { + "epoch": 0.5773024361259655, + "grad_norm": 1.5285111750596734, + "learning_rate": 7.995488495129039e-06, + "loss": 0.317, + "step": 7287 + }, + { + "epoch": 0.5773816597346009, + "grad_norm": 1.6543401085003322, + "learning_rate": 7.992974610369521e-06, + "loss": 0.3133, + "step": 7288 + }, + { + "epoch": 0.5774608833432363, + "grad_norm": 1.334644616991732, + "learning_rate": 7.990460857762969e-06, + "loss": 0.2308, + "step": 7289 + }, + { + "epoch": 0.5775401069518716, + "grad_norm": 1.1066664730660816, + "learning_rate": 7.987947237474903e-06, + "loss": 0.2713, + "step": 7290 + }, + { + "epoch": 0.5776193305605071, + "grad_norm": 1.4540309533179654, + "learning_rate": 7.985433749670825e-06, + "loss": 0.2909, + "step": 7291 + }, + { + "epoch": 0.5776985541691424, + "grad_norm": 1.3043693364598066, + "learning_rate": 7.982920394516247e-06, + "loss": 0.2544, + "step": 7292 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 1.501627106588649, + "learning_rate": 7.98040717217665e-06, + "loss": 0.3318, + "step": 7293 + }, + { + "epoch": 0.5778570013864132, + "grad_norm": 1.3796591933032745, + "learning_rate": 7.977894082817524e-06, + "loss": 0.3004, + "step": 7294 + }, + { + "epoch": 0.5779362249950485, + "grad_norm": 1.2097643865654226, + "learning_rate": 7.975381126604346e-06, + "loss": 0.2233, + "step": 7295 + }, + { + "epoch": 0.5780154486036839, + "grad_norm": 1.224305721588464, + "learning_rate": 7.972868303702576e-06, + "loss": 0.2541, + "step": 7296 + }, + { + "epoch": 0.5780946722123192, + "grad_norm": 1.365044953551066, + "learning_rate": 7.970355614277674e-06, + "loss": 0.2544, + "step": 7297 + }, + { + "epoch": 0.5781738958209547, + "grad_norm": 1.7013580617683293, + "learning_rate": 7.967843058495092e-06, + "loss": 0.3387, + "step": 7298 + }, + { + "epoch": 0.57825311942959, + "grad_norm": 1.3916278534074553, + "learning_rate": 7.965330636520262e-06, + "loss": 0.2722, + "step": 7299 + }, + { + "epoch": 0.5783323430382253, + "grad_norm": 1.3446890157497295, + "learning_rate": 7.962818348518623e-06, + "loss": 0.3173, + "step": 7300 + }, + { + "epoch": 0.5784115666468608, + "grad_norm": 1.3413083270373292, + "learning_rate": 7.960306194655593e-06, + "loss": 0.223, + "step": 7301 + }, + { + "epoch": 0.5784907902554961, + "grad_norm": 1.3595201204389589, + "learning_rate": 7.957794175096585e-06, + "loss": 0.2963, + "step": 7302 + }, + { + "epoch": 0.5785700138641315, + "grad_norm": 2.071475850875579, + "learning_rate": 7.955282290007006e-06, + "loss": 0.3452, + "step": 7303 + }, + { + "epoch": 0.5786492374727669, + "grad_norm": 1.2413333177478025, + "learning_rate": 7.952770539552246e-06, + "loss": 0.2479, + "step": 7304 + }, + { + "epoch": 0.5787284610814023, + "grad_norm": 1.2790926854684344, + "learning_rate": 7.950258923897695e-06, + "loss": 0.2207, + "step": 7305 + }, + { + "epoch": 0.5788076846900376, + "grad_norm": 1.391275476848362, + "learning_rate": 7.947747443208735e-06, + "loss": 0.2766, + "step": 7306 + }, + { + "epoch": 0.578886908298673, + "grad_norm": 1.1987126672239472, + "learning_rate": 7.945236097650729e-06, + "loss": 0.204, + "step": 7307 + }, + { + "epoch": 0.5789661319073084, + "grad_norm": 1.4113514379631897, + "learning_rate": 7.942724887389041e-06, + "loss": 0.2143, + "step": 7308 + }, + { + "epoch": 0.5790453555159437, + "grad_norm": 1.439208753432546, + "learning_rate": 7.940213812589018e-06, + "loss": 0.2548, + "step": 7309 + }, + { + "epoch": 0.5791245791245792, + "grad_norm": 1.4574424181193621, + "learning_rate": 7.937702873416005e-06, + "loss": 0.2846, + "step": 7310 + }, + { + "epoch": 0.5792038027332145, + "grad_norm": 1.2870828021658083, + "learning_rate": 7.935192070035335e-06, + "loss": 0.2058, + "step": 7311 + }, + { + "epoch": 0.5792830263418499, + "grad_norm": 2.0664833564101626, + "learning_rate": 7.932681402612332e-06, + "loss": 0.5155, + "step": 7312 + }, + { + "epoch": 0.5793622499504852, + "grad_norm": 1.2530629242543592, + "learning_rate": 7.93017087131231e-06, + "loss": 0.1958, + "step": 7313 + }, + { + "epoch": 0.5794414735591206, + "grad_norm": 1.2947756905494958, + "learning_rate": 7.927660476300578e-06, + "loss": 0.2186, + "step": 7314 + }, + { + "epoch": 0.579520697167756, + "grad_norm": 1.2896377557764966, + "learning_rate": 7.925150217742431e-06, + "loss": 0.2673, + "step": 7315 + }, + { + "epoch": 0.5795999207763913, + "grad_norm": 1.4059878447564047, + "learning_rate": 7.92264009580316e-06, + "loss": 0.2946, + "step": 7316 + }, + { + "epoch": 0.5796791443850268, + "grad_norm": 1.495302782265638, + "learning_rate": 7.920130110648044e-06, + "loss": 0.2782, + "step": 7317 + }, + { + "epoch": 0.5797583679936621, + "grad_norm": 1.126325338876518, + "learning_rate": 7.917620262442349e-06, + "loss": 0.1944, + "step": 7318 + }, + { + "epoch": 0.5798375916022975, + "grad_norm": 1.1030712262967364, + "learning_rate": 7.915110551351344e-06, + "loss": 0.1917, + "step": 7319 + }, + { + "epoch": 0.5799168152109329, + "grad_norm": 1.4976207307270517, + "learning_rate": 7.912600977540275e-06, + "loss": 0.3116, + "step": 7320 + }, + { + "epoch": 0.5799960388195682, + "grad_norm": 1.655132309868297, + "learning_rate": 7.910091541174388e-06, + "loss": 0.3465, + "step": 7321 + }, + { + "epoch": 0.5800752624282036, + "grad_norm": 1.689741170290245, + "learning_rate": 7.907582242418916e-06, + "loss": 0.3135, + "step": 7322 + }, + { + "epoch": 0.580154486036839, + "grad_norm": 1.5758540796190785, + "learning_rate": 7.905073081439087e-06, + "loss": 0.2983, + "step": 7323 + }, + { + "epoch": 0.5802337096454744, + "grad_norm": 1.2937678167635196, + "learning_rate": 7.902564058400116e-06, + "loss": 0.2352, + "step": 7324 + }, + { + "epoch": 0.5803129332541097, + "grad_norm": 1.3636209054955986, + "learning_rate": 7.900055173467207e-06, + "loss": 0.2677, + "step": 7325 + }, + { + "epoch": 0.5803921568627451, + "grad_norm": 1.424779845363799, + "learning_rate": 7.897546426805561e-06, + "loss": 0.3104, + "step": 7326 + }, + { + "epoch": 0.5804713804713805, + "grad_norm": 1.3146003884776032, + "learning_rate": 7.89503781858037e-06, + "loss": 0.2458, + "step": 7327 + }, + { + "epoch": 0.5805506040800158, + "grad_norm": 1.4547270498172822, + "learning_rate": 7.892529348956805e-06, + "loss": 0.3257, + "step": 7328 + }, + { + "epoch": 0.5806298276886512, + "grad_norm": 1.0780251596447976, + "learning_rate": 7.890021018100045e-06, + "loss": 0.1597, + "step": 7329 + }, + { + "epoch": 0.5807090512972866, + "grad_norm": 1.325176772734977, + "learning_rate": 7.887512826175247e-06, + "loss": 0.2586, + "step": 7330 + }, + { + "epoch": 0.580788274905922, + "grad_norm": 1.3332015474402934, + "learning_rate": 7.885004773347565e-06, + "loss": 0.228, + "step": 7331 + }, + { + "epoch": 0.5808674985145573, + "grad_norm": 1.6020725920790337, + "learning_rate": 7.882496859782145e-06, + "loss": 0.2523, + "step": 7332 + }, + { + "epoch": 0.5809467221231928, + "grad_norm": 1.161064535292935, + "learning_rate": 7.879989085644114e-06, + "loss": 0.1763, + "step": 7333 + }, + { + "epoch": 0.5810259457318281, + "grad_norm": 1.4079571047952557, + "learning_rate": 7.877481451098602e-06, + "loss": 0.292, + "step": 7334 + }, + { + "epoch": 0.5811051693404634, + "grad_norm": 1.129454161260187, + "learning_rate": 7.874973956310726e-06, + "loss": 0.2076, + "step": 7335 + }, + { + "epoch": 0.5811843929490988, + "grad_norm": 1.5015655702099426, + "learning_rate": 7.872466601445587e-06, + "loss": 0.3853, + "step": 7336 + }, + { + "epoch": 0.5812636165577342, + "grad_norm": 1.2535513976915917, + "learning_rate": 7.869959386668286e-06, + "loss": 0.2212, + "step": 7337 + }, + { + "epoch": 0.5813428401663696, + "grad_norm": 1.4237868641733555, + "learning_rate": 7.86745231214391e-06, + "loss": 0.2518, + "step": 7338 + }, + { + "epoch": 0.5814220637750049, + "grad_norm": 1.541267534677451, + "learning_rate": 7.864945378037538e-06, + "loss": 0.3293, + "step": 7339 + }, + { + "epoch": 0.5815012873836404, + "grad_norm": 1.7285119828871987, + "learning_rate": 7.862438584514242e-06, + "loss": 0.2707, + "step": 7340 + }, + { + "epoch": 0.5815805109922757, + "grad_norm": 1.25944636404642, + "learning_rate": 7.859931931739077e-06, + "loss": 0.2585, + "step": 7341 + }, + { + "epoch": 0.581659734600911, + "grad_norm": 1.2411482866610528, + "learning_rate": 7.857425419877097e-06, + "loss": 0.2326, + "step": 7342 + }, + { + "epoch": 0.5817389582095465, + "grad_norm": 1.203996177566958, + "learning_rate": 7.854919049093345e-06, + "loss": 0.1799, + "step": 7343 + }, + { + "epoch": 0.5818181818181818, + "grad_norm": 1.2776306644459905, + "learning_rate": 7.852412819552853e-06, + "loss": 0.1868, + "step": 7344 + }, + { + "epoch": 0.5818974054268172, + "grad_norm": 1.4108138126115257, + "learning_rate": 7.849906731420642e-06, + "loss": 0.2265, + "step": 7345 + }, + { + "epoch": 0.5819766290354526, + "grad_norm": 1.013774624060726, + "learning_rate": 7.847400784861727e-06, + "loss": 0.1401, + "step": 7346 + }, + { + "epoch": 0.582055852644088, + "grad_norm": 0.9857985470839145, + "learning_rate": 7.844894980041112e-06, + "loss": 0.1526, + "step": 7347 + }, + { + "epoch": 0.5821350762527233, + "grad_norm": 1.1736385670994127, + "learning_rate": 7.842389317123795e-06, + "loss": 0.2057, + "step": 7348 + }, + { + "epoch": 0.5822142998613586, + "grad_norm": 1.2513554917974397, + "learning_rate": 7.839883796274758e-06, + "loss": 0.218, + "step": 7349 + }, + { + "epoch": 0.5822935234699941, + "grad_norm": 1.3033839660867366, + "learning_rate": 7.83737841765898e-06, + "loss": 0.3215, + "step": 7350 + }, + { + "epoch": 0.5823727470786294, + "grad_norm": 1.5224865053561822, + "learning_rate": 7.834873181441426e-06, + "loss": 0.2809, + "step": 7351 + }, + { + "epoch": 0.5824519706872648, + "grad_norm": 1.373630114415987, + "learning_rate": 7.832368087787056e-06, + "loss": 0.2496, + "step": 7352 + }, + { + "epoch": 0.5825311942959002, + "grad_norm": 1.236517881219902, + "learning_rate": 7.82986313686082e-06, + "loss": 0.2146, + "step": 7353 + }, + { + "epoch": 0.5826104179045356, + "grad_norm": 1.3183533586950056, + "learning_rate": 7.82735832882765e-06, + "loss": 0.2503, + "step": 7354 + }, + { + "epoch": 0.5826896415131709, + "grad_norm": 1.3460888533703654, + "learning_rate": 7.824853663852482e-06, + "loss": 0.2735, + "step": 7355 + }, + { + "epoch": 0.5827688651218063, + "grad_norm": 1.1530308237518005, + "learning_rate": 7.822349142100236e-06, + "loss": 0.2242, + "step": 7356 + }, + { + "epoch": 0.5828480887304417, + "grad_norm": 1.1942893244455353, + "learning_rate": 7.819844763735818e-06, + "loss": 0.2023, + "step": 7357 + }, + { + "epoch": 0.582927312339077, + "grad_norm": 1.3560379317248987, + "learning_rate": 7.817340528924132e-06, + "loss": 0.2989, + "step": 7358 + }, + { + "epoch": 0.5830065359477125, + "grad_norm": 1.1286278903322118, + "learning_rate": 7.814836437830074e-06, + "loss": 0.2243, + "step": 7359 + }, + { + "epoch": 0.5830857595563478, + "grad_norm": 1.2320040972737354, + "learning_rate": 7.812332490618521e-06, + "loss": 0.2617, + "step": 7360 + }, + { + "epoch": 0.5831649831649832, + "grad_norm": 1.282660667798146, + "learning_rate": 7.809828687454343e-06, + "loss": 0.2566, + "step": 7361 + }, + { + "epoch": 0.5832442067736185, + "grad_norm": 1.259016238667985, + "learning_rate": 7.807325028502412e-06, + "loss": 0.2135, + "step": 7362 + }, + { + "epoch": 0.5833234303822539, + "grad_norm": 1.879730743599573, + "learning_rate": 7.804821513927574e-06, + "loss": 0.3322, + "step": 7363 + }, + { + "epoch": 0.5834026539908893, + "grad_norm": 1.373046379468216, + "learning_rate": 7.802318143894678e-06, + "loss": 0.259, + "step": 7364 + }, + { + "epoch": 0.5834818775995246, + "grad_norm": 1.1224787103862752, + "learning_rate": 7.799814918568559e-06, + "loss": 0.2102, + "step": 7365 + }, + { + "epoch": 0.5835611012081601, + "grad_norm": 1.2441747236734526, + "learning_rate": 7.797311838114038e-06, + "loss": 0.2351, + "step": 7366 + }, + { + "epoch": 0.5836403248167954, + "grad_norm": 1.2537666663779703, + "learning_rate": 7.794808902695935e-06, + "loss": 0.1883, + "step": 7367 + }, + { + "epoch": 0.5837195484254307, + "grad_norm": 1.31655449917612, + "learning_rate": 7.792306112479055e-06, + "loss": 0.3199, + "step": 7368 + }, + { + "epoch": 0.5837987720340662, + "grad_norm": 1.365052154501949, + "learning_rate": 7.789803467628196e-06, + "loss": 0.2828, + "step": 7369 + }, + { + "epoch": 0.5838779956427015, + "grad_norm": 1.1945704099352312, + "learning_rate": 7.787300968308144e-06, + "loss": 0.2337, + "step": 7370 + }, + { + "epoch": 0.5839572192513369, + "grad_norm": 1.4190714073996595, + "learning_rate": 7.784798614683675e-06, + "loss": 0.2485, + "step": 7371 + }, + { + "epoch": 0.5840364428599722, + "grad_norm": 1.3220178221670316, + "learning_rate": 7.782296406919557e-06, + "loss": 0.221, + "step": 7372 + }, + { + "epoch": 0.5841156664686077, + "grad_norm": 1.045638236908882, + "learning_rate": 7.779794345180552e-06, + "loss": 0.2106, + "step": 7373 + }, + { + "epoch": 0.584194890077243, + "grad_norm": 1.3382782502913058, + "learning_rate": 7.777292429631405e-06, + "loss": 0.1874, + "step": 7374 + }, + { + "epoch": 0.5842741136858783, + "grad_norm": 1.5283453665670712, + "learning_rate": 7.774790660436857e-06, + "loss": 0.3345, + "step": 7375 + }, + { + "epoch": 0.5843533372945138, + "grad_norm": 1.3236706558559976, + "learning_rate": 7.772289037761639e-06, + "loss": 0.2906, + "step": 7376 + }, + { + "epoch": 0.5844325609031491, + "grad_norm": 1.4051159634469799, + "learning_rate": 7.769787561770466e-06, + "loss": 0.2629, + "step": 7377 + }, + { + "epoch": 0.5845117845117845, + "grad_norm": 1.7113763808153784, + "learning_rate": 7.767286232628054e-06, + "loss": 0.4199, + "step": 7378 + }, + { + "epoch": 0.5845910081204199, + "grad_norm": 1.4889223924081287, + "learning_rate": 7.764785050499098e-06, + "loss": 0.2545, + "step": 7379 + }, + { + "epoch": 0.5846702317290553, + "grad_norm": 1.2217819978861575, + "learning_rate": 7.76228401554829e-06, + "loss": 0.2117, + "step": 7380 + }, + { + "epoch": 0.5847494553376906, + "grad_norm": 1.157869752827776, + "learning_rate": 7.759783127940315e-06, + "loss": 0.2249, + "step": 7381 + }, + { + "epoch": 0.584828678946326, + "grad_norm": 1.369051172674697, + "learning_rate": 7.757282387839842e-06, + "loss": 0.3094, + "step": 7382 + }, + { + "epoch": 0.5849079025549614, + "grad_norm": 1.3898218579430637, + "learning_rate": 7.75478179541153e-06, + "loss": 0.2696, + "step": 7383 + }, + { + "epoch": 0.5849871261635967, + "grad_norm": 1.6501107646663669, + "learning_rate": 7.752281350820037e-06, + "loss": 0.3156, + "step": 7384 + }, + { + "epoch": 0.5850663497722322, + "grad_norm": 1.647438200342468, + "learning_rate": 7.749781054229998e-06, + "loss": 0.3584, + "step": 7385 + }, + { + "epoch": 0.5851455733808675, + "grad_norm": 1.4161007283074485, + "learning_rate": 7.747280905806051e-06, + "loss": 0.3003, + "step": 7386 + }, + { + "epoch": 0.5852247969895029, + "grad_norm": 1.2375726643132374, + "learning_rate": 7.744780905712818e-06, + "loss": 0.2281, + "step": 7387 + }, + { + "epoch": 0.5853040205981382, + "grad_norm": 1.3188851682562714, + "learning_rate": 7.742281054114909e-06, + "loss": 0.2709, + "step": 7388 + }, + { + "epoch": 0.5853832442067736, + "grad_norm": 1.447484092421129, + "learning_rate": 7.73978135117693e-06, + "loss": 0.3015, + "step": 7389 + }, + { + "epoch": 0.585462467815409, + "grad_norm": 1.2320822986661275, + "learning_rate": 7.737281797063473e-06, + "loss": 0.2331, + "step": 7390 + }, + { + "epoch": 0.5855416914240443, + "grad_norm": 1.5933305415863923, + "learning_rate": 7.734782391939123e-06, + "loss": 0.263, + "step": 7391 + }, + { + "epoch": 0.5856209150326798, + "grad_norm": 0.9928643675783463, + "learning_rate": 7.732283135968452e-06, + "loss": 0.1524, + "step": 7392 + }, + { + "epoch": 0.5857001386413151, + "grad_norm": 1.309058063745589, + "learning_rate": 7.729784029316025e-06, + "loss": 0.2484, + "step": 7393 + }, + { + "epoch": 0.5857793622499505, + "grad_norm": 1.316573126287946, + "learning_rate": 7.7272850721464e-06, + "loss": 0.3259, + "step": 7394 + }, + { + "epoch": 0.5858585858585859, + "grad_norm": 1.8782328421029293, + "learning_rate": 7.724786264624112e-06, + "loss": 0.4171, + "step": 7395 + }, + { + "epoch": 0.5859378094672212, + "grad_norm": 1.3319756342825644, + "learning_rate": 7.722287606913703e-06, + "loss": 0.2834, + "step": 7396 + }, + { + "epoch": 0.5860170330758566, + "grad_norm": 1.1335744816448825, + "learning_rate": 7.719789099179696e-06, + "loss": 0.2151, + "step": 7397 + }, + { + "epoch": 0.586096256684492, + "grad_norm": 1.2438505256062882, + "learning_rate": 7.717290741586602e-06, + "loss": 0.2575, + "step": 7398 + }, + { + "epoch": 0.5861754802931274, + "grad_norm": 1.3637738624434186, + "learning_rate": 7.714792534298934e-06, + "loss": 0.294, + "step": 7399 + }, + { + "epoch": 0.5862547039017627, + "grad_norm": 1.7378979538084858, + "learning_rate": 7.712294477481177e-06, + "loss": 0.4004, + "step": 7400 + }, + { + "epoch": 0.5863339275103981, + "grad_norm": 1.4891553505268718, + "learning_rate": 7.709796571297823e-06, + "loss": 0.228, + "step": 7401 + }, + { + "epoch": 0.5864131511190335, + "grad_norm": 1.4406014757288916, + "learning_rate": 7.707298815913346e-06, + "loss": 0.3156, + "step": 7402 + }, + { + "epoch": 0.5864923747276688, + "grad_norm": 1.5959946042731206, + "learning_rate": 7.70480121149221e-06, + "loss": 0.3355, + "step": 7403 + }, + { + "epoch": 0.5865715983363042, + "grad_norm": 1.0837926506559399, + "learning_rate": 7.702303758198868e-06, + "loss": 0.2148, + "step": 7404 + }, + { + "epoch": 0.5866508219449396, + "grad_norm": 1.0914135582763453, + "learning_rate": 7.699806456197771e-06, + "loss": 0.1996, + "step": 7405 + }, + { + "epoch": 0.586730045553575, + "grad_norm": 1.84398259220701, + "learning_rate": 7.697309305653348e-06, + "loss": 0.3525, + "step": 7406 + }, + { + "epoch": 0.5868092691622103, + "grad_norm": 1.4811872452462012, + "learning_rate": 7.694812306730031e-06, + "loss": 0.2625, + "step": 7407 + }, + { + "epoch": 0.5868884927708458, + "grad_norm": 1.4527479795667464, + "learning_rate": 7.69231545959223e-06, + "loss": 0.3628, + "step": 7408 + }, + { + "epoch": 0.5869677163794811, + "grad_norm": 1.4336202808042589, + "learning_rate": 7.689818764404351e-06, + "loss": 0.3084, + "step": 7409 + }, + { + "epoch": 0.5870469399881164, + "grad_norm": 1.078452761860379, + "learning_rate": 7.687322221330794e-06, + "loss": 0.1709, + "step": 7410 + }, + { + "epoch": 0.5871261635967518, + "grad_norm": 1.306351426232991, + "learning_rate": 7.684825830535935e-06, + "loss": 0.2292, + "step": 7411 + }, + { + "epoch": 0.5872053872053872, + "grad_norm": 1.3203699356716867, + "learning_rate": 7.682329592184158e-06, + "loss": 0.267, + "step": 7412 + }, + { + "epoch": 0.5872846108140226, + "grad_norm": 1.4417812450917131, + "learning_rate": 7.679833506439826e-06, + "loss": 0.273, + "step": 7413 + }, + { + "epoch": 0.5873638344226579, + "grad_norm": 1.3155191690084824, + "learning_rate": 7.677337573467294e-06, + "loss": 0.2662, + "step": 7414 + }, + { + "epoch": 0.5874430580312934, + "grad_norm": 1.446292602686977, + "learning_rate": 7.674841793430907e-06, + "loss": 0.3361, + "step": 7415 + }, + { + "epoch": 0.5875222816399287, + "grad_norm": 1.6835814421649804, + "learning_rate": 7.672346166494999e-06, + "loss": 0.3915, + "step": 7416 + }, + { + "epoch": 0.587601505248564, + "grad_norm": 1.252841167850413, + "learning_rate": 7.669850692823895e-06, + "loss": 0.2524, + "step": 7417 + }, + { + "epoch": 0.5876807288571995, + "grad_norm": 1.7175642424581115, + "learning_rate": 7.667355372581913e-06, + "loss": 0.2831, + "step": 7418 + }, + { + "epoch": 0.5877599524658348, + "grad_norm": 1.4702081195470056, + "learning_rate": 7.664860205933356e-06, + "loss": 0.3117, + "step": 7419 + }, + { + "epoch": 0.5878391760744702, + "grad_norm": 1.1536574519932967, + "learning_rate": 7.662365193042516e-06, + "loss": 0.2058, + "step": 7420 + }, + { + "epoch": 0.5879183996831056, + "grad_norm": 1.3982714578969981, + "learning_rate": 7.659870334073683e-06, + "loss": 0.2512, + "step": 7421 + }, + { + "epoch": 0.587997623291741, + "grad_norm": 1.1853779223638359, + "learning_rate": 7.657375629191126e-06, + "loss": 0.2034, + "step": 7422 + }, + { + "epoch": 0.5880768469003763, + "grad_norm": 1.4861810625649794, + "learning_rate": 7.654881078559112e-06, + "loss": 0.2838, + "step": 7423 + }, + { + "epoch": 0.5881560705090116, + "grad_norm": 1.385320203350565, + "learning_rate": 7.652386682341895e-06, + "loss": 0.2644, + "step": 7424 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 1.5555083791737685, + "learning_rate": 7.64989244070372e-06, + "loss": 0.2881, + "step": 7425 + }, + { + "epoch": 0.5883145177262824, + "grad_norm": 1.3076146519454994, + "learning_rate": 7.647398353808822e-06, + "loss": 0.2205, + "step": 7426 + }, + { + "epoch": 0.5883937413349178, + "grad_norm": 1.316769571768223, + "learning_rate": 7.644904421821418e-06, + "loss": 0.2126, + "step": 7427 + }, + { + "epoch": 0.5884729649435532, + "grad_norm": 1.3689127671483172, + "learning_rate": 7.642410644905726e-06, + "loss": 0.2671, + "step": 7428 + }, + { + "epoch": 0.5885521885521886, + "grad_norm": 1.0943543827417543, + "learning_rate": 7.639917023225953e-06, + "loss": 0.1929, + "step": 7429 + }, + { + "epoch": 0.5886314121608239, + "grad_norm": 1.6173908518413211, + "learning_rate": 7.637423556946284e-06, + "loss": 0.2325, + "step": 7430 + }, + { + "epoch": 0.5887106357694593, + "grad_norm": 1.3764563617056509, + "learning_rate": 7.63493024623091e-06, + "loss": 0.2506, + "step": 7431 + }, + { + "epoch": 0.5887898593780947, + "grad_norm": 1.284653619218265, + "learning_rate": 7.632437091243996e-06, + "loss": 0.254, + "step": 7432 + }, + { + "epoch": 0.58886908298673, + "grad_norm": 1.3515770516345202, + "learning_rate": 7.629944092149707e-06, + "loss": 0.2726, + "step": 7433 + }, + { + "epoch": 0.5889483065953655, + "grad_norm": 1.270990809515771, + "learning_rate": 7.627451249112199e-06, + "loss": 0.1839, + "step": 7434 + }, + { + "epoch": 0.5890275302040008, + "grad_norm": 1.207246841455816, + "learning_rate": 7.624958562295607e-06, + "loss": 0.2001, + "step": 7435 + }, + { + "epoch": 0.5891067538126362, + "grad_norm": 1.3501580352354754, + "learning_rate": 7.622466031864066e-06, + "loss": 0.1898, + "step": 7436 + }, + { + "epoch": 0.5891859774212715, + "grad_norm": 1.0902484685565283, + "learning_rate": 7.6199736579817005e-06, + "loss": 0.2176, + "step": 7437 + }, + { + "epoch": 0.5892652010299069, + "grad_norm": 1.5473770654886636, + "learning_rate": 7.617481440812617e-06, + "loss": 0.2818, + "step": 7438 + }, + { + "epoch": 0.5893444246385423, + "grad_norm": 1.805836870565018, + "learning_rate": 7.614989380520914e-06, + "loss": 0.3491, + "step": 7439 + }, + { + "epoch": 0.5894236482471776, + "grad_norm": 1.3799479293153125, + "learning_rate": 7.612497477270686e-06, + "loss": 0.2464, + "step": 7440 + }, + { + "epoch": 0.5895028718558131, + "grad_norm": 1.3944719720163752, + "learning_rate": 7.610005731226009e-06, + "loss": 0.2698, + "step": 7441 + }, + { + "epoch": 0.5895820954644484, + "grad_norm": 1.5419217738308628, + "learning_rate": 7.607514142550955e-06, + "loss": 0.3906, + "step": 7442 + }, + { + "epoch": 0.5896613190730838, + "grad_norm": 1.337667194925046, + "learning_rate": 7.605022711409585e-06, + "loss": 0.2321, + "step": 7443 + }, + { + "epoch": 0.5897405426817192, + "grad_norm": 1.2600107445664657, + "learning_rate": 7.602531437965943e-06, + "loss": 0.2493, + "step": 7444 + }, + { + "epoch": 0.5898197662903545, + "grad_norm": 1.3604319618866716, + "learning_rate": 7.6000403223840714e-06, + "loss": 0.2597, + "step": 7445 + }, + { + "epoch": 0.5898989898989899, + "grad_norm": 1.2765888641035044, + "learning_rate": 7.597549364827997e-06, + "loss": 0.242, + "step": 7446 + }, + { + "epoch": 0.5899782135076252, + "grad_norm": 1.3756647494275633, + "learning_rate": 7.595058565461736e-06, + "loss": 0.2702, + "step": 7447 + }, + { + "epoch": 0.5900574371162607, + "grad_norm": 1.5965004058795826, + "learning_rate": 7.5925679244492985e-06, + "loss": 0.3563, + "step": 7448 + }, + { + "epoch": 0.590136660724896, + "grad_norm": 1.2983133756519158, + "learning_rate": 7.5900774419546775e-06, + "loss": 0.3042, + "step": 7449 + }, + { + "epoch": 0.5902158843335313, + "grad_norm": 1.2219598943563936, + "learning_rate": 7.58758711814186e-06, + "loss": 0.173, + "step": 7450 + }, + { + "epoch": 0.5902951079421668, + "grad_norm": 1.486462609571466, + "learning_rate": 7.585096953174827e-06, + "loss": 0.3226, + "step": 7451 + }, + { + "epoch": 0.5903743315508021, + "grad_norm": 1.2695946551624102, + "learning_rate": 7.582606947217537e-06, + "loss": 0.2291, + "step": 7452 + }, + { + "epoch": 0.5904535551594375, + "grad_norm": 1.4351904370970332, + "learning_rate": 7.580117100433947e-06, + "loss": 0.3575, + "step": 7453 + }, + { + "epoch": 0.5905327787680729, + "grad_norm": 1.3127303331222588, + "learning_rate": 7.577627412988005e-06, + "loss": 0.2212, + "step": 7454 + }, + { + "epoch": 0.5906120023767083, + "grad_norm": 1.2318312775641598, + "learning_rate": 7.57513788504364e-06, + "loss": 0.2877, + "step": 7455 + }, + { + "epoch": 0.5906912259853436, + "grad_norm": 2.6279604482854646, + "learning_rate": 7.572648516764778e-06, + "loss": 0.2538, + "step": 7456 + }, + { + "epoch": 0.590770449593979, + "grad_norm": 1.3803850522874588, + "learning_rate": 7.570159308315331e-06, + "loss": 0.2509, + "step": 7457 + }, + { + "epoch": 0.5908496732026144, + "grad_norm": 1.2948777977724644, + "learning_rate": 7.5676702598592025e-06, + "loss": 0.2837, + "step": 7458 + }, + { + "epoch": 0.5909288968112497, + "grad_norm": 1.576605562566521, + "learning_rate": 7.5651813715602855e-06, + "loss": 0.2639, + "step": 7459 + }, + { + "epoch": 0.5910081204198852, + "grad_norm": 1.1023186007731323, + "learning_rate": 7.562692643582456e-06, + "loss": 0.2042, + "step": 7460 + }, + { + "epoch": 0.5910873440285205, + "grad_norm": 1.2512309018769499, + "learning_rate": 7.56020407608959e-06, + "loss": 0.2917, + "step": 7461 + }, + { + "epoch": 0.5911665676371559, + "grad_norm": 1.4407503336677954, + "learning_rate": 7.557715669245547e-06, + "loss": 0.3002, + "step": 7462 + }, + { + "epoch": 0.5912457912457912, + "grad_norm": 1.489865731951598, + "learning_rate": 7.555227423214174e-06, + "loss": 0.2233, + "step": 7463 + }, + { + "epoch": 0.5913250148544266, + "grad_norm": 1.2316773407667512, + "learning_rate": 7.552739338159314e-06, + "loss": 0.167, + "step": 7464 + }, + { + "epoch": 0.591404238463062, + "grad_norm": 1.4010405204446073, + "learning_rate": 7.550251414244791e-06, + "loss": 0.236, + "step": 7465 + }, + { + "epoch": 0.5914834620716973, + "grad_norm": 1.6825773584782147, + "learning_rate": 7.5477636516344255e-06, + "loss": 0.2709, + "step": 7466 + }, + { + "epoch": 0.5915626856803328, + "grad_norm": 1.257420176577219, + "learning_rate": 7.545276050492025e-06, + "loss": 0.2572, + "step": 7467 + }, + { + "epoch": 0.5916419092889681, + "grad_norm": 1.5538640611998669, + "learning_rate": 7.542788610981384e-06, + "loss": 0.3586, + "step": 7468 + }, + { + "epoch": 0.5917211328976035, + "grad_norm": 1.473066563444876, + "learning_rate": 7.540301333266289e-06, + "loss": 0.3298, + "step": 7469 + }, + { + "epoch": 0.5918003565062389, + "grad_norm": 1.157732814223413, + "learning_rate": 7.537814217510518e-06, + "loss": 0.2131, + "step": 7470 + }, + { + "epoch": 0.5918795801148742, + "grad_norm": 1.2652375994673888, + "learning_rate": 7.535327263877832e-06, + "loss": 0.2705, + "step": 7471 + }, + { + "epoch": 0.5919588037235096, + "grad_norm": 1.141362571719065, + "learning_rate": 7.532840472531988e-06, + "loss": 0.2423, + "step": 7472 + }, + { + "epoch": 0.592038027332145, + "grad_norm": 1.7007286739688363, + "learning_rate": 7.530353843636726e-06, + "loss": 0.2879, + "step": 7473 + }, + { + "epoch": 0.5921172509407804, + "grad_norm": 1.3018318599537513, + "learning_rate": 7.52786737735578e-06, + "loss": 0.2549, + "step": 7474 + }, + { + "epoch": 0.5921964745494157, + "grad_norm": 1.1864689912220028, + "learning_rate": 7.525381073852874e-06, + "loss": 0.1985, + "step": 7475 + }, + { + "epoch": 0.5922756981580511, + "grad_norm": 1.4190625680201512, + "learning_rate": 7.522894933291715e-06, + "loss": 0.3076, + "step": 7476 + }, + { + "epoch": 0.5923549217666865, + "grad_norm": 1.2969648037633639, + "learning_rate": 7.5204089558360076e-06, + "loss": 0.2797, + "step": 7477 + }, + { + "epoch": 0.5924341453753218, + "grad_norm": 1.3373739069866868, + "learning_rate": 7.517923141649439e-06, + "loss": 0.2766, + "step": 7478 + }, + { + "epoch": 0.5925133689839572, + "grad_norm": 1.3512528231625365, + "learning_rate": 7.515437490895688e-06, + "loss": 0.36, + "step": 7479 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 1.1546955065879365, + "learning_rate": 7.5129520037384225e-06, + "loss": 0.2742, + "step": 7480 + }, + { + "epoch": 0.592671816201228, + "grad_norm": 1.6571432928098642, + "learning_rate": 7.5104666803413015e-06, + "loss": 0.2883, + "step": 7481 + }, + { + "epoch": 0.5927510398098633, + "grad_norm": 1.0168811704328824, + "learning_rate": 7.50798152086797e-06, + "loss": 0.181, + "step": 7482 + }, + { + "epoch": 0.5928302634184988, + "grad_norm": 1.0660797183668953, + "learning_rate": 7.505496525482066e-06, + "loss": 0.2247, + "step": 7483 + }, + { + "epoch": 0.5929094870271341, + "grad_norm": 1.3884581243400174, + "learning_rate": 7.503011694347212e-06, + "loss": 0.2753, + "step": 7484 + }, + { + "epoch": 0.5929887106357694, + "grad_norm": 1.6672136037078602, + "learning_rate": 7.500527027627025e-06, + "loss": 0.3018, + "step": 7485 + }, + { + "epoch": 0.5930679342444048, + "grad_norm": 1.3825309258770417, + "learning_rate": 7.4980425254851034e-06, + "loss": 0.2729, + "step": 7486 + }, + { + "epoch": 0.5931471578530402, + "grad_norm": 1.3220385655169353, + "learning_rate": 7.495558188085044e-06, + "loss": 0.224, + "step": 7487 + }, + { + "epoch": 0.5932263814616756, + "grad_norm": 1.372592469822823, + "learning_rate": 7.493074015590429e-06, + "loss": 0.2625, + "step": 7488 + }, + { + "epoch": 0.5933056050703109, + "grad_norm": 1.2115544832619078, + "learning_rate": 7.490590008164824e-06, + "loss": 0.2048, + "step": 7489 + }, + { + "epoch": 0.5933848286789464, + "grad_norm": 1.237305794544412, + "learning_rate": 7.488106165971795e-06, + "loss": 0.2589, + "step": 7490 + }, + { + "epoch": 0.5934640522875817, + "grad_norm": 1.3037533439806281, + "learning_rate": 7.485622489174888e-06, + "loss": 0.1868, + "step": 7491 + }, + { + "epoch": 0.593543275896217, + "grad_norm": 1.3264577974516714, + "learning_rate": 7.483138977937643e-06, + "loss": 0.2209, + "step": 7492 + }, + { + "epoch": 0.5936224995048525, + "grad_norm": 1.266194862230873, + "learning_rate": 7.480655632423586e-06, + "loss": 0.1685, + "step": 7493 + }, + { + "epoch": 0.5937017231134878, + "grad_norm": 1.4688133553403262, + "learning_rate": 7.478172452796231e-06, + "loss": 0.281, + "step": 7494 + }, + { + "epoch": 0.5937809467221232, + "grad_norm": 1.2678216071115498, + "learning_rate": 7.475689439219085e-06, + "loss": 0.2251, + "step": 7495 + }, + { + "epoch": 0.5938601703307586, + "grad_norm": 1.2659312085566632, + "learning_rate": 7.473206591855646e-06, + "loss": 0.248, + "step": 7496 + }, + { + "epoch": 0.593939393939394, + "grad_norm": 1.777273256780976, + "learning_rate": 7.470723910869393e-06, + "loss": 0.3508, + "step": 7497 + }, + { + "epoch": 0.5940186175480293, + "grad_norm": 1.3234516144639468, + "learning_rate": 7.468241396423801e-06, + "loss": 0.2617, + "step": 7498 + }, + { + "epoch": 0.5940978411566646, + "grad_norm": 1.2946505327690996, + "learning_rate": 7.465759048682333e-06, + "loss": 0.1795, + "step": 7499 + }, + { + "epoch": 0.5941770647653001, + "grad_norm": 1.332227007812678, + "learning_rate": 7.463276867808435e-06, + "loss": 0.2254, + "step": 7500 + }, + { + "epoch": 0.5942562883739354, + "grad_norm": 1.3911425480582986, + "learning_rate": 7.46079485396555e-06, + "loss": 0.2984, + "step": 7501 + }, + { + "epoch": 0.5943355119825708, + "grad_norm": 1.461404613720531, + "learning_rate": 7.458313007317106e-06, + "loss": 0.3018, + "step": 7502 + }, + { + "epoch": 0.5944147355912062, + "grad_norm": 1.3151399902166259, + "learning_rate": 7.45583132802652e-06, + "loss": 0.2576, + "step": 7503 + }, + { + "epoch": 0.5944939591998416, + "grad_norm": 1.4257537833821978, + "learning_rate": 7.4533498162572004e-06, + "loss": 0.3103, + "step": 7504 + }, + { + "epoch": 0.5945731828084769, + "grad_norm": 1.3807882136832523, + "learning_rate": 7.450868472172541e-06, + "loss": 0.2852, + "step": 7505 + }, + { + "epoch": 0.5946524064171123, + "grad_norm": 1.5790774466013637, + "learning_rate": 7.448387295935926e-06, + "loss": 0.2449, + "step": 7506 + }, + { + "epoch": 0.5947316300257477, + "grad_norm": 1.2052271497834661, + "learning_rate": 7.445906287710733e-06, + "loss": 0.2588, + "step": 7507 + }, + { + "epoch": 0.594810853634383, + "grad_norm": 1.206588330644287, + "learning_rate": 7.443425447660319e-06, + "loss": 0.2394, + "step": 7508 + }, + { + "epoch": 0.5948900772430185, + "grad_norm": 1.4398635310460473, + "learning_rate": 7.4409447759480404e-06, + "loss": 0.2689, + "step": 7509 + }, + { + "epoch": 0.5949693008516538, + "grad_norm": 1.273649803856423, + "learning_rate": 7.438464272737232e-06, + "loss": 0.2313, + "step": 7510 + }, + { + "epoch": 0.5950485244602892, + "grad_norm": 1.3196732345679216, + "learning_rate": 7.435983938191227e-06, + "loss": 0.2606, + "step": 7511 + }, + { + "epoch": 0.5951277480689245, + "grad_norm": 1.41977868196511, + "learning_rate": 7.433503772473343e-06, + "loss": 0.3115, + "step": 7512 + }, + { + "epoch": 0.5952069716775599, + "grad_norm": 1.5681720250193947, + "learning_rate": 7.431023775746886e-06, + "loss": 0.3535, + "step": 7513 + }, + { + "epoch": 0.5952861952861953, + "grad_norm": 1.3059631707700323, + "learning_rate": 7.428543948175151e-06, + "loss": 0.2478, + "step": 7514 + }, + { + "epoch": 0.5953654188948306, + "grad_norm": 1.4592618113563653, + "learning_rate": 7.426064289921429e-06, + "loss": 0.2491, + "step": 7515 + }, + { + "epoch": 0.5954446425034661, + "grad_norm": 1.5934290539020544, + "learning_rate": 7.423584801148985e-06, + "loss": 0.3142, + "step": 7516 + }, + { + "epoch": 0.5955238661121014, + "grad_norm": 1.2059085233451832, + "learning_rate": 7.421105482021084e-06, + "loss": 0.2405, + "step": 7517 + }, + { + "epoch": 0.5956030897207368, + "grad_norm": 1.0503339324969476, + "learning_rate": 7.41862633270098e-06, + "loss": 0.1798, + "step": 7518 + }, + { + "epoch": 0.5956823133293722, + "grad_norm": 1.179440912848171, + "learning_rate": 7.416147353351909e-06, + "loss": 0.2606, + "step": 7519 + }, + { + "epoch": 0.5957615369380075, + "grad_norm": 1.7951957662791829, + "learning_rate": 7.4136685441371025e-06, + "loss": 0.3119, + "step": 7520 + }, + { + "epoch": 0.5958407605466429, + "grad_norm": 1.2866725077806018, + "learning_rate": 7.41118990521978e-06, + "loss": 0.2682, + "step": 7521 + }, + { + "epoch": 0.5959199841552782, + "grad_norm": 1.4937678006327972, + "learning_rate": 7.408711436763143e-06, + "loss": 0.3004, + "step": 7522 + }, + { + "epoch": 0.5959992077639137, + "grad_norm": 1.2954424479938147, + "learning_rate": 7.406233138930389e-06, + "loss": 0.2728, + "step": 7523 + }, + { + "epoch": 0.596078431372549, + "grad_norm": 1.051369170825376, + "learning_rate": 7.4037550118847044e-06, + "loss": 0.2164, + "step": 7524 + }, + { + "epoch": 0.5961576549811844, + "grad_norm": 1.4811417945690901, + "learning_rate": 7.401277055789259e-06, + "loss": 0.291, + "step": 7525 + }, + { + "epoch": 0.5962368785898198, + "grad_norm": 1.4524776438044589, + "learning_rate": 7.398799270807217e-06, + "loss": 0.268, + "step": 7526 + }, + { + "epoch": 0.5963161021984551, + "grad_norm": 1.5893021749315575, + "learning_rate": 7.3963216571017235e-06, + "loss": 0.3047, + "step": 7527 + }, + { + "epoch": 0.5963953258070905, + "grad_norm": 1.2217490114807055, + "learning_rate": 7.3938442148359215e-06, + "loss": 0.2327, + "step": 7528 + }, + { + "epoch": 0.5964745494157259, + "grad_norm": 1.461490164975469, + "learning_rate": 7.391366944172941e-06, + "loss": 0.2637, + "step": 7529 + }, + { + "epoch": 0.5965537730243613, + "grad_norm": 1.6686776838493305, + "learning_rate": 7.388889845275893e-06, + "loss": 0.3454, + "step": 7530 + }, + { + "epoch": 0.5966329966329966, + "grad_norm": 1.6606977997013623, + "learning_rate": 7.3864129183078835e-06, + "loss": 0.3359, + "step": 7531 + }, + { + "epoch": 0.596712220241632, + "grad_norm": 1.4439122612478255, + "learning_rate": 7.38393616343201e-06, + "loss": 0.2494, + "step": 7532 + }, + { + "epoch": 0.5967914438502674, + "grad_norm": 1.1928585321778848, + "learning_rate": 7.381459580811352e-06, + "loss": 0.2281, + "step": 7533 + }, + { + "epoch": 0.5968706674589027, + "grad_norm": 1.195981238174483, + "learning_rate": 7.378983170608982e-06, + "loss": 0.2145, + "step": 7534 + }, + { + "epoch": 0.5969498910675382, + "grad_norm": 1.2098194020188708, + "learning_rate": 7.376506932987956e-06, + "loss": 0.2793, + "step": 7535 + }, + { + "epoch": 0.5970291146761735, + "grad_norm": 1.162930319798335, + "learning_rate": 7.374030868111326e-06, + "loss": 0.2118, + "step": 7536 + }, + { + "epoch": 0.5971083382848089, + "grad_norm": 1.3675315864210198, + "learning_rate": 7.371554976142128e-06, + "loss": 0.2394, + "step": 7537 + }, + { + "epoch": 0.5971875618934442, + "grad_norm": 1.3157797402425673, + "learning_rate": 7.369079257243388e-06, + "loss": 0.262, + "step": 7538 + }, + { + "epoch": 0.5972667855020796, + "grad_norm": 1.241262629533937, + "learning_rate": 7.366603711578119e-06, + "loss": 0.1741, + "step": 7539 + }, + { + "epoch": 0.597346009110715, + "grad_norm": 1.0737004901610891, + "learning_rate": 7.364128339309326e-06, + "loss": 0.1747, + "step": 7540 + }, + { + "epoch": 0.5974252327193503, + "grad_norm": 1.6515854999616628, + "learning_rate": 7.361653140599997e-06, + "loss": 0.2809, + "step": 7541 + }, + { + "epoch": 0.5975044563279858, + "grad_norm": 1.214640346893805, + "learning_rate": 7.359178115613116e-06, + "loss": 0.2101, + "step": 7542 + }, + { + "epoch": 0.5975836799366211, + "grad_norm": 1.5687227286702028, + "learning_rate": 7.356703264511646e-06, + "loss": 0.3295, + "step": 7543 + }, + { + "epoch": 0.5976629035452565, + "grad_norm": 1.3371952710416235, + "learning_rate": 7.354228587458549e-06, + "loss": 0.2467, + "step": 7544 + }, + { + "epoch": 0.5977421271538919, + "grad_norm": 1.6742897953652043, + "learning_rate": 7.351754084616771e-06, + "loss": 0.2696, + "step": 7545 + }, + { + "epoch": 0.5978213507625272, + "grad_norm": 1.6202427255169483, + "learning_rate": 7.349279756149241e-06, + "loss": 0.294, + "step": 7546 + }, + { + "epoch": 0.5979005743711626, + "grad_norm": 1.145232105313392, + "learning_rate": 7.346805602218885e-06, + "loss": 0.1856, + "step": 7547 + }, + { + "epoch": 0.597979797979798, + "grad_norm": 1.2489985364064866, + "learning_rate": 7.344331622988616e-06, + "loss": 0.1871, + "step": 7548 + }, + { + "epoch": 0.5980590215884334, + "grad_norm": 1.190098770449158, + "learning_rate": 7.341857818621328e-06, + "loss": 0.1944, + "step": 7549 + }, + { + "epoch": 0.5981382451970687, + "grad_norm": 1.1875903422994931, + "learning_rate": 7.339384189279917e-06, + "loss": 0.1735, + "step": 7550 + }, + { + "epoch": 0.5982174688057041, + "grad_norm": 1.1442166451653273, + "learning_rate": 7.33691073512725e-06, + "loss": 0.2234, + "step": 7551 + }, + { + "epoch": 0.5982966924143395, + "grad_norm": 1.2501289177988377, + "learning_rate": 7.3344374563262e-06, + "loss": 0.2216, + "step": 7552 + }, + { + "epoch": 0.5983759160229748, + "grad_norm": 1.2439122423072726, + "learning_rate": 7.3319643530396175e-06, + "loss": 0.2552, + "step": 7553 + }, + { + "epoch": 0.5984551396316102, + "grad_norm": 1.3609535658775174, + "learning_rate": 7.329491425430344e-06, + "loss": 0.2249, + "step": 7554 + }, + { + "epoch": 0.5985343632402456, + "grad_norm": 1.7554836451089235, + "learning_rate": 7.327018673661209e-06, + "loss": 0.3718, + "step": 7555 + }, + { + "epoch": 0.598613586848881, + "grad_norm": 1.3091489250488109, + "learning_rate": 7.324546097895036e-06, + "loss": 0.2797, + "step": 7556 + }, + { + "epoch": 0.5986928104575163, + "grad_norm": 1.2228300459839474, + "learning_rate": 7.3220736982946275e-06, + "loss": 0.2146, + "step": 7557 + }, + { + "epoch": 0.5987720340661518, + "grad_norm": 1.4658117667642692, + "learning_rate": 7.3196014750227815e-06, + "loss": 0.3217, + "step": 7558 + }, + { + "epoch": 0.5988512576747871, + "grad_norm": 1.0709370498880546, + "learning_rate": 7.317129428242279e-06, + "loss": 0.219, + "step": 7559 + }, + { + "epoch": 0.5989304812834224, + "grad_norm": 1.5411691154734617, + "learning_rate": 7.3146575581158945e-06, + "loss": 0.2521, + "step": 7560 + }, + { + "epoch": 0.5990097048920578, + "grad_norm": 1.507784814225209, + "learning_rate": 7.312185864806391e-06, + "loss": 0.284, + "step": 7561 + }, + { + "epoch": 0.5990889285006932, + "grad_norm": 1.5397344550307526, + "learning_rate": 7.309714348476513e-06, + "loss": 0.2678, + "step": 7562 + }, + { + "epoch": 0.5991681521093286, + "grad_norm": 1.2582193892350415, + "learning_rate": 7.307243009289005e-06, + "loss": 0.2144, + "step": 7563 + }, + { + "epoch": 0.5992473757179639, + "grad_norm": 1.29034387370778, + "learning_rate": 7.304771847406582e-06, + "loss": 0.2591, + "step": 7564 + }, + { + "epoch": 0.5993265993265994, + "grad_norm": 1.2722615946466806, + "learning_rate": 7.3023008629919665e-06, + "loss": 0.222, + "step": 7565 + }, + { + "epoch": 0.5994058229352347, + "grad_norm": 1.185571388012682, + "learning_rate": 7.299830056207861e-06, + "loss": 0.1994, + "step": 7566 + }, + { + "epoch": 0.59948504654387, + "grad_norm": 1.1144300415417774, + "learning_rate": 7.29735942721695e-06, + "loss": 0.2111, + "step": 7567 + }, + { + "epoch": 0.5995642701525055, + "grad_norm": 1.6675233219565557, + "learning_rate": 7.294888976181919e-06, + "loss": 0.3327, + "step": 7568 + }, + { + "epoch": 0.5996434937611408, + "grad_norm": 1.4716918778419212, + "learning_rate": 7.2924187032654335e-06, + "loss": 0.2673, + "step": 7569 + }, + { + "epoch": 0.5997227173697762, + "grad_norm": 1.2864802650628842, + "learning_rate": 7.289948608630146e-06, + "loss": 0.2492, + "step": 7570 + }, + { + "epoch": 0.5998019409784116, + "grad_norm": 1.9405470338841433, + "learning_rate": 7.287478692438705e-06, + "loss": 0.3413, + "step": 7571 + }, + { + "epoch": 0.599881164587047, + "grad_norm": 1.3919885563331966, + "learning_rate": 7.285008954853739e-06, + "loss": 0.2946, + "step": 7572 + }, + { + "epoch": 0.5999603881956823, + "grad_norm": 1.3527190620007843, + "learning_rate": 7.282539396037868e-06, + "loss": 0.25, + "step": 7573 + }, + { + "epoch": 0.6000396118043176, + "grad_norm": 1.3575830560168818, + "learning_rate": 7.280070016153706e-06, + "loss": 0.247, + "step": 7574 + }, + { + "epoch": 0.6001188354129531, + "grad_norm": 1.1074610957125472, + "learning_rate": 7.277600815363842e-06, + "loss": 0.1799, + "step": 7575 + }, + { + "epoch": 0.6001980590215884, + "grad_norm": 1.4974324744478347, + "learning_rate": 7.275131793830865e-06, + "loss": 0.2427, + "step": 7576 + }, + { + "epoch": 0.6002772826302238, + "grad_norm": 1.4819571534446438, + "learning_rate": 7.272662951717352e-06, + "loss": 0.285, + "step": 7577 + }, + { + "epoch": 0.6003565062388592, + "grad_norm": 1.1200596221058652, + "learning_rate": 7.270194289185858e-06, + "loss": 0.1978, + "step": 7578 + }, + { + "epoch": 0.6004357298474946, + "grad_norm": 1.4567852423629415, + "learning_rate": 7.267725806398936e-06, + "loss": 0.2565, + "step": 7579 + }, + { + "epoch": 0.6005149534561299, + "grad_norm": 1.6309369064281796, + "learning_rate": 7.265257503519122e-06, + "loss": 0.2526, + "step": 7580 + }, + { + "epoch": 0.6005941770647653, + "grad_norm": 1.4367210835566138, + "learning_rate": 7.262789380708942e-06, + "loss": 0.3525, + "step": 7581 + }, + { + "epoch": 0.6006734006734007, + "grad_norm": 1.2838400073077956, + "learning_rate": 7.260321438130913e-06, + "loss": 0.2382, + "step": 7582 + }, + { + "epoch": 0.600752624282036, + "grad_norm": 1.3781989739168623, + "learning_rate": 7.257853675947533e-06, + "loss": 0.2087, + "step": 7583 + }, + { + "epoch": 0.6008318478906715, + "grad_norm": 1.3718476582603227, + "learning_rate": 7.255386094321293e-06, + "loss": 0.2938, + "step": 7584 + }, + { + "epoch": 0.6009110714993068, + "grad_norm": 1.2773596554900355, + "learning_rate": 7.2529186934146756e-06, + "loss": 0.1827, + "step": 7585 + }, + { + "epoch": 0.6009902951079422, + "grad_norm": 1.4263887915558608, + "learning_rate": 7.250451473390141e-06, + "loss": 0.2592, + "step": 7586 + }, + { + "epoch": 0.6010695187165775, + "grad_norm": 1.2470390430746008, + "learning_rate": 7.24798443441015e-06, + "loss": 0.2346, + "step": 7587 + }, + { + "epoch": 0.6011487423252129, + "grad_norm": 1.6550969227994585, + "learning_rate": 7.24551757663714e-06, + "loss": 0.2215, + "step": 7588 + }, + { + "epoch": 0.6012279659338483, + "grad_norm": 1.2391121337914008, + "learning_rate": 7.2430509002335434e-06, + "loss": 0.2657, + "step": 7589 + }, + { + "epoch": 0.6013071895424836, + "grad_norm": 1.4223756864133885, + "learning_rate": 7.240584405361781e-06, + "loss": 0.2925, + "step": 7590 + }, + { + "epoch": 0.6013864131511191, + "grad_norm": 1.1830896443593542, + "learning_rate": 7.238118092184256e-06, + "loss": 0.2278, + "step": 7591 + }, + { + "epoch": 0.6014656367597544, + "grad_norm": 1.31474535313264, + "learning_rate": 7.2356519608633665e-06, + "loss": 0.2805, + "step": 7592 + }, + { + "epoch": 0.6015448603683898, + "grad_norm": 1.5988782456451411, + "learning_rate": 7.233186011561498e-06, + "loss": 0.4399, + "step": 7593 + }, + { + "epoch": 0.6016240839770252, + "grad_norm": 1.237152187497868, + "learning_rate": 7.230720244441016e-06, + "loss": 0.2222, + "step": 7594 + }, + { + "epoch": 0.6017033075856605, + "grad_norm": 1.4422977852783678, + "learning_rate": 7.228254659664278e-06, + "loss": 0.3135, + "step": 7595 + }, + { + "epoch": 0.6017825311942959, + "grad_norm": 1.2867701459483227, + "learning_rate": 7.225789257393636e-06, + "loss": 0.2205, + "step": 7596 + }, + { + "epoch": 0.6018617548029312, + "grad_norm": 1.358466347550248, + "learning_rate": 7.223324037791421e-06, + "loss": 0.288, + "step": 7597 + }, + { + "epoch": 0.6019409784115667, + "grad_norm": 1.4813948425701016, + "learning_rate": 7.220859001019957e-06, + "loss": 0.2921, + "step": 7598 + }, + { + "epoch": 0.602020202020202, + "grad_norm": 1.475011915392138, + "learning_rate": 7.218394147241559e-06, + "loss": 0.2789, + "step": 7599 + }, + { + "epoch": 0.6020994256288374, + "grad_norm": 1.4199374867012704, + "learning_rate": 7.2159294766185174e-06, + "loss": 0.2562, + "step": 7600 + }, + { + "epoch": 0.6021786492374728, + "grad_norm": 1.1078324594846334, + "learning_rate": 7.213464989313126e-06, + "loss": 0.2555, + "step": 7601 + }, + { + "epoch": 0.6022578728461081, + "grad_norm": 1.205078372849288, + "learning_rate": 7.211000685487658e-06, + "loss": 0.257, + "step": 7602 + }, + { + "epoch": 0.6023370964547435, + "grad_norm": 1.3474617361512962, + "learning_rate": 7.208536565304374e-06, + "loss": 0.3349, + "step": 7603 + }, + { + "epoch": 0.6024163200633789, + "grad_norm": 1.296315277738826, + "learning_rate": 7.206072628925526e-06, + "loss": 0.2403, + "step": 7604 + }, + { + "epoch": 0.6024955436720143, + "grad_norm": 1.2907027882147537, + "learning_rate": 7.203608876513351e-06, + "loss": 0.2365, + "step": 7605 + }, + { + "epoch": 0.6025747672806496, + "grad_norm": 1.2602089604938873, + "learning_rate": 7.201145308230075e-06, + "loss": 0.1982, + "step": 7606 + }, + { + "epoch": 0.602653990889285, + "grad_norm": 1.482015580486737, + "learning_rate": 7.198681924237918e-06, + "loss": 0.1965, + "step": 7607 + }, + { + "epoch": 0.6027332144979204, + "grad_norm": 1.3410721016847686, + "learning_rate": 7.196218724699072e-06, + "loss": 0.2991, + "step": 7608 + }, + { + "epoch": 0.6028124381065557, + "grad_norm": 1.3224091380902647, + "learning_rate": 7.193755709775734e-06, + "loss": 0.2715, + "step": 7609 + }, + { + "epoch": 0.6028916617151912, + "grad_norm": 1.3255702135615999, + "learning_rate": 7.191292879630081e-06, + "loss": 0.3004, + "step": 7610 + }, + { + "epoch": 0.6029708853238265, + "grad_norm": 1.0371932607578918, + "learning_rate": 7.188830234424275e-06, + "loss": 0.1657, + "step": 7611 + }, + { + "epoch": 0.6030501089324619, + "grad_norm": 1.5098590101199332, + "learning_rate": 7.186367774320474e-06, + "loss": 0.3258, + "step": 7612 + }, + { + "epoch": 0.6031293325410972, + "grad_norm": 1.5422270160827176, + "learning_rate": 7.1839054994808145e-06, + "loss": 0.3902, + "step": 7613 + }, + { + "epoch": 0.6032085561497326, + "grad_norm": 1.312643343052515, + "learning_rate": 7.181443410067428e-06, + "loss": 0.1997, + "step": 7614 + }, + { + "epoch": 0.603287779758368, + "grad_norm": 1.417290301611864, + "learning_rate": 7.1789815062424325e-06, + "loss": 0.2415, + "step": 7615 + }, + { + "epoch": 0.6033670033670033, + "grad_norm": 1.4662280227092654, + "learning_rate": 7.176519788167929e-06, + "loss": 0.2777, + "step": 7616 + }, + { + "epoch": 0.6034462269756388, + "grad_norm": 1.652959151890591, + "learning_rate": 7.174058256006012e-06, + "loss": 0.3207, + "step": 7617 + }, + { + "epoch": 0.6035254505842741, + "grad_norm": 1.3832409994574812, + "learning_rate": 7.171596909918763e-06, + "loss": 0.2664, + "step": 7618 + }, + { + "epoch": 0.6036046741929095, + "grad_norm": 1.4818966972735843, + "learning_rate": 7.169135750068247e-06, + "loss": 0.3078, + "step": 7619 + }, + { + "epoch": 0.6036838978015449, + "grad_norm": 1.3124057107514822, + "learning_rate": 7.1666747766165226e-06, + "loss": 0.3103, + "step": 7620 + }, + { + "epoch": 0.6037631214101802, + "grad_norm": 1.2642671973250477, + "learning_rate": 7.164213989725628e-06, + "loss": 0.229, + "step": 7621 + }, + { + "epoch": 0.6038423450188156, + "grad_norm": 1.4475338696578857, + "learning_rate": 7.1617533895575975e-06, + "loss": 0.2366, + "step": 7622 + }, + { + "epoch": 0.6039215686274509, + "grad_norm": 1.1362475466523527, + "learning_rate": 7.1592929762744515e-06, + "loss": 0.249, + "step": 7623 + }, + { + "epoch": 0.6040007922360864, + "grad_norm": 1.4969631805560057, + "learning_rate": 7.156832750038192e-06, + "loss": 0.3206, + "step": 7624 + }, + { + "epoch": 0.6040800158447217, + "grad_norm": 1.480775787039677, + "learning_rate": 7.154372711010815e-06, + "loss": 0.3075, + "step": 7625 + }, + { + "epoch": 0.6041592394533571, + "grad_norm": 1.0438062446654606, + "learning_rate": 7.1519128593543065e-06, + "loss": 0.2199, + "step": 7626 + }, + { + "epoch": 0.6042384630619925, + "grad_norm": 1.2425749351325481, + "learning_rate": 7.149453195230629e-06, + "loss": 0.2482, + "step": 7627 + }, + { + "epoch": 0.6043176866706278, + "grad_norm": 1.5276301750159949, + "learning_rate": 7.1469937188017444e-06, + "loss": 0.3, + "step": 7628 + }, + { + "epoch": 0.6043969102792632, + "grad_norm": 1.3645058575409816, + "learning_rate": 7.144534430229595e-06, + "loss": 0.2666, + "step": 7629 + }, + { + "epoch": 0.6044761338878986, + "grad_norm": 1.3346557313714589, + "learning_rate": 7.142075329676112e-06, + "loss": 0.3015, + "step": 7630 + }, + { + "epoch": 0.604555357496534, + "grad_norm": 1.5178466886404989, + "learning_rate": 7.139616417303221e-06, + "loss": 0.3404, + "step": 7631 + }, + { + "epoch": 0.6046345811051693, + "grad_norm": 1.5751998276511372, + "learning_rate": 7.137157693272822e-06, + "loss": 0.319, + "step": 7632 + }, + { + "epoch": 0.6047138047138048, + "grad_norm": 1.2243817083712045, + "learning_rate": 7.1346991577468136e-06, + "loss": 0.2456, + "step": 7633 + }, + { + "epoch": 0.6047930283224401, + "grad_norm": 1.576997019983935, + "learning_rate": 7.132240810887083e-06, + "loss": 0.3, + "step": 7634 + }, + { + "epoch": 0.6048722519310754, + "grad_norm": 1.276460742090886, + "learning_rate": 7.129782652855492e-06, + "loss": 0.2325, + "step": 7635 + }, + { + "epoch": 0.6049514755397108, + "grad_norm": 1.2307566147077864, + "learning_rate": 7.127324683813906e-06, + "loss": 0.2649, + "step": 7636 + }, + { + "epoch": 0.6050306991483462, + "grad_norm": 1.4509190114186126, + "learning_rate": 7.124866903924164e-06, + "loss": 0.2656, + "step": 7637 + }, + { + "epoch": 0.6051099227569816, + "grad_norm": 1.1120443691618007, + "learning_rate": 7.122409313348102e-06, + "loss": 0.183, + "step": 7638 + }, + { + "epoch": 0.6051891463656169, + "grad_norm": 1.5302864414371933, + "learning_rate": 7.119951912247545e-06, + "loss": 0.3274, + "step": 7639 + }, + { + "epoch": 0.6052683699742524, + "grad_norm": 1.2440057244227303, + "learning_rate": 7.117494700784292e-06, + "loss": 0.2593, + "step": 7640 + }, + { + "epoch": 0.6053475935828877, + "grad_norm": 1.2163018543505084, + "learning_rate": 7.115037679120147e-06, + "loss": 0.2439, + "step": 7641 + }, + { + "epoch": 0.605426817191523, + "grad_norm": 1.3989399837038783, + "learning_rate": 7.112580847416886e-06, + "loss": 0.2719, + "step": 7642 + }, + { + "epoch": 0.6055060408001585, + "grad_norm": 1.451482042129622, + "learning_rate": 7.110124205836283e-06, + "loss": 0.2781, + "step": 7643 + }, + { + "epoch": 0.6055852644087938, + "grad_norm": 1.1850817091931767, + "learning_rate": 7.107667754540097e-06, + "loss": 0.1898, + "step": 7644 + }, + { + "epoch": 0.6056644880174292, + "grad_norm": 1.3807420375681736, + "learning_rate": 7.105211493690073e-06, + "loss": 0.2276, + "step": 7645 + }, + { + "epoch": 0.6057437116260646, + "grad_norm": 1.2778309598246989, + "learning_rate": 7.102755423447941e-06, + "loss": 0.3443, + "step": 7646 + }, + { + "epoch": 0.6058229352347, + "grad_norm": 1.3612511716945264, + "learning_rate": 7.100299543975426e-06, + "loss": 0.2833, + "step": 7647 + }, + { + "epoch": 0.6059021588433353, + "grad_norm": 1.3438388616097634, + "learning_rate": 7.097843855434232e-06, + "loss": 0.238, + "step": 7648 + }, + { + "epoch": 0.6059813824519706, + "grad_norm": 1.06734096536117, + "learning_rate": 7.09538835798606e-06, + "loss": 0.2218, + "step": 7649 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 1.7653414371877851, + "learning_rate": 7.092933051792583e-06, + "loss": 0.3245, + "step": 7650 + }, + { + "epoch": 0.6061398296692414, + "grad_norm": 1.4484162132772, + "learning_rate": 7.090477937015479e-06, + "loss": 0.2901, + "step": 7651 + }, + { + "epoch": 0.6062190532778768, + "grad_norm": 1.4631815501374397, + "learning_rate": 7.088023013816403e-06, + "loss": 0.2601, + "step": 7652 + }, + { + "epoch": 0.6062982768865122, + "grad_norm": 1.302104715331595, + "learning_rate": 7.085568282357e-06, + "loss": 0.1908, + "step": 7653 + }, + { + "epoch": 0.6063775004951476, + "grad_norm": 1.1477863929626102, + "learning_rate": 7.083113742798901e-06, + "loss": 0.2326, + "step": 7654 + }, + { + "epoch": 0.6064567241037829, + "grad_norm": 1.3271057650593732, + "learning_rate": 7.080659395303729e-06, + "loss": 0.239, + "step": 7655 + }, + { + "epoch": 0.6065359477124183, + "grad_norm": 1.4422446146305785, + "learning_rate": 7.078205240033087e-06, + "loss": 0.3386, + "step": 7656 + }, + { + "epoch": 0.6066151713210537, + "grad_norm": 1.2683362562076268, + "learning_rate": 7.075751277148574e-06, + "loss": 0.2658, + "step": 7657 + }, + { + "epoch": 0.606694394929689, + "grad_norm": 1.2103415811882574, + "learning_rate": 7.073297506811766e-06, + "loss": 0.2344, + "step": 7658 + }, + { + "epoch": 0.6067736185383245, + "grad_norm": 1.7299228955644137, + "learning_rate": 7.0708439291842345e-06, + "loss": 0.3189, + "step": 7659 + }, + { + "epoch": 0.6068528421469598, + "grad_norm": 1.436109037777133, + "learning_rate": 7.068390544427539e-06, + "loss": 0.3035, + "step": 7660 + }, + { + "epoch": 0.6069320657555952, + "grad_norm": 1.1486437387618742, + "learning_rate": 7.065937352703218e-06, + "loss": 0.1783, + "step": 7661 + }, + { + "epoch": 0.6070112893642305, + "grad_norm": 1.6571791017563637, + "learning_rate": 7.063484354172804e-06, + "loss": 0.2971, + "step": 7662 + }, + { + "epoch": 0.6070905129728659, + "grad_norm": 1.2819092805765941, + "learning_rate": 7.061031548997818e-06, + "loss": 0.2742, + "step": 7663 + }, + { + "epoch": 0.6071697365815013, + "grad_norm": 1.6164089896622902, + "learning_rate": 7.058578937339759e-06, + "loss": 0.2814, + "step": 7664 + }, + { + "epoch": 0.6072489601901366, + "grad_norm": 1.1338041250321742, + "learning_rate": 7.056126519360129e-06, + "loss": 0.2087, + "step": 7665 + }, + { + "epoch": 0.6073281837987721, + "grad_norm": 1.4153650065011405, + "learning_rate": 7.053674295220399e-06, + "loss": 0.2774, + "step": 7666 + }, + { + "epoch": 0.6074074074074074, + "grad_norm": 1.0904547920568635, + "learning_rate": 7.05122226508204e-06, + "loss": 0.2073, + "step": 7667 + }, + { + "epoch": 0.6074866310160428, + "grad_norm": 1.2735500978240153, + "learning_rate": 7.048770429106509e-06, + "loss": 0.2182, + "step": 7668 + }, + { + "epoch": 0.6075658546246782, + "grad_norm": 1.2202602223211134, + "learning_rate": 7.0463187874552415e-06, + "loss": 0.1987, + "step": 7669 + }, + { + "epoch": 0.6076450782333135, + "grad_norm": 1.6657327480832729, + "learning_rate": 7.043867340289672e-06, + "loss": 0.3109, + "step": 7670 + }, + { + "epoch": 0.6077243018419489, + "grad_norm": 1.546529025544999, + "learning_rate": 7.0414160877712155e-06, + "loss": 0.3119, + "step": 7671 + }, + { + "epoch": 0.6078035254505842, + "grad_norm": 1.5595210707850917, + "learning_rate": 7.038965030061273e-06, + "loss": 0.3367, + "step": 7672 + }, + { + "epoch": 0.6078827490592197, + "grad_norm": 1.4489660816512373, + "learning_rate": 7.0365141673212336e-06, + "loss": 0.3363, + "step": 7673 + }, + { + "epoch": 0.607961972667855, + "grad_norm": 1.3846116873889731, + "learning_rate": 7.034063499712479e-06, + "loss": 0.2654, + "step": 7674 + }, + { + "epoch": 0.6080411962764904, + "grad_norm": 1.4727879496742948, + "learning_rate": 7.031613027396369e-06, + "loss": 0.2991, + "step": 7675 + }, + { + "epoch": 0.6081204198851258, + "grad_norm": 1.093452263137143, + "learning_rate": 7.029162750534259e-06, + "loss": 0.1996, + "step": 7676 + }, + { + "epoch": 0.6081996434937611, + "grad_norm": 1.3466974223453194, + "learning_rate": 7.02671266928749e-06, + "loss": 0.3031, + "step": 7677 + }, + { + "epoch": 0.6082788671023965, + "grad_norm": 1.4547073963412491, + "learning_rate": 7.024262783817382e-06, + "loss": 0.2758, + "step": 7678 + }, + { + "epoch": 0.6083580907110319, + "grad_norm": 1.0736126359896117, + "learning_rate": 7.02181309428525e-06, + "loss": 0.1938, + "step": 7679 + }, + { + "epoch": 0.6084373143196673, + "grad_norm": 1.3088296906648964, + "learning_rate": 7.0193636008524e-06, + "loss": 0.1881, + "step": 7680 + }, + { + "epoch": 0.6085165379283026, + "grad_norm": 1.3886848472259685, + "learning_rate": 7.016914303680111e-06, + "loss": 0.3084, + "step": 7681 + }, + { + "epoch": 0.6085957615369381, + "grad_norm": 1.2613166203389428, + "learning_rate": 7.014465202929665e-06, + "loss": 0.2663, + "step": 7682 + }, + { + "epoch": 0.6086749851455734, + "grad_norm": 1.6515958637953383, + "learning_rate": 7.012016298762317e-06, + "loss": 0.2547, + "step": 7683 + }, + { + "epoch": 0.6087542087542087, + "grad_norm": 1.5680217077627265, + "learning_rate": 7.009567591339319e-06, + "loss": 0.267, + "step": 7684 + }, + { + "epoch": 0.6088334323628442, + "grad_norm": 1.3609889060529607, + "learning_rate": 7.007119080821908e-06, + "loss": 0.2134, + "step": 7685 + }, + { + "epoch": 0.6089126559714795, + "grad_norm": 1.2815317668378545, + "learning_rate": 7.004670767371302e-06, + "loss": 0.2573, + "step": 7686 + }, + { + "epoch": 0.6089918795801149, + "grad_norm": 1.4211930606019993, + "learning_rate": 7.002222651148714e-06, + "loss": 0.26, + "step": 7687 + }, + { + "epoch": 0.6090711031887502, + "grad_norm": 1.5759770865439906, + "learning_rate": 6.999774732315343e-06, + "loss": 0.228, + "step": 7688 + }, + { + "epoch": 0.6091503267973856, + "grad_norm": 1.2649741365817415, + "learning_rate": 6.9973270110323666e-06, + "loss": 0.3179, + "step": 7689 + }, + { + "epoch": 0.609229550406021, + "grad_norm": 1.4724172675170308, + "learning_rate": 6.994879487460961e-06, + "loss": 0.2535, + "step": 7690 + }, + { + "epoch": 0.6093087740146563, + "grad_norm": 1.3633123981322597, + "learning_rate": 6.992432161762278e-06, + "loss": 0.2103, + "step": 7691 + }, + { + "epoch": 0.6093879976232918, + "grad_norm": 1.1961069374163367, + "learning_rate": 6.989985034097466e-06, + "loss": 0.2661, + "step": 7692 + }, + { + "epoch": 0.6094672212319271, + "grad_norm": 1.100296627858807, + "learning_rate": 6.9875381046276605e-06, + "loss": 0.2293, + "step": 7693 + }, + { + "epoch": 0.6095464448405625, + "grad_norm": 1.3163332359817324, + "learning_rate": 6.985091373513972e-06, + "loss": 0.2662, + "step": 7694 + }, + { + "epoch": 0.6096256684491979, + "grad_norm": 1.5701687072096584, + "learning_rate": 6.982644840917509e-06, + "loss": 0.2711, + "step": 7695 + }, + { + "epoch": 0.6097048920578332, + "grad_norm": 1.5999833871568183, + "learning_rate": 6.980198506999368e-06, + "loss": 0.2725, + "step": 7696 + }, + { + "epoch": 0.6097841156664686, + "grad_norm": 1.3046754328671022, + "learning_rate": 6.977752371920623e-06, + "loss": 0.2719, + "step": 7697 + }, + { + "epoch": 0.6098633392751039, + "grad_norm": 1.166379536581183, + "learning_rate": 6.975306435842344e-06, + "loss": 0.239, + "step": 7698 + }, + { + "epoch": 0.6099425628837394, + "grad_norm": 1.3532412927885529, + "learning_rate": 6.97286069892558e-06, + "loss": 0.3494, + "step": 7699 + }, + { + "epoch": 0.6100217864923747, + "grad_norm": 1.8497233726219748, + "learning_rate": 6.970415161331373e-06, + "loss": 0.3164, + "step": 7700 + }, + { + "epoch": 0.6101010101010101, + "grad_norm": 1.4577701930336793, + "learning_rate": 6.967969823220752e-06, + "loss": 0.3132, + "step": 7701 + }, + { + "epoch": 0.6101802337096455, + "grad_norm": 1.2740756093705055, + "learning_rate": 6.965524684754729e-06, + "loss": 0.1977, + "step": 7702 + }, + { + "epoch": 0.6102594573182808, + "grad_norm": 1.6928182834385006, + "learning_rate": 6.963079746094302e-06, + "loss": 0.3022, + "step": 7703 + }, + { + "epoch": 0.6103386809269162, + "grad_norm": 1.1915882985252264, + "learning_rate": 6.960635007400465e-06, + "loss": 0.1831, + "step": 7704 + }, + { + "epoch": 0.6104179045355516, + "grad_norm": 0.9612231998826694, + "learning_rate": 6.9581904688341854e-06, + "loss": 0.1583, + "step": 7705 + }, + { + "epoch": 0.610497128144187, + "grad_norm": 1.2584465451269586, + "learning_rate": 6.955746130556429e-06, + "loss": 0.2703, + "step": 7706 + }, + { + "epoch": 0.6105763517528223, + "grad_norm": 1.3272191941585112, + "learning_rate": 6.95330199272814e-06, + "loss": 0.256, + "step": 7707 + }, + { + "epoch": 0.6106555753614578, + "grad_norm": 1.4735213475794482, + "learning_rate": 6.950858055510254e-06, + "loss": 0.2835, + "step": 7708 + }, + { + "epoch": 0.6107347989700931, + "grad_norm": 1.5163448598653948, + "learning_rate": 6.948414319063696e-06, + "loss": 0.2501, + "step": 7709 + }, + { + "epoch": 0.6108140225787284, + "grad_norm": 1.045999863935323, + "learning_rate": 6.945970783549372e-06, + "loss": 0.2008, + "step": 7710 + }, + { + "epoch": 0.6108932461873638, + "grad_norm": 1.36572657805259, + "learning_rate": 6.943527449128174e-06, + "loss": 0.2031, + "step": 7711 + }, + { + "epoch": 0.6109724697959992, + "grad_norm": 1.4901676569597133, + "learning_rate": 6.9410843159609905e-06, + "loss": 0.2991, + "step": 7712 + }, + { + "epoch": 0.6110516934046346, + "grad_norm": 1.646298927811571, + "learning_rate": 6.9386413842086845e-06, + "loss": 0.355, + "step": 7713 + }, + { + "epoch": 0.6111309170132699, + "grad_norm": 1.2423640550843618, + "learning_rate": 6.936198654032114e-06, + "loss": 0.2717, + "step": 7714 + }, + { + "epoch": 0.6112101406219054, + "grad_norm": 1.2973883818946998, + "learning_rate": 6.933756125592117e-06, + "loss": 0.2152, + "step": 7715 + }, + { + "epoch": 0.6112893642305407, + "grad_norm": 1.2275036350913804, + "learning_rate": 6.931313799049526e-06, + "loss": 0.2509, + "step": 7716 + }, + { + "epoch": 0.611368587839176, + "grad_norm": 1.312081765391115, + "learning_rate": 6.928871674565158e-06, + "loss": 0.232, + "step": 7717 + }, + { + "epoch": 0.6114478114478115, + "grad_norm": 1.3661461186965818, + "learning_rate": 6.926429752299812e-06, + "loss": 0.3271, + "step": 7718 + }, + { + "epoch": 0.6115270350564468, + "grad_norm": 1.882720190549972, + "learning_rate": 6.923988032414277e-06, + "loss": 0.3696, + "step": 7719 + }, + { + "epoch": 0.6116062586650822, + "grad_norm": 1.1984863845970999, + "learning_rate": 6.9215465150693305e-06, + "loss": 0.2027, + "step": 7720 + }, + { + "epoch": 0.6116854822737176, + "grad_norm": 1.429597771750269, + "learning_rate": 6.919105200425733e-06, + "loss": 0.2919, + "step": 7721 + }, + { + "epoch": 0.611764705882353, + "grad_norm": 1.5184198041344545, + "learning_rate": 6.916664088644234e-06, + "loss": 0.2871, + "step": 7722 + }, + { + "epoch": 0.6118439294909883, + "grad_norm": 1.4086076423644345, + "learning_rate": 6.914223179885567e-06, + "loss": 0.2428, + "step": 7723 + }, + { + "epoch": 0.6119231530996236, + "grad_norm": 1.2572304389546765, + "learning_rate": 6.911782474310456e-06, + "loss": 0.236, + "step": 7724 + }, + { + "epoch": 0.6120023767082591, + "grad_norm": 1.6418119743764823, + "learning_rate": 6.909341972079613e-06, + "loss": 0.3402, + "step": 7725 + }, + { + "epoch": 0.6120816003168944, + "grad_norm": 1.1670073821444726, + "learning_rate": 6.9069016733537255e-06, + "loss": 0.2307, + "step": 7726 + }, + { + "epoch": 0.6121608239255298, + "grad_norm": 1.564452810238282, + "learning_rate": 6.904461578293483e-06, + "loss": 0.2565, + "step": 7727 + }, + { + "epoch": 0.6122400475341652, + "grad_norm": 1.5527829802522377, + "learning_rate": 6.902021687059549e-06, + "loss": 0.3159, + "step": 7728 + }, + { + "epoch": 0.6123192711428006, + "grad_norm": 1.6433272836333535, + "learning_rate": 6.89958199981258e-06, + "loss": 0.3423, + "step": 7729 + }, + { + "epoch": 0.6123984947514359, + "grad_norm": 1.8738589396243654, + "learning_rate": 6.89714251671322e-06, + "loss": 0.3125, + "step": 7730 + }, + { + "epoch": 0.6124777183600713, + "grad_norm": 1.1874115241160907, + "learning_rate": 6.894703237922094e-06, + "loss": 0.2464, + "step": 7731 + }, + { + "epoch": 0.6125569419687067, + "grad_norm": 1.8512966006014047, + "learning_rate": 6.892264163599817e-06, + "loss": 0.3413, + "step": 7732 + }, + { + "epoch": 0.612636165577342, + "grad_norm": 1.2406554799048644, + "learning_rate": 6.889825293906993e-06, + "loss": 0.2473, + "step": 7733 + }, + { + "epoch": 0.6127153891859775, + "grad_norm": 1.4156787630362848, + "learning_rate": 6.887386629004207e-06, + "loss": 0.256, + "step": 7734 + }, + { + "epoch": 0.6127946127946128, + "grad_norm": 1.5132544175692206, + "learning_rate": 6.884948169052037e-06, + "loss": 0.2709, + "step": 7735 + }, + { + "epoch": 0.6128738364032482, + "grad_norm": 2.067735977357367, + "learning_rate": 6.88250991421104e-06, + "loss": 0.2776, + "step": 7736 + }, + { + "epoch": 0.6129530600118835, + "grad_norm": 1.4397452115485452, + "learning_rate": 6.880071864641762e-06, + "loss": 0.2494, + "step": 7737 + }, + { + "epoch": 0.6130322836205189, + "grad_norm": 1.2730386636922504, + "learning_rate": 6.8776340205047446e-06, + "loss": 0.2682, + "step": 7738 + }, + { + "epoch": 0.6131115072291543, + "grad_norm": 1.3112896925886945, + "learning_rate": 6.875196381960498e-06, + "loss": 0.2414, + "step": 7739 + }, + { + "epoch": 0.6131907308377896, + "grad_norm": 1.0521531302110294, + "learning_rate": 6.872758949169536e-06, + "loss": 0.2107, + "step": 7740 + }, + { + "epoch": 0.6132699544464251, + "grad_norm": 1.1492510187805138, + "learning_rate": 6.8703217222923525e-06, + "loss": 0.2349, + "step": 7741 + }, + { + "epoch": 0.6133491780550604, + "grad_norm": 1.022767235650454, + "learning_rate": 6.867884701489421e-06, + "loss": 0.1664, + "step": 7742 + }, + { + "epoch": 0.6134284016636958, + "grad_norm": 1.246149490561892, + "learning_rate": 6.865447886921215e-06, + "loss": 0.1421, + "step": 7743 + }, + { + "epoch": 0.6135076252723312, + "grad_norm": 1.6341138540698272, + "learning_rate": 6.86301127874818e-06, + "loss": 0.3103, + "step": 7744 + }, + { + "epoch": 0.6135868488809665, + "grad_norm": 1.4571288520051886, + "learning_rate": 6.860574877130757e-06, + "loss": 0.282, + "step": 7745 + }, + { + "epoch": 0.6136660724896019, + "grad_norm": 1.1698689438632988, + "learning_rate": 6.8581386822293765e-06, + "loss": 0.2366, + "step": 7746 + }, + { + "epoch": 0.6137452960982372, + "grad_norm": 1.6744235509914767, + "learning_rate": 6.8557026942044425e-06, + "loss": 0.3847, + "step": 7747 + }, + { + "epoch": 0.6138245197068727, + "grad_norm": 1.0594846265669022, + "learning_rate": 6.853266913216357e-06, + "loss": 0.1388, + "step": 7748 + }, + { + "epoch": 0.613903743315508, + "grad_norm": 1.3872101678269653, + "learning_rate": 6.850831339425508e-06, + "loss": 0.2847, + "step": 7749 + }, + { + "epoch": 0.6139829669241434, + "grad_norm": 1.4442779616701826, + "learning_rate": 6.848395972992261e-06, + "loss": 0.2721, + "step": 7750 + }, + { + "epoch": 0.6140621905327788, + "grad_norm": 1.5233426138249149, + "learning_rate": 6.845960814076973e-06, + "loss": 0.3488, + "step": 7751 + }, + { + "epoch": 0.6141414141414141, + "grad_norm": 1.382163340681494, + "learning_rate": 6.8435258628399905e-06, + "loss": 0.2964, + "step": 7752 + }, + { + "epoch": 0.6142206377500495, + "grad_norm": 1.321844255477333, + "learning_rate": 6.841091119441639e-06, + "loss": 0.2742, + "step": 7753 + }, + { + "epoch": 0.6142998613586849, + "grad_norm": 1.407107473806448, + "learning_rate": 6.8386565840422385e-06, + "loss": 0.2881, + "step": 7754 + }, + { + "epoch": 0.6143790849673203, + "grad_norm": 1.2948420184462874, + "learning_rate": 6.836222256802093e-06, + "loss": 0.219, + "step": 7755 + }, + { + "epoch": 0.6144583085759556, + "grad_norm": 1.3238417128741429, + "learning_rate": 6.833788137881486e-06, + "loss": 0.3172, + "step": 7756 + }, + { + "epoch": 0.6145375321845911, + "grad_norm": 1.2385372749502246, + "learning_rate": 6.8313542274406964e-06, + "loss": 0.2717, + "step": 7757 + }, + { + "epoch": 0.6146167557932264, + "grad_norm": 1.007569470426892, + "learning_rate": 6.828920525639985e-06, + "loss": 0.1992, + "step": 7758 + }, + { + "epoch": 0.6146959794018617, + "grad_norm": 1.1787572754542879, + "learning_rate": 6.826487032639597e-06, + "loss": 0.2152, + "step": 7759 + }, + { + "epoch": 0.6147752030104972, + "grad_norm": 1.3642322573829562, + "learning_rate": 6.8240537485997704e-06, + "loss": 0.2747, + "step": 7760 + }, + { + "epoch": 0.6148544266191325, + "grad_norm": 1.5193715878449312, + "learning_rate": 6.821620673680721e-06, + "loss": 0.297, + "step": 7761 + }, + { + "epoch": 0.6149336502277679, + "grad_norm": 1.39310703365241, + "learning_rate": 6.819187808042656e-06, + "loss": 0.2786, + "step": 7762 + }, + { + "epoch": 0.6150128738364032, + "grad_norm": 1.6407176420538385, + "learning_rate": 6.816755151845771e-06, + "loss": 0.3333, + "step": 7763 + }, + { + "epoch": 0.6150920974450387, + "grad_norm": 1.4982201325932958, + "learning_rate": 6.814322705250241e-06, + "loss": 0.3212, + "step": 7764 + }, + { + "epoch": 0.615171321053674, + "grad_norm": 1.6815880092237951, + "learning_rate": 6.8118904684162325e-06, + "loss": 0.377, + "step": 7765 + }, + { + "epoch": 0.6152505446623093, + "grad_norm": 1.5737255499392573, + "learning_rate": 6.8094584415038975e-06, + "loss": 0.3937, + "step": 7766 + }, + { + "epoch": 0.6153297682709448, + "grad_norm": 1.278821402629601, + "learning_rate": 6.807026624673372e-06, + "loss": 0.2233, + "step": 7767 + }, + { + "epoch": 0.6154089918795801, + "grad_norm": 1.1800119192224856, + "learning_rate": 6.80459501808478e-06, + "loss": 0.2195, + "step": 7768 + }, + { + "epoch": 0.6154882154882155, + "grad_norm": 1.1942410774626977, + "learning_rate": 6.8021636218982275e-06, + "loss": 0.1952, + "step": 7769 + }, + { + "epoch": 0.6155674390968509, + "grad_norm": 1.2058949477151222, + "learning_rate": 6.799732436273816e-06, + "loss": 0.2194, + "step": 7770 + }, + { + "epoch": 0.6156466627054862, + "grad_norm": 1.1194869878543667, + "learning_rate": 6.797301461371626e-06, + "loss": 0.2013, + "step": 7771 + }, + { + "epoch": 0.6157258863141216, + "grad_norm": 1.429570216790275, + "learning_rate": 6.7948706973517235e-06, + "loss": 0.2476, + "step": 7772 + }, + { + "epoch": 0.6158051099227569, + "grad_norm": 1.3034841215222428, + "learning_rate": 6.792440144374162e-06, + "loss": 0.275, + "step": 7773 + }, + { + "epoch": 0.6158843335313924, + "grad_norm": 1.152664346651917, + "learning_rate": 6.790009802598984e-06, + "loss": 0.21, + "step": 7774 + }, + { + "epoch": 0.6159635571400277, + "grad_norm": 1.4544537788642788, + "learning_rate": 6.787579672186215e-06, + "loss": 0.3069, + "step": 7775 + }, + { + "epoch": 0.6160427807486631, + "grad_norm": 0.9895825427148084, + "learning_rate": 6.78514975329587e-06, + "loss": 0.1693, + "step": 7776 + }, + { + "epoch": 0.6161220043572985, + "grad_norm": 1.2583732512982744, + "learning_rate": 6.78272004608794e-06, + "loss": 0.2255, + "step": 7777 + }, + { + "epoch": 0.6162012279659338, + "grad_norm": 1.5195780759360493, + "learning_rate": 6.780290550722417e-06, + "loss": 0.4331, + "step": 7778 + }, + { + "epoch": 0.6162804515745692, + "grad_norm": 1.1795453733648757, + "learning_rate": 6.777861267359272e-06, + "loss": 0.2066, + "step": 7779 + }, + { + "epoch": 0.6163596751832046, + "grad_norm": 1.5333298623501757, + "learning_rate": 6.7754321961584535e-06, + "loss": 0.2336, + "step": 7780 + }, + { + "epoch": 0.61643889879184, + "grad_norm": 1.1174812412087562, + "learning_rate": 6.773003337279911e-06, + "loss": 0.1891, + "step": 7781 + }, + { + "epoch": 0.6165181224004753, + "grad_norm": 1.223592985208414, + "learning_rate": 6.7705746908835734e-06, + "loss": 0.2797, + "step": 7782 + }, + { + "epoch": 0.6165973460091108, + "grad_norm": 1.0826977734484358, + "learning_rate": 6.768146257129351e-06, + "loss": 0.2157, + "step": 7783 + }, + { + "epoch": 0.6166765696177461, + "grad_norm": 1.5628539638921424, + "learning_rate": 6.765718036177148e-06, + "loss": 0.369, + "step": 7784 + }, + { + "epoch": 0.6167557932263814, + "grad_norm": 1.3651095380859373, + "learning_rate": 6.763290028186849e-06, + "loss": 0.2958, + "step": 7785 + }, + { + "epoch": 0.6168350168350168, + "grad_norm": 1.4408720965076283, + "learning_rate": 6.760862233318327e-06, + "loss": 0.2942, + "step": 7786 + }, + { + "epoch": 0.6169142404436522, + "grad_norm": 1.7356013934880312, + "learning_rate": 6.758434651731445e-06, + "loss": 0.3133, + "step": 7787 + }, + { + "epoch": 0.6169934640522876, + "grad_norm": 1.2284488376324045, + "learning_rate": 6.756007283586039e-06, + "loss": 0.2357, + "step": 7788 + }, + { + "epoch": 0.6170726876609229, + "grad_norm": 1.1350879978097135, + "learning_rate": 6.753580129041945e-06, + "loss": 0.2467, + "step": 7789 + }, + { + "epoch": 0.6171519112695584, + "grad_norm": 1.2389020391180534, + "learning_rate": 6.751153188258983e-06, + "loss": 0.2583, + "step": 7790 + }, + { + "epoch": 0.6172311348781937, + "grad_norm": 1.449969054861684, + "learning_rate": 6.748726461396946e-06, + "loss": 0.209, + "step": 7791 + }, + { + "epoch": 0.617310358486829, + "grad_norm": 1.0899637416630972, + "learning_rate": 6.7462999486156315e-06, + "loss": 0.2141, + "step": 7792 + }, + { + "epoch": 0.6173895820954645, + "grad_norm": 1.068208711496623, + "learning_rate": 6.743873650074807e-06, + "loss": 0.2102, + "step": 7793 + }, + { + "epoch": 0.6174688057040998, + "grad_norm": 1.2790955957950543, + "learning_rate": 6.741447565934236e-06, + "loss": 0.2604, + "step": 7794 + }, + { + "epoch": 0.6175480293127352, + "grad_norm": 1.113377807828955, + "learning_rate": 6.739021696353665e-06, + "loss": 0.1867, + "step": 7795 + }, + { + "epoch": 0.6176272529213706, + "grad_norm": 1.6958804608095717, + "learning_rate": 6.736596041492821e-06, + "loss": 0.3465, + "step": 7796 + }, + { + "epoch": 0.617706476530006, + "grad_norm": 1.532662396731456, + "learning_rate": 6.734170601511427e-06, + "loss": 0.3379, + "step": 7797 + }, + { + "epoch": 0.6177857001386413, + "grad_norm": 1.4325661972155246, + "learning_rate": 6.7317453765691855e-06, + "loss": 0.2166, + "step": 7798 + }, + { + "epoch": 0.6178649237472766, + "grad_norm": 1.0810134276551888, + "learning_rate": 6.729320366825785e-06, + "loss": 0.1891, + "step": 7799 + }, + { + "epoch": 0.6179441473559121, + "grad_norm": 1.4236772235474453, + "learning_rate": 6.726895572440901e-06, + "loss": 0.2813, + "step": 7800 + }, + { + "epoch": 0.6180233709645474, + "grad_norm": 1.5092256028731807, + "learning_rate": 6.7244709935741925e-06, + "loss": 0.3087, + "step": 7801 + }, + { + "epoch": 0.6181025945731828, + "grad_norm": 1.5979792887277424, + "learning_rate": 6.722046630385309e-06, + "loss": 0.3196, + "step": 7802 + }, + { + "epoch": 0.6181818181818182, + "grad_norm": 1.5055075254314938, + "learning_rate": 6.719622483033883e-06, + "loss": 0.305, + "step": 7803 + }, + { + "epoch": 0.6182610417904536, + "grad_norm": 1.1393192551815847, + "learning_rate": 6.7171985516795315e-06, + "loss": 0.2066, + "step": 7804 + }, + { + "epoch": 0.6183402653990889, + "grad_norm": 1.268081009863507, + "learning_rate": 6.714774836481862e-06, + "loss": 0.2761, + "step": 7805 + }, + { + "epoch": 0.6184194890077243, + "grad_norm": 1.3522460011113968, + "learning_rate": 6.71235133760046e-06, + "loss": 0.2176, + "step": 7806 + }, + { + "epoch": 0.6184987126163597, + "grad_norm": 1.2098124392195162, + "learning_rate": 6.709928055194902e-06, + "loss": 0.2217, + "step": 7807 + }, + { + "epoch": 0.618577936224995, + "grad_norm": 1.2087322220014571, + "learning_rate": 6.707504989424753e-06, + "loss": 0.2854, + "step": 7808 + }, + { + "epoch": 0.6186571598336305, + "grad_norm": 1.1241695770250928, + "learning_rate": 6.705082140449557e-06, + "loss": 0.1981, + "step": 7809 + }, + { + "epoch": 0.6187363834422658, + "grad_norm": 1.5880513437450594, + "learning_rate": 6.702659508428847e-06, + "loss": 0.2637, + "step": 7810 + }, + { + "epoch": 0.6188156070509012, + "grad_norm": 1.3693406675072954, + "learning_rate": 6.7002370935221454e-06, + "loss": 0.2713, + "step": 7811 + }, + { + "epoch": 0.6188948306595365, + "grad_norm": 1.170654097612544, + "learning_rate": 6.697814895888951e-06, + "loss": 0.2192, + "step": 7812 + }, + { + "epoch": 0.6189740542681719, + "grad_norm": 1.3087175467841219, + "learning_rate": 6.695392915688759e-06, + "loss": 0.2481, + "step": 7813 + }, + { + "epoch": 0.6190532778768073, + "grad_norm": 1.4849130638357622, + "learning_rate": 6.692971153081041e-06, + "loss": 0.3214, + "step": 7814 + }, + { + "epoch": 0.6191325014854426, + "grad_norm": 1.3953250991898063, + "learning_rate": 6.690549608225258e-06, + "loss": 0.2264, + "step": 7815 + }, + { + "epoch": 0.6192117250940781, + "grad_norm": 1.370734142632818, + "learning_rate": 6.688128281280863e-06, + "loss": 0.2355, + "step": 7816 + }, + { + "epoch": 0.6192909487027134, + "grad_norm": 1.2819694950918628, + "learning_rate": 6.685707172407284e-06, + "loss": 0.1974, + "step": 7817 + }, + { + "epoch": 0.6193701723113488, + "grad_norm": 1.6678432907521146, + "learning_rate": 6.683286281763939e-06, + "loss": 0.2446, + "step": 7818 + }, + { + "epoch": 0.6194493959199842, + "grad_norm": 1.1343649598551384, + "learning_rate": 6.6808656095102365e-06, + "loss": 0.191, + "step": 7819 + }, + { + "epoch": 0.6195286195286195, + "grad_norm": 1.0976398957495892, + "learning_rate": 6.6784451558055596e-06, + "loss": 0.2147, + "step": 7820 + }, + { + "epoch": 0.6196078431372549, + "grad_norm": 1.414591058325258, + "learning_rate": 6.67602492080929e-06, + "loss": 0.2328, + "step": 7821 + }, + { + "epoch": 0.6196870667458902, + "grad_norm": 1.5082495366457516, + "learning_rate": 6.6736049046807815e-06, + "loss": 0.2742, + "step": 7822 + }, + { + "epoch": 0.6197662903545257, + "grad_norm": 1.2146270981585556, + "learning_rate": 6.671185107579387e-06, + "loss": 0.1722, + "step": 7823 + }, + { + "epoch": 0.619845513963161, + "grad_norm": 1.1089253094008462, + "learning_rate": 6.668765529664436e-06, + "loss": 0.152, + "step": 7824 + }, + { + "epoch": 0.6199247375717964, + "grad_norm": 1.2432384518883137, + "learning_rate": 6.6663461710952445e-06, + "loss": 0.2288, + "step": 7825 + }, + { + "epoch": 0.6200039611804318, + "grad_norm": 1.4470943878159777, + "learning_rate": 6.663927032031118e-06, + "loss": 0.2928, + "step": 7826 + }, + { + "epoch": 0.6200831847890671, + "grad_norm": 1.5076932585509855, + "learning_rate": 6.661508112631347e-06, + "loss": 0.2546, + "step": 7827 + }, + { + "epoch": 0.6201624083977025, + "grad_norm": 1.571916336192168, + "learning_rate": 6.659089413055202e-06, + "loss": 0.3052, + "step": 7828 + }, + { + "epoch": 0.6202416320063379, + "grad_norm": 1.3894933548873982, + "learning_rate": 6.656670933461942e-06, + "loss": 0.2695, + "step": 7829 + }, + { + "epoch": 0.6203208556149733, + "grad_norm": 1.0872391840674955, + "learning_rate": 6.654252674010815e-06, + "loss": 0.1721, + "step": 7830 + }, + { + "epoch": 0.6204000792236086, + "grad_norm": 1.064168591015102, + "learning_rate": 6.6518346348610484e-06, + "loss": 0.1547, + "step": 7831 + }, + { + "epoch": 0.6204793028322441, + "grad_norm": 1.2802428101478698, + "learning_rate": 6.649416816171861e-06, + "loss": 0.2642, + "step": 7832 + }, + { + "epoch": 0.6205585264408794, + "grad_norm": 1.4539983747523386, + "learning_rate": 6.646999218102457e-06, + "loss": 0.2649, + "step": 7833 + }, + { + "epoch": 0.6206377500495147, + "grad_norm": 1.5812233432610567, + "learning_rate": 6.644581840812019e-06, + "loss": 0.2845, + "step": 7834 + }, + { + "epoch": 0.6207169736581502, + "grad_norm": 1.5996091098100835, + "learning_rate": 6.64216468445972e-06, + "loss": 0.3499, + "step": 7835 + }, + { + "epoch": 0.6207961972667855, + "grad_norm": 1.3978420024613916, + "learning_rate": 6.639747749204723e-06, + "loss": 0.2936, + "step": 7836 + }, + { + "epoch": 0.6208754208754209, + "grad_norm": 1.2812456399799128, + "learning_rate": 6.637331035206166e-06, + "loss": 0.192, + "step": 7837 + }, + { + "epoch": 0.6209546444840562, + "grad_norm": 1.6261109627284294, + "learning_rate": 6.634914542623182e-06, + "loss": 0.3179, + "step": 7838 + }, + { + "epoch": 0.6210338680926917, + "grad_norm": 1.333396456551822, + "learning_rate": 6.632498271614882e-06, + "loss": 0.31, + "step": 7839 + }, + { + "epoch": 0.621113091701327, + "grad_norm": 1.4797066334661229, + "learning_rate": 6.630082222340366e-06, + "loss": 0.2998, + "step": 7840 + }, + { + "epoch": 0.6211923153099623, + "grad_norm": 1.45164227806152, + "learning_rate": 6.627666394958725e-06, + "loss": 0.3126, + "step": 7841 + }, + { + "epoch": 0.6212715389185978, + "grad_norm": 1.737152882985017, + "learning_rate": 6.625250789629021e-06, + "loss": 0.2985, + "step": 7842 + }, + { + "epoch": 0.6213507625272331, + "grad_norm": 1.1370470924073617, + "learning_rate": 6.622835406510315e-06, + "loss": 0.2198, + "step": 7843 + }, + { + "epoch": 0.6214299861358685, + "grad_norm": 1.4729672412875099, + "learning_rate": 6.620420245761651e-06, + "loss": 0.253, + "step": 7844 + }, + { + "epoch": 0.6215092097445039, + "grad_norm": 1.300988908160649, + "learning_rate": 6.6180053075420484e-06, + "loss": 0.2385, + "step": 7845 + }, + { + "epoch": 0.6215884333531392, + "grad_norm": 1.5699782638837072, + "learning_rate": 6.615590592010526e-06, + "loss": 0.3094, + "step": 7846 + }, + { + "epoch": 0.6216676569617746, + "grad_norm": 1.473281039707279, + "learning_rate": 6.613176099326077e-06, + "loss": 0.241, + "step": 7847 + }, + { + "epoch": 0.6217468805704099, + "grad_norm": 1.113483652990401, + "learning_rate": 6.610761829647685e-06, + "loss": 0.1978, + "step": 7848 + }, + { + "epoch": 0.6218261041790454, + "grad_norm": 1.360539071493947, + "learning_rate": 6.608347783134319e-06, + "loss": 0.2553, + "step": 7849 + }, + { + "epoch": 0.6219053277876807, + "grad_norm": 1.2327850961705231, + "learning_rate": 6.605933959944933e-06, + "loss": 0.2659, + "step": 7850 + }, + { + "epoch": 0.6219845513963161, + "grad_norm": 1.1907455991686198, + "learning_rate": 6.603520360238462e-06, + "loss": 0.2421, + "step": 7851 + }, + { + "epoch": 0.6220637750049515, + "grad_norm": 1.4671631812125632, + "learning_rate": 6.601106984173835e-06, + "loss": 0.2675, + "step": 7852 + }, + { + "epoch": 0.6221429986135868, + "grad_norm": 1.2849240582613972, + "learning_rate": 6.598693831909957e-06, + "loss": 0.2875, + "step": 7853 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 1.4438224010003577, + "learning_rate": 6.596280903605725e-06, + "loss": 0.3196, + "step": 7854 + }, + { + "epoch": 0.6223014458308576, + "grad_norm": 1.3191640947961132, + "learning_rate": 6.593868199420017e-06, + "loss": 0.2612, + "step": 7855 + }, + { + "epoch": 0.622380669439493, + "grad_norm": 1.4503705790415486, + "learning_rate": 6.591455719511699e-06, + "loss": 0.2706, + "step": 7856 + }, + { + "epoch": 0.6224598930481283, + "grad_norm": 1.5041727883374938, + "learning_rate": 6.589043464039624e-06, + "loss": 0.3972, + "step": 7857 + }, + { + "epoch": 0.6225391166567638, + "grad_norm": 1.5429877257400408, + "learning_rate": 6.58663143316262e-06, + "loss": 0.3489, + "step": 7858 + }, + { + "epoch": 0.6226183402653991, + "grad_norm": 1.4556036248722355, + "learning_rate": 6.584219627039513e-06, + "loss": 0.3174, + "step": 7859 + }, + { + "epoch": 0.6226975638740344, + "grad_norm": 1.3495884674443839, + "learning_rate": 6.58180804582911e-06, + "loss": 0.2731, + "step": 7860 + }, + { + "epoch": 0.6227767874826698, + "grad_norm": 1.229775798580066, + "learning_rate": 6.579396689690198e-06, + "loss": 0.2403, + "step": 7861 + }, + { + "epoch": 0.6228560110913052, + "grad_norm": 1.3210697496673112, + "learning_rate": 6.576985558781557e-06, + "loss": 0.2539, + "step": 7862 + }, + { + "epoch": 0.6229352346999406, + "grad_norm": 1.4186420081770639, + "learning_rate": 6.574574653261945e-06, + "loss": 0.3054, + "step": 7863 + }, + { + "epoch": 0.6230144583085759, + "grad_norm": 1.2521601700348162, + "learning_rate": 6.572163973290109e-06, + "loss": 0.2679, + "step": 7864 + }, + { + "epoch": 0.6230936819172114, + "grad_norm": 1.6865582370350147, + "learning_rate": 6.569753519024784e-06, + "loss": 0.3346, + "step": 7865 + }, + { + "epoch": 0.6231729055258467, + "grad_norm": 1.6728242113314875, + "learning_rate": 6.567343290624683e-06, + "loss": 0.2515, + "step": 7866 + }, + { + "epoch": 0.623252129134482, + "grad_norm": 1.0310324890464801, + "learning_rate": 6.564933288248509e-06, + "loss": 0.1736, + "step": 7867 + }, + { + "epoch": 0.6233313527431175, + "grad_norm": 1.2877800114096616, + "learning_rate": 6.562523512054951e-06, + "loss": 0.2237, + "step": 7868 + }, + { + "epoch": 0.6234105763517528, + "grad_norm": 1.3010640430093539, + "learning_rate": 6.560113962202679e-06, + "loss": 0.1949, + "step": 7869 + }, + { + "epoch": 0.6234897999603882, + "grad_norm": 1.360701075201637, + "learning_rate": 6.557704638850352e-06, + "loss": 0.2671, + "step": 7870 + }, + { + "epoch": 0.6235690235690236, + "grad_norm": 1.5045777587901634, + "learning_rate": 6.555295542156609e-06, + "loss": 0.2996, + "step": 7871 + }, + { + "epoch": 0.623648247177659, + "grad_norm": 1.3987349807546454, + "learning_rate": 6.55288667228008e-06, + "loss": 0.2374, + "step": 7872 + }, + { + "epoch": 0.6237274707862943, + "grad_norm": 1.3726309683596492, + "learning_rate": 6.550478029379379e-06, + "loss": 0.3302, + "step": 7873 + }, + { + "epoch": 0.6238066943949296, + "grad_norm": 1.3422415988420477, + "learning_rate": 6.548069613613099e-06, + "loss": 0.2699, + "step": 7874 + }, + { + "epoch": 0.6238859180035651, + "grad_norm": 1.4771107362273952, + "learning_rate": 6.545661425139827e-06, + "loss": 0.3082, + "step": 7875 + }, + { + "epoch": 0.6239651416122004, + "grad_norm": 1.1439122437650966, + "learning_rate": 6.543253464118131e-06, + "loss": 0.1911, + "step": 7876 + }, + { + "epoch": 0.6240443652208358, + "grad_norm": 1.3750241773055052, + "learning_rate": 6.540845730706557e-06, + "loss": 0.2409, + "step": 7877 + }, + { + "epoch": 0.6241235888294712, + "grad_norm": 1.1334701504850064, + "learning_rate": 6.538438225063653e-06, + "loss": 0.2246, + "step": 7878 + }, + { + "epoch": 0.6242028124381066, + "grad_norm": 1.2744100356036898, + "learning_rate": 6.536030947347931e-06, + "loss": 0.1619, + "step": 7879 + }, + { + "epoch": 0.6242820360467419, + "grad_norm": 1.356265927746559, + "learning_rate": 6.533623897717905e-06, + "loss": 0.2923, + "step": 7880 + }, + { + "epoch": 0.6243612596553773, + "grad_norm": 1.7969961616461871, + "learning_rate": 6.531217076332068e-06, + "loss": 0.3339, + "step": 7881 + }, + { + "epoch": 0.6244404832640127, + "grad_norm": 1.028256064515624, + "learning_rate": 6.528810483348893e-06, + "loss": 0.2179, + "step": 7882 + }, + { + "epoch": 0.624519706872648, + "grad_norm": 1.2845900586565773, + "learning_rate": 6.526404118926848e-06, + "loss": 0.2258, + "step": 7883 + }, + { + "epoch": 0.6245989304812835, + "grad_norm": 1.3564910475185137, + "learning_rate": 6.523997983224375e-06, + "loss": 0.2134, + "step": 7884 + }, + { + "epoch": 0.6246781540899188, + "grad_norm": 1.4488507660186598, + "learning_rate": 6.52159207639991e-06, + "loss": 0.3189, + "step": 7885 + }, + { + "epoch": 0.6247573776985542, + "grad_norm": 1.2704437460350062, + "learning_rate": 6.519186398611872e-06, + "loss": 0.2244, + "step": 7886 + }, + { + "epoch": 0.6248366013071895, + "grad_norm": 1.4763807725933376, + "learning_rate": 6.51678095001866e-06, + "loss": 0.1966, + "step": 7887 + }, + { + "epoch": 0.6249158249158249, + "grad_norm": 1.222871590649851, + "learning_rate": 6.51437573077866e-06, + "loss": 0.2023, + "step": 7888 + }, + { + "epoch": 0.6249950485244603, + "grad_norm": 1.157591028130183, + "learning_rate": 6.5119707410502495e-06, + "loss": 0.171, + "step": 7889 + }, + { + "epoch": 0.6250742721330956, + "grad_norm": 1.2708367551985662, + "learning_rate": 6.509565980991781e-06, + "loss": 0.2439, + "step": 7890 + }, + { + "epoch": 0.6251534957417311, + "grad_norm": 1.1816391458178548, + "learning_rate": 6.5071614507615985e-06, + "loss": 0.2012, + "step": 7891 + }, + { + "epoch": 0.6252327193503664, + "grad_norm": 1.0269538697081408, + "learning_rate": 6.5047571505180265e-06, + "loss": 0.216, + "step": 7892 + }, + { + "epoch": 0.6253119429590018, + "grad_norm": 1.9270199626841715, + "learning_rate": 6.502353080419379e-06, + "loss": 0.4559, + "step": 7893 + }, + { + "epoch": 0.6253911665676372, + "grad_norm": 1.5655967262342554, + "learning_rate": 6.4999492406239525e-06, + "loss": 0.2552, + "step": 7894 + }, + { + "epoch": 0.6254703901762725, + "grad_norm": 1.2875754867098208, + "learning_rate": 6.497545631290025e-06, + "loss": 0.2306, + "step": 7895 + }, + { + "epoch": 0.6255496137849079, + "grad_norm": 1.2069262045484102, + "learning_rate": 6.495142252575866e-06, + "loss": 0.2279, + "step": 7896 + }, + { + "epoch": 0.6256288373935432, + "grad_norm": 1.2335206564036638, + "learning_rate": 6.492739104639727e-06, + "loss": 0.2351, + "step": 7897 + }, + { + "epoch": 0.6257080610021787, + "grad_norm": 1.213606316899808, + "learning_rate": 6.490336187639841e-06, + "loss": 0.178, + "step": 7898 + }, + { + "epoch": 0.625787284610814, + "grad_norm": 1.3864068886610745, + "learning_rate": 6.487933501734429e-06, + "loss": 0.2918, + "step": 7899 + }, + { + "epoch": 0.6258665082194494, + "grad_norm": 1.2782345427445867, + "learning_rate": 6.485531047081697e-06, + "loss": 0.2264, + "step": 7900 + }, + { + "epoch": 0.6259457318280848, + "grad_norm": 1.7215899775331072, + "learning_rate": 6.483128823839835e-06, + "loss": 0.3407, + "step": 7901 + }, + { + "epoch": 0.6260249554367201, + "grad_norm": 1.3305414033458183, + "learning_rate": 6.480726832167019e-06, + "loss": 0.2411, + "step": 7902 + }, + { + "epoch": 0.6261041790453555, + "grad_norm": 1.2733483334165683, + "learning_rate": 6.4783250722214066e-06, + "loss": 0.2167, + "step": 7903 + }, + { + "epoch": 0.6261834026539909, + "grad_norm": 1.1135076361338612, + "learning_rate": 6.475923544161142e-06, + "loss": 0.2394, + "step": 7904 + }, + { + "epoch": 0.6262626262626263, + "grad_norm": 1.3461582826813059, + "learning_rate": 6.473522248144359e-06, + "loss": 0.2058, + "step": 7905 + }, + { + "epoch": 0.6263418498712616, + "grad_norm": 1.064316761926121, + "learning_rate": 6.471121184329167e-06, + "loss": 0.1524, + "step": 7906 + }, + { + "epoch": 0.6264210734798971, + "grad_norm": 1.4828105648414607, + "learning_rate": 6.468720352873662e-06, + "loss": 0.304, + "step": 7907 + }, + { + "epoch": 0.6265002970885324, + "grad_norm": 1.3572785813027461, + "learning_rate": 6.466319753935933e-06, + "loss": 0.2474, + "step": 7908 + }, + { + "epoch": 0.6265795206971677, + "grad_norm": 1.066587242392369, + "learning_rate": 6.463919387674043e-06, + "loss": 0.2126, + "step": 7909 + }, + { + "epoch": 0.6266587443058032, + "grad_norm": 1.7415276609291013, + "learning_rate": 6.461519254246046e-06, + "loss": 0.4044, + "step": 7910 + }, + { + "epoch": 0.6267379679144385, + "grad_norm": 1.3298089502106016, + "learning_rate": 6.459119353809982e-06, + "loss": 0.2452, + "step": 7911 + }, + { + "epoch": 0.6268171915230739, + "grad_norm": 1.661440900791588, + "learning_rate": 6.45671968652387e-06, + "loss": 0.3367, + "step": 7912 + }, + { + "epoch": 0.6268964151317092, + "grad_norm": 1.3651603244542607, + "learning_rate": 6.4543202525457175e-06, + "loss": 0.3134, + "step": 7913 + }, + { + "epoch": 0.6269756387403447, + "grad_norm": 1.4048164691702776, + "learning_rate": 6.451921052033516e-06, + "loss": 0.2961, + "step": 7914 + }, + { + "epoch": 0.62705486234898, + "grad_norm": 1.3361953354383622, + "learning_rate": 6.449522085145241e-06, + "loss": 0.2737, + "step": 7915 + }, + { + "epoch": 0.6271340859576153, + "grad_norm": 1.149003513924878, + "learning_rate": 6.447123352038853e-06, + "loss": 0.1764, + "step": 7916 + }, + { + "epoch": 0.6272133095662508, + "grad_norm": 1.3471055065035118, + "learning_rate": 6.444724852872297e-06, + "loss": 0.2685, + "step": 7917 + }, + { + "epoch": 0.6272925331748861, + "grad_norm": 1.2276114647335168, + "learning_rate": 6.4423265878035015e-06, + "loss": 0.2226, + "step": 7918 + }, + { + "epoch": 0.6273717567835215, + "grad_norm": 1.188488469219547, + "learning_rate": 6.439928556990382e-06, + "loss": 0.232, + "step": 7919 + }, + { + "epoch": 0.6274509803921569, + "grad_norm": 1.402828012944689, + "learning_rate": 6.437530760590838e-06, + "loss": 0.2987, + "step": 7920 + }, + { + "epoch": 0.6275302040007923, + "grad_norm": 1.7895268636931385, + "learning_rate": 6.435133198762751e-06, + "loss": 0.2609, + "step": 7921 + }, + { + "epoch": 0.6276094276094276, + "grad_norm": 1.102775135462172, + "learning_rate": 6.432735871663991e-06, + "loss": 0.1933, + "step": 7922 + }, + { + "epoch": 0.6276886512180629, + "grad_norm": 1.1671431399334748, + "learning_rate": 6.430338779452407e-06, + "loss": 0.2014, + "step": 7923 + }, + { + "epoch": 0.6277678748266984, + "grad_norm": 1.460059218832911, + "learning_rate": 6.4279419222858416e-06, + "loss": 0.2962, + "step": 7924 + }, + { + "epoch": 0.6278470984353337, + "grad_norm": 1.3251451811598105, + "learning_rate": 6.4255453003221115e-06, + "loss": 0.2646, + "step": 7925 + }, + { + "epoch": 0.6279263220439691, + "grad_norm": 1.071990626031152, + "learning_rate": 6.423148913719022e-06, + "loss": 0.2182, + "step": 7926 + }, + { + "epoch": 0.6280055456526045, + "grad_norm": 1.463140844815423, + "learning_rate": 6.420752762634369e-06, + "loss": 0.3495, + "step": 7927 + }, + { + "epoch": 0.6280847692612398, + "grad_norm": 1.7618109014012373, + "learning_rate": 6.4183568472259216e-06, + "loss": 0.3039, + "step": 7928 + }, + { + "epoch": 0.6281639928698752, + "grad_norm": 1.2088212086692502, + "learning_rate": 6.415961167651443e-06, + "loss": 0.2194, + "step": 7929 + }, + { + "epoch": 0.6282432164785106, + "grad_norm": 1.3023905476271826, + "learning_rate": 6.413565724068678e-06, + "loss": 0.25, + "step": 7930 + }, + { + "epoch": 0.628322440087146, + "grad_norm": 1.5106230695221274, + "learning_rate": 6.4111705166353525e-06, + "loss": 0.3293, + "step": 7931 + }, + { + "epoch": 0.6284016636957813, + "grad_norm": 1.3437321579925263, + "learning_rate": 6.40877554550918e-06, + "loss": 0.2893, + "step": 7932 + }, + { + "epoch": 0.6284808873044168, + "grad_norm": 1.3422776168889232, + "learning_rate": 6.406380810847856e-06, + "loss": 0.2189, + "step": 7933 + }, + { + "epoch": 0.6285601109130521, + "grad_norm": 1.6842183198735798, + "learning_rate": 6.403986312809065e-06, + "loss": 0.3122, + "step": 7934 + }, + { + "epoch": 0.6286393345216874, + "grad_norm": 1.1621268293310467, + "learning_rate": 6.401592051550475e-06, + "loss": 0.2065, + "step": 7935 + }, + { + "epoch": 0.6287185581303228, + "grad_norm": 1.5883806928644246, + "learning_rate": 6.399198027229732e-06, + "loss": 0.2708, + "step": 7936 + }, + { + "epoch": 0.6287977817389582, + "grad_norm": 1.3300264994041024, + "learning_rate": 6.39680424000447e-06, + "loss": 0.2705, + "step": 7937 + }, + { + "epoch": 0.6288770053475936, + "grad_norm": 1.181351397387026, + "learning_rate": 6.3944106900323174e-06, + "loss": 0.2567, + "step": 7938 + }, + { + "epoch": 0.6289562289562289, + "grad_norm": 1.4421441585754056, + "learning_rate": 6.392017377470867e-06, + "loss": 0.3056, + "step": 7939 + }, + { + "epoch": 0.6290354525648644, + "grad_norm": 1.4291983578427043, + "learning_rate": 6.389624302477715e-06, + "loss": 0.2634, + "step": 7940 + }, + { + "epoch": 0.6291146761734997, + "grad_norm": 1.421398808971498, + "learning_rate": 6.387231465210428e-06, + "loss": 0.2465, + "step": 7941 + }, + { + "epoch": 0.629193899782135, + "grad_norm": 1.344573473955173, + "learning_rate": 6.384838865826567e-06, + "loss": 0.2981, + "step": 7942 + }, + { + "epoch": 0.6292731233907705, + "grad_norm": 1.2131340886699908, + "learning_rate": 6.382446504483672e-06, + "loss": 0.2531, + "step": 7943 + }, + { + "epoch": 0.6293523469994058, + "grad_norm": 1.4725237311461137, + "learning_rate": 6.380054381339267e-06, + "loss": 0.319, + "step": 7944 + }, + { + "epoch": 0.6294315706080412, + "grad_norm": 1.3522484977876823, + "learning_rate": 6.377662496550863e-06, + "loss": 0.2295, + "step": 7945 + }, + { + "epoch": 0.6295107942166766, + "grad_norm": 1.4569857053753255, + "learning_rate": 6.375270850275956e-06, + "loss": 0.2879, + "step": 7946 + }, + { + "epoch": 0.629590017825312, + "grad_norm": 1.1392354980395054, + "learning_rate": 6.37287944267202e-06, + "loss": 0.2135, + "step": 7947 + }, + { + "epoch": 0.6296692414339473, + "grad_norm": 1.4279972461409565, + "learning_rate": 6.370488273896522e-06, + "loss": 0.2462, + "step": 7948 + }, + { + "epoch": 0.6297484650425826, + "grad_norm": 1.5304215447769638, + "learning_rate": 6.368097344106905e-06, + "loss": 0.2406, + "step": 7949 + }, + { + "epoch": 0.6298276886512181, + "grad_norm": 1.3711366317779963, + "learning_rate": 6.365706653460602e-06, + "loss": 0.2472, + "step": 7950 + }, + { + "epoch": 0.6299069122598534, + "grad_norm": 1.1960743131696108, + "learning_rate": 6.363316202115033e-06, + "loss": 0.2353, + "step": 7951 + }, + { + "epoch": 0.6299861358684888, + "grad_norm": 1.2335483904974787, + "learning_rate": 6.3609259902275884e-06, + "loss": 0.1919, + "step": 7952 + }, + { + "epoch": 0.6300653594771242, + "grad_norm": 1.2573339591647876, + "learning_rate": 6.358536017955659e-06, + "loss": 0.2759, + "step": 7953 + }, + { + "epoch": 0.6301445830857596, + "grad_norm": 1.6335534015466822, + "learning_rate": 6.3561462854566135e-06, + "loss": 0.2305, + "step": 7954 + }, + { + "epoch": 0.6302238066943949, + "grad_norm": 1.50173513891403, + "learning_rate": 6.3537567928878e-06, + "loss": 0.3171, + "step": 7955 + }, + { + "epoch": 0.6303030303030303, + "grad_norm": 1.2106244228139458, + "learning_rate": 6.3513675404065575e-06, + "loss": 0.179, + "step": 7956 + }, + { + "epoch": 0.6303822539116657, + "grad_norm": 1.3410385020453481, + "learning_rate": 6.348978528170205e-06, + "loss": 0.2723, + "step": 7957 + }, + { + "epoch": 0.630461477520301, + "grad_norm": 1.1109539175552414, + "learning_rate": 6.34658975633605e-06, + "loss": 0.2144, + "step": 7958 + }, + { + "epoch": 0.6305407011289365, + "grad_norm": 1.1604329522500991, + "learning_rate": 6.344201225061382e-06, + "loss": 0.1995, + "step": 7959 + }, + { + "epoch": 0.6306199247375718, + "grad_norm": 1.5451625398131381, + "learning_rate": 6.341812934503469e-06, + "loss": 0.2753, + "step": 7960 + }, + { + "epoch": 0.6306991483462072, + "grad_norm": 1.0907712760922053, + "learning_rate": 6.339424884819574e-06, + "loss": 0.1979, + "step": 7961 + }, + { + "epoch": 0.6307783719548425, + "grad_norm": 1.4321159554861054, + "learning_rate": 6.337037076166939e-06, + "loss": 0.2376, + "step": 7962 + }, + { + "epoch": 0.6308575955634779, + "grad_norm": 1.464011371325957, + "learning_rate": 6.334649508702784e-06, + "loss": 0.25, + "step": 7963 + }, + { + "epoch": 0.6309368191721133, + "grad_norm": 1.1842917072394445, + "learning_rate": 6.332262182584325e-06, + "loss": 0.1981, + "step": 7964 + }, + { + "epoch": 0.6310160427807486, + "grad_norm": 1.6624431311533532, + "learning_rate": 6.3298750979687515e-06, + "loss": 0.2409, + "step": 7965 + }, + { + "epoch": 0.6310952663893841, + "grad_norm": 1.3020087291640237, + "learning_rate": 6.327488255013244e-06, + "loss": 0.1949, + "step": 7966 + }, + { + "epoch": 0.6311744899980194, + "grad_norm": 1.7397515464689268, + "learning_rate": 6.325101653874965e-06, + "loss": 0.3581, + "step": 7967 + }, + { + "epoch": 0.6312537136066548, + "grad_norm": 1.4357715584318613, + "learning_rate": 6.322715294711057e-06, + "loss": 0.2899, + "step": 7968 + }, + { + "epoch": 0.6313329372152902, + "grad_norm": 1.0551460725633173, + "learning_rate": 6.320329177678656e-06, + "loss": 0.2118, + "step": 7969 + }, + { + "epoch": 0.6314121608239255, + "grad_norm": 1.551773028752438, + "learning_rate": 6.31794330293487e-06, + "loss": 0.3104, + "step": 7970 + }, + { + "epoch": 0.6314913844325609, + "grad_norm": 1.3773959896894676, + "learning_rate": 6.315557670636803e-06, + "loss": 0.277, + "step": 7971 + }, + { + "epoch": 0.6315706080411962, + "grad_norm": 1.5496005972463787, + "learning_rate": 6.313172280941534e-06, + "loss": 0.3401, + "step": 7972 + }, + { + "epoch": 0.6316498316498317, + "grad_norm": 1.189422442448734, + "learning_rate": 6.31078713400613e-06, + "loss": 0.2331, + "step": 7973 + }, + { + "epoch": 0.631729055258467, + "grad_norm": 1.2019421472257314, + "learning_rate": 6.308402229987641e-06, + "loss": 0.1867, + "step": 7974 + }, + { + "epoch": 0.6318082788671024, + "grad_norm": 1.325017322609509, + "learning_rate": 6.3060175690431055e-06, + "loss": 0.2416, + "step": 7975 + }, + { + "epoch": 0.6318875024757378, + "grad_norm": 1.056175835548149, + "learning_rate": 6.303633151329535e-06, + "loss": 0.1663, + "step": 7976 + }, + { + "epoch": 0.6319667260843731, + "grad_norm": 1.3293310055862992, + "learning_rate": 6.3012489770039396e-06, + "loss": 0.2022, + "step": 7977 + }, + { + "epoch": 0.6320459496930085, + "grad_norm": 1.579073432393702, + "learning_rate": 6.2988650462232995e-06, + "loss": 0.2912, + "step": 7978 + }, + { + "epoch": 0.6321251733016439, + "grad_norm": 1.2519327769615696, + "learning_rate": 6.296481359144587e-06, + "loss": 0.1929, + "step": 7979 + }, + { + "epoch": 0.6322043969102793, + "grad_norm": 1.71041473158289, + "learning_rate": 6.29409791592476e-06, + "loss": 0.3188, + "step": 7980 + }, + { + "epoch": 0.6322836205189146, + "grad_norm": 1.1347644233140113, + "learning_rate": 6.2917147167207495e-06, + "loss": 0.1712, + "step": 7981 + }, + { + "epoch": 0.6323628441275501, + "grad_norm": 1.1843762479035767, + "learning_rate": 6.289331761689482e-06, + "loss": 0.2228, + "step": 7982 + }, + { + "epoch": 0.6324420677361854, + "grad_norm": 1.53311235433751, + "learning_rate": 6.286949050987868e-06, + "loss": 0.2868, + "step": 7983 + }, + { + "epoch": 0.6325212913448207, + "grad_norm": 1.302764049039856, + "learning_rate": 6.284566584772791e-06, + "loss": 0.2255, + "step": 7984 + }, + { + "epoch": 0.6326005149534562, + "grad_norm": 1.2957824955745165, + "learning_rate": 6.2821843632011245e-06, + "loss": 0.2041, + "step": 7985 + }, + { + "epoch": 0.6326797385620915, + "grad_norm": 1.969293870083056, + "learning_rate": 6.2798023864297315e-06, + "loss": 0.2917, + "step": 7986 + }, + { + "epoch": 0.6327589621707269, + "grad_norm": 1.2233563381305874, + "learning_rate": 6.277420654615449e-06, + "loss": 0.2188, + "step": 7987 + }, + { + "epoch": 0.6328381857793622, + "grad_norm": 1.1513267264730471, + "learning_rate": 6.275039167915103e-06, + "loss": 0.2073, + "step": 7988 + }, + { + "epoch": 0.6329174093879977, + "grad_norm": 0.9315260048976783, + "learning_rate": 6.2726579264855084e-06, + "loss": 0.1426, + "step": 7989 + }, + { + "epoch": 0.632996632996633, + "grad_norm": 1.4360417558532792, + "learning_rate": 6.270276930483451e-06, + "loss": 0.2376, + "step": 7990 + }, + { + "epoch": 0.6330758566052683, + "grad_norm": 1.1726899624308016, + "learning_rate": 6.267896180065711e-06, + "loss": 0.2248, + "step": 7991 + }, + { + "epoch": 0.6331550802139038, + "grad_norm": 0.9689849167460542, + "learning_rate": 6.265515675389053e-06, + "loss": 0.1758, + "step": 7992 + }, + { + "epoch": 0.6332343038225391, + "grad_norm": 1.6434028325602796, + "learning_rate": 6.263135416610217e-06, + "loss": 0.3037, + "step": 7993 + }, + { + "epoch": 0.6333135274311745, + "grad_norm": 1.417284940949376, + "learning_rate": 6.260755403885934e-06, + "loss": 0.3272, + "step": 7994 + }, + { + "epoch": 0.6333927510398099, + "grad_norm": 1.3356281462827087, + "learning_rate": 6.258375637372914e-06, + "loss": 0.2352, + "step": 7995 + }, + { + "epoch": 0.6334719746484453, + "grad_norm": 1.4068535366615593, + "learning_rate": 6.2559961172278545e-06, + "loss": 0.244, + "step": 7996 + }, + { + "epoch": 0.6335511982570806, + "grad_norm": 1.3966119148081728, + "learning_rate": 6.253616843607439e-06, + "loss": 0.283, + "step": 7997 + }, + { + "epoch": 0.6336304218657159, + "grad_norm": 1.3414003634640943, + "learning_rate": 6.251237816668324e-06, + "loss": 0.2806, + "step": 7998 + }, + { + "epoch": 0.6337096454743514, + "grad_norm": 1.3611285228852037, + "learning_rate": 6.248859036567162e-06, + "loss": 0.2706, + "step": 7999 + }, + { + "epoch": 0.6337888690829867, + "grad_norm": 1.3566240572591168, + "learning_rate": 6.246480503460585e-06, + "loss": 0.32, + "step": 8000 + }, + { + "epoch": 0.6338680926916221, + "grad_norm": 1.4524789934657398, + "learning_rate": 6.2441022175052034e-06, + "loss": 0.2852, + "step": 8001 + }, + { + "epoch": 0.6339473163002575, + "grad_norm": 1.3058458142880431, + "learning_rate": 6.241724178857621e-06, + "loss": 0.1823, + "step": 8002 + }, + { + "epoch": 0.6340265399088929, + "grad_norm": 1.2251111763773705, + "learning_rate": 6.2393463876744165e-06, + "loss": 0.223, + "step": 8003 + }, + { + "epoch": 0.6341057635175282, + "grad_norm": 1.332157357102398, + "learning_rate": 6.236968844112157e-06, + "loss": 0.2037, + "step": 8004 + }, + { + "epoch": 0.6341849871261636, + "grad_norm": 1.3961917006140965, + "learning_rate": 6.234591548327393e-06, + "loss": 0.3648, + "step": 8005 + }, + { + "epoch": 0.634264210734799, + "grad_norm": 1.4176685393081507, + "learning_rate": 6.232214500476657e-06, + "loss": 0.2373, + "step": 8006 + }, + { + "epoch": 0.6343434343434343, + "grad_norm": 1.3710331613135092, + "learning_rate": 6.229837700716465e-06, + "loss": 0.2875, + "step": 8007 + }, + { + "epoch": 0.6344226579520698, + "grad_norm": 1.0594316895398406, + "learning_rate": 6.227461149203324e-06, + "loss": 0.2316, + "step": 8008 + }, + { + "epoch": 0.6345018815607051, + "grad_norm": 1.1444203954850434, + "learning_rate": 6.225084846093711e-06, + "loss": 0.1773, + "step": 8009 + }, + { + "epoch": 0.6345811051693404, + "grad_norm": 1.3261424124115733, + "learning_rate": 6.222708791544098e-06, + "loss": 0.2409, + "step": 8010 + }, + { + "epoch": 0.6346603287779758, + "grad_norm": 1.3358312534447354, + "learning_rate": 6.220332985710936e-06, + "loss": 0.2852, + "step": 8011 + }, + { + "epoch": 0.6347395523866112, + "grad_norm": 1.5026752282340712, + "learning_rate": 6.21795742875066e-06, + "loss": 0.323, + "step": 8012 + }, + { + "epoch": 0.6348187759952466, + "grad_norm": 1.4066423973431426, + "learning_rate": 6.21558212081969e-06, + "loss": 0.3013, + "step": 8013 + }, + { + "epoch": 0.6348979996038819, + "grad_norm": 1.4722671985572602, + "learning_rate": 6.213207062074427e-06, + "loss": 0.2705, + "step": 8014 + }, + { + "epoch": 0.6349772232125174, + "grad_norm": 1.0601271666965861, + "learning_rate": 6.210832252671257e-06, + "loss": 0.2131, + "step": 8015 + }, + { + "epoch": 0.6350564468211527, + "grad_norm": 1.1400294517673846, + "learning_rate": 6.208457692766554e-06, + "loss": 0.1932, + "step": 8016 + }, + { + "epoch": 0.635135670429788, + "grad_norm": 1.2491108110090605, + "learning_rate": 6.206083382516665e-06, + "loss": 0.2065, + "step": 8017 + }, + { + "epoch": 0.6352148940384235, + "grad_norm": 1.321773297198763, + "learning_rate": 6.203709322077933e-06, + "loss": 0.2545, + "step": 8018 + }, + { + "epoch": 0.6352941176470588, + "grad_norm": 1.1923136976366386, + "learning_rate": 6.201335511606673e-06, + "loss": 0.2056, + "step": 8019 + }, + { + "epoch": 0.6353733412556942, + "grad_norm": 1.3347489964688852, + "learning_rate": 6.198961951259193e-06, + "loss": 0.2804, + "step": 8020 + }, + { + "epoch": 0.6354525648643295, + "grad_norm": 1.1267880587626702, + "learning_rate": 6.196588641191778e-06, + "loss": 0.2438, + "step": 8021 + }, + { + "epoch": 0.635531788472965, + "grad_norm": 1.1308980116832539, + "learning_rate": 6.194215581560701e-06, + "loss": 0.242, + "step": 8022 + }, + { + "epoch": 0.6356110120816003, + "grad_norm": 1.2779739730243986, + "learning_rate": 6.191842772522214e-06, + "loss": 0.2337, + "step": 8023 + }, + { + "epoch": 0.6356902356902356, + "grad_norm": 1.3624698490752212, + "learning_rate": 6.18947021423256e-06, + "loss": 0.3439, + "step": 8024 + }, + { + "epoch": 0.6357694592988711, + "grad_norm": 1.0973915768768137, + "learning_rate": 6.187097906847954e-06, + "loss": 0.1771, + "step": 8025 + }, + { + "epoch": 0.6358486829075064, + "grad_norm": 1.467508941145104, + "learning_rate": 6.184725850524608e-06, + "loss": 0.2596, + "step": 8026 + }, + { + "epoch": 0.6359279065161418, + "grad_norm": 1.522003872481998, + "learning_rate": 6.182354045418704e-06, + "loss": 0.2466, + "step": 8027 + }, + { + "epoch": 0.6360071301247772, + "grad_norm": 1.3645213643982332, + "learning_rate": 6.179982491686416e-06, + "loss": 0.221, + "step": 8028 + }, + { + "epoch": 0.6360863537334126, + "grad_norm": 1.2548595312537048, + "learning_rate": 6.177611189483903e-06, + "loss": 0.2466, + "step": 8029 + }, + { + "epoch": 0.6361655773420479, + "grad_norm": 1.106055212766811, + "learning_rate": 6.175240138967299e-06, + "loss": 0.2443, + "step": 8030 + }, + { + "epoch": 0.6362448009506833, + "grad_norm": 1.3466327499411583, + "learning_rate": 6.172869340292729e-06, + "loss": 0.2148, + "step": 8031 + }, + { + "epoch": 0.6363240245593187, + "grad_norm": 1.021235311324439, + "learning_rate": 6.170498793616298e-06, + "loss": 0.1594, + "step": 8032 + }, + { + "epoch": 0.636403248167954, + "grad_norm": 1.2045430605283585, + "learning_rate": 6.168128499094095e-06, + "loss": 0.1911, + "step": 8033 + }, + { + "epoch": 0.6364824717765895, + "grad_norm": 1.374392964832849, + "learning_rate": 6.165758456882193e-06, + "loss": 0.2621, + "step": 8034 + }, + { + "epoch": 0.6365616953852248, + "grad_norm": 1.2347304732789428, + "learning_rate": 6.163388667136646e-06, + "loss": 0.2421, + "step": 8035 + }, + { + "epoch": 0.6366409189938602, + "grad_norm": 1.2669424764477113, + "learning_rate": 6.161019130013495e-06, + "loss": 0.223, + "step": 8036 + }, + { + "epoch": 0.6367201426024955, + "grad_norm": 1.4873313119263174, + "learning_rate": 6.158649845668764e-06, + "loss": 0.2653, + "step": 8037 + }, + { + "epoch": 0.6367993662111309, + "grad_norm": 0.9596454291776326, + "learning_rate": 6.156280814258455e-06, + "loss": 0.1501, + "step": 8038 + }, + { + "epoch": 0.6368785898197663, + "grad_norm": 1.200643575966026, + "learning_rate": 6.153912035938559e-06, + "loss": 0.1945, + "step": 8039 + }, + { + "epoch": 0.6369578134284016, + "grad_norm": 1.1927866166700951, + "learning_rate": 6.151543510865053e-06, + "loss": 0.2033, + "step": 8040 + }, + { + "epoch": 0.6370370370370371, + "grad_norm": 1.2795686409282219, + "learning_rate": 6.149175239193887e-06, + "loss": 0.2985, + "step": 8041 + }, + { + "epoch": 0.6371162606456724, + "grad_norm": 0.9553649361006948, + "learning_rate": 6.1468072210810035e-06, + "loss": 0.1341, + "step": 8042 + }, + { + "epoch": 0.6371954842543078, + "grad_norm": 1.5852703901838598, + "learning_rate": 6.144439456682323e-06, + "loss": 0.329, + "step": 8043 + }, + { + "epoch": 0.6372747078629432, + "grad_norm": 1.482306706139458, + "learning_rate": 6.142071946153751e-06, + "loss": 0.2806, + "step": 8044 + }, + { + "epoch": 0.6373539314715785, + "grad_norm": 1.1460548324101296, + "learning_rate": 6.139704689651181e-06, + "loss": 0.1685, + "step": 8045 + }, + { + "epoch": 0.6374331550802139, + "grad_norm": 1.626341732291147, + "learning_rate": 6.1373376873304814e-06, + "loss": 0.2091, + "step": 8046 + }, + { + "epoch": 0.6375123786888492, + "grad_norm": 1.001152699744232, + "learning_rate": 6.134970939347511e-06, + "loss": 0.1285, + "step": 8047 + }, + { + "epoch": 0.6375916022974847, + "grad_norm": 1.008053339614477, + "learning_rate": 6.132604445858104e-06, + "loss": 0.1458, + "step": 8048 + }, + { + "epoch": 0.63767082590612, + "grad_norm": 1.1157277496227973, + "learning_rate": 6.130238207018085e-06, + "loss": 0.1716, + "step": 8049 + }, + { + "epoch": 0.6377500495147554, + "grad_norm": 1.6419352984436941, + "learning_rate": 6.127872222983264e-06, + "loss": 0.276, + "step": 8050 + }, + { + "epoch": 0.6378292731233908, + "grad_norm": 1.20600993351154, + "learning_rate": 6.125506493909422e-06, + "loss": 0.208, + "step": 8051 + }, + { + "epoch": 0.6379084967320261, + "grad_norm": 1.566105491328064, + "learning_rate": 6.123141019952334e-06, + "loss": 0.255, + "step": 8052 + }, + { + "epoch": 0.6379877203406615, + "grad_norm": 1.4479775924949587, + "learning_rate": 6.1207758012677595e-06, + "loss": 0.2486, + "step": 8053 + }, + { + "epoch": 0.6380669439492969, + "grad_norm": 1.2511611696692906, + "learning_rate": 6.11841083801143e-06, + "loss": 0.2055, + "step": 8054 + }, + { + "epoch": 0.6381461675579323, + "grad_norm": 1.7104731442978196, + "learning_rate": 6.116046130339073e-06, + "loss": 0.295, + "step": 8055 + }, + { + "epoch": 0.6382253911665676, + "grad_norm": 1.7903796169589645, + "learning_rate": 6.1136816784063855e-06, + "loss": 0.3202, + "step": 8056 + }, + { + "epoch": 0.6383046147752031, + "grad_norm": 1.349185893522902, + "learning_rate": 6.1113174823690615e-06, + "loss": 0.2162, + "step": 8057 + }, + { + "epoch": 0.6383838383838384, + "grad_norm": 1.3086956141157358, + "learning_rate": 6.108953542382771e-06, + "loss": 0.2543, + "step": 8058 + }, + { + "epoch": 0.6384630619924737, + "grad_norm": 1.141986189379648, + "learning_rate": 6.106589858603167e-06, + "loss": 0.1621, + "step": 8059 + }, + { + "epoch": 0.6385422856011091, + "grad_norm": 1.43661214316852, + "learning_rate": 6.1042264311858845e-06, + "loss": 0.3148, + "step": 8060 + }, + { + "epoch": 0.6386215092097445, + "grad_norm": 1.8620339954626937, + "learning_rate": 6.101863260286551e-06, + "loss": 0.3747, + "step": 8061 + }, + { + "epoch": 0.6387007328183799, + "grad_norm": 1.0105018161183208, + "learning_rate": 6.099500346060765e-06, + "loss": 0.1521, + "step": 8062 + }, + { + "epoch": 0.6387799564270152, + "grad_norm": 1.2111547571213945, + "learning_rate": 6.09713768866411e-06, + "loss": 0.2178, + "step": 8063 + }, + { + "epoch": 0.6388591800356507, + "grad_norm": 1.1808399021789684, + "learning_rate": 6.094775288252157e-06, + "loss": 0.1908, + "step": 8064 + }, + { + "epoch": 0.638938403644286, + "grad_norm": 1.3504727635905611, + "learning_rate": 6.092413144980465e-06, + "loss": 0.2443, + "step": 8065 + }, + { + "epoch": 0.6390176272529213, + "grad_norm": 1.3454894316807033, + "learning_rate": 6.090051259004563e-06, + "loss": 0.2283, + "step": 8066 + }, + { + "epoch": 0.6390968508615568, + "grad_norm": 1.3156447316691806, + "learning_rate": 6.087689630479974e-06, + "loss": 0.2693, + "step": 8067 + }, + { + "epoch": 0.6391760744701921, + "grad_norm": 1.6816845594710903, + "learning_rate": 6.085328259562195e-06, + "loss": 0.2772, + "step": 8068 + }, + { + "epoch": 0.6392552980788275, + "grad_norm": 1.148164657575566, + "learning_rate": 6.082967146406714e-06, + "loss": 0.187, + "step": 8069 + }, + { + "epoch": 0.6393345216874629, + "grad_norm": 1.5950127381559818, + "learning_rate": 6.0806062911690025e-06, + "loss": 0.2589, + "step": 8070 + }, + { + "epoch": 0.6394137452960983, + "grad_norm": 1.2873938690263296, + "learning_rate": 6.078245694004503e-06, + "loss": 0.2335, + "step": 8071 + }, + { + "epoch": 0.6394929689047336, + "grad_norm": 1.4644729728192902, + "learning_rate": 6.075885355068658e-06, + "loss": 0.2976, + "step": 8072 + }, + { + "epoch": 0.6395721925133689, + "grad_norm": 1.421648315784826, + "learning_rate": 6.073525274516879e-06, + "loss": 0.2411, + "step": 8073 + }, + { + "epoch": 0.6396514161220044, + "grad_norm": 1.1954237777976786, + "learning_rate": 6.071165452504568e-06, + "loss": 0.2523, + "step": 8074 + }, + { + "epoch": 0.6397306397306397, + "grad_norm": 1.3567725475206658, + "learning_rate": 6.068805889187109e-06, + "loss": 0.2488, + "step": 8075 + }, + { + "epoch": 0.6398098633392751, + "grad_norm": 1.2934966069985312, + "learning_rate": 6.066446584719864e-06, + "loss": 0.2265, + "step": 8076 + }, + { + "epoch": 0.6398890869479105, + "grad_norm": 1.144472010160506, + "learning_rate": 6.064087539258186e-06, + "loss": 0.253, + "step": 8077 + }, + { + "epoch": 0.6399683105565459, + "grad_norm": 1.3979526248631313, + "learning_rate": 6.061728752957406e-06, + "loss": 0.2382, + "step": 8078 + }, + { + "epoch": 0.6400475341651812, + "grad_norm": 1.4491552380755233, + "learning_rate": 6.059370225972834e-06, + "loss": 0.2756, + "step": 8079 + }, + { + "epoch": 0.6401267577738166, + "grad_norm": 1.3682277452711427, + "learning_rate": 6.057011958459776e-06, + "loss": 0.221, + "step": 8080 + }, + { + "epoch": 0.640205981382452, + "grad_norm": 1.5430427389900436, + "learning_rate": 6.0546539505735055e-06, + "loss": 0.23, + "step": 8081 + }, + { + "epoch": 0.6402852049910873, + "grad_norm": 1.7759455149578112, + "learning_rate": 6.052296202469288e-06, + "loss": 0.3947, + "step": 8082 + }, + { + "epoch": 0.6403644285997228, + "grad_norm": 1.6337039711604564, + "learning_rate": 6.049938714302372e-06, + "loss": 0.3786, + "step": 8083 + }, + { + "epoch": 0.6404436522083581, + "grad_norm": 1.111737770160778, + "learning_rate": 6.047581486227984e-06, + "loss": 0.1883, + "step": 8084 + }, + { + "epoch": 0.6405228758169934, + "grad_norm": 1.1198638871192923, + "learning_rate": 6.045224518401338e-06, + "loss": 0.2154, + "step": 8085 + }, + { + "epoch": 0.6406020994256288, + "grad_norm": 2.4032904982201178, + "learning_rate": 6.04286781097763e-06, + "loss": 0.3569, + "step": 8086 + }, + { + "epoch": 0.6406813230342642, + "grad_norm": 1.420201500240393, + "learning_rate": 6.040511364112034e-06, + "loss": 0.2391, + "step": 8087 + }, + { + "epoch": 0.6407605466428996, + "grad_norm": 1.3607410030656533, + "learning_rate": 6.038155177959715e-06, + "loss": 0.2567, + "step": 8088 + }, + { + "epoch": 0.6408397702515349, + "grad_norm": 1.1934694222060973, + "learning_rate": 6.035799252675811e-06, + "loss": 0.1822, + "step": 8089 + }, + { + "epoch": 0.6409189938601704, + "grad_norm": 1.272895635947715, + "learning_rate": 6.0334435884154526e-06, + "loss": 0.2073, + "step": 8090 + }, + { + "epoch": 0.6409982174688057, + "grad_norm": 1.6594473483004053, + "learning_rate": 6.031088185333751e-06, + "loss": 0.3758, + "step": 8091 + }, + { + "epoch": 0.641077441077441, + "grad_norm": 1.4848538734620187, + "learning_rate": 6.028733043585793e-06, + "loss": 0.2321, + "step": 8092 + }, + { + "epoch": 0.6411566646860765, + "grad_norm": 1.3133230616462108, + "learning_rate": 6.026378163326654e-06, + "loss": 0.2826, + "step": 8093 + }, + { + "epoch": 0.6412358882947118, + "grad_norm": 1.0856353463652906, + "learning_rate": 6.024023544711396e-06, + "loss": 0.1959, + "step": 8094 + }, + { + "epoch": 0.6413151119033472, + "grad_norm": 1.3612629007419905, + "learning_rate": 6.021669187895054e-06, + "loss": 0.2353, + "step": 8095 + }, + { + "epoch": 0.6413943355119825, + "grad_norm": 1.3690513963544237, + "learning_rate": 6.019315093032656e-06, + "loss": 0.3015, + "step": 8096 + }, + { + "epoch": 0.641473559120618, + "grad_norm": 1.407255136875795, + "learning_rate": 6.016961260279204e-06, + "loss": 0.2593, + "step": 8097 + }, + { + "epoch": 0.6415527827292533, + "grad_norm": 1.1827693175835738, + "learning_rate": 6.0146076897896865e-06, + "loss": 0.2307, + "step": 8098 + }, + { + "epoch": 0.6416320063378886, + "grad_norm": 1.7272292197631658, + "learning_rate": 6.012254381719078e-06, + "loss": 0.3228, + "step": 8099 + }, + { + "epoch": 0.6417112299465241, + "grad_norm": 1.1978680700700264, + "learning_rate": 6.0099013362223305e-06, + "loss": 0.2156, + "step": 8100 + }, + { + "epoch": 0.6417904535551594, + "grad_norm": 1.4270642523243047, + "learning_rate": 6.007548553454379e-06, + "loss": 0.2831, + "step": 8101 + }, + { + "epoch": 0.6418696771637948, + "grad_norm": 1.2915990697196864, + "learning_rate": 6.005196033570147e-06, + "loss": 0.251, + "step": 8102 + }, + { + "epoch": 0.6419489007724302, + "grad_norm": 1.667631554960559, + "learning_rate": 6.002843776724534e-06, + "loss": 0.3437, + "step": 8103 + }, + { + "epoch": 0.6420281243810656, + "grad_norm": 1.2014071194813634, + "learning_rate": 6.000491783072426e-06, + "loss": 0.2435, + "step": 8104 + }, + { + "epoch": 0.6421073479897009, + "grad_norm": 1.252933854040516, + "learning_rate": 5.998140052768687e-06, + "loss": 0.2237, + "step": 8105 + }, + { + "epoch": 0.6421865715983363, + "grad_norm": 1.2022151399151069, + "learning_rate": 5.995788585968171e-06, + "loss": 0.2314, + "step": 8106 + }, + { + "epoch": 0.6422657952069717, + "grad_norm": 1.1784643893099416, + "learning_rate": 5.993437382825711e-06, + "loss": 0.1883, + "step": 8107 + }, + { + "epoch": 0.642345018815607, + "grad_norm": 1.1422486946810984, + "learning_rate": 5.991086443496119e-06, + "loss": 0.1828, + "step": 8108 + }, + { + "epoch": 0.6424242424242425, + "grad_norm": 1.3525221768082902, + "learning_rate": 5.9887357681341955e-06, + "loss": 0.2166, + "step": 8109 + }, + { + "epoch": 0.6425034660328778, + "grad_norm": 1.4813949540614977, + "learning_rate": 5.9863853568947215e-06, + "loss": 0.3234, + "step": 8110 + }, + { + "epoch": 0.6425826896415132, + "grad_norm": 1.1455857166825378, + "learning_rate": 5.9840352099324595e-06, + "loss": 0.2065, + "step": 8111 + }, + { + "epoch": 0.6426619132501485, + "grad_norm": 1.0984746551442057, + "learning_rate": 5.981685327402156e-06, + "loss": 0.1567, + "step": 8112 + }, + { + "epoch": 0.6427411368587839, + "grad_norm": 1.2642006811308861, + "learning_rate": 5.9793357094585365e-06, + "loss": 0.2269, + "step": 8113 + }, + { + "epoch": 0.6428203604674193, + "grad_norm": 1.4220520740083904, + "learning_rate": 5.976986356256316e-06, + "loss": 0.2324, + "step": 8114 + }, + { + "epoch": 0.6428995840760546, + "grad_norm": 1.1291914443064694, + "learning_rate": 5.974637267950187e-06, + "loss": 0.2274, + "step": 8115 + }, + { + "epoch": 0.6429788076846901, + "grad_norm": 1.247434955682266, + "learning_rate": 5.972288444694822e-06, + "loss": 0.233, + "step": 8116 + }, + { + "epoch": 0.6430580312933254, + "grad_norm": 1.4103680739157824, + "learning_rate": 5.9699398866448846e-06, + "loss": 0.2528, + "step": 8117 + }, + { + "epoch": 0.6431372549019608, + "grad_norm": 1.4585774943281162, + "learning_rate": 5.967591593955016e-06, + "loss": 0.2967, + "step": 8118 + }, + { + "epoch": 0.6432164785105962, + "grad_norm": 1.098661328588494, + "learning_rate": 5.965243566779837e-06, + "loss": 0.2398, + "step": 8119 + }, + { + "epoch": 0.6432957021192315, + "grad_norm": 1.6583771941322858, + "learning_rate": 5.962895805273956e-06, + "loss": 0.3312, + "step": 8120 + }, + { + "epoch": 0.6433749257278669, + "grad_norm": 1.273705229997679, + "learning_rate": 5.960548309591958e-06, + "loss": 0.2072, + "step": 8121 + }, + { + "epoch": 0.6434541493365022, + "grad_norm": 1.6771965777271247, + "learning_rate": 5.958201079888419e-06, + "loss": 0.2701, + "step": 8122 + }, + { + "epoch": 0.6435333729451377, + "grad_norm": 1.6019870082854617, + "learning_rate": 5.9558541163178915e-06, + "loss": 0.2773, + "step": 8123 + }, + { + "epoch": 0.643612596553773, + "grad_norm": 1.4033284501363137, + "learning_rate": 5.953507419034911e-06, + "loss": 0.2614, + "step": 8124 + }, + { + "epoch": 0.6436918201624084, + "grad_norm": 1.4180770775533054, + "learning_rate": 5.951160988193998e-06, + "loss": 0.3666, + "step": 8125 + }, + { + "epoch": 0.6437710437710438, + "grad_norm": 1.2279536714428312, + "learning_rate": 5.948814823949649e-06, + "loss": 0.2352, + "step": 8126 + }, + { + "epoch": 0.6438502673796791, + "grad_norm": 1.3640400335928415, + "learning_rate": 5.946468926456352e-06, + "loss": 0.2319, + "step": 8127 + }, + { + "epoch": 0.6439294909883145, + "grad_norm": 0.9992245033639103, + "learning_rate": 5.944123295868574e-06, + "loss": 0.1401, + "step": 8128 + }, + { + "epoch": 0.6440087145969499, + "grad_norm": 1.3656269910040781, + "learning_rate": 5.9417779323407576e-06, + "loss": 0.2585, + "step": 8129 + }, + { + "epoch": 0.6440879382055853, + "grad_norm": 1.4782410547695937, + "learning_rate": 5.939432836027339e-06, + "loss": 0.2431, + "step": 8130 + }, + { + "epoch": 0.6441671618142206, + "grad_norm": 1.2938615571457268, + "learning_rate": 5.937088007082731e-06, + "loss": 0.2073, + "step": 8131 + }, + { + "epoch": 0.6442463854228561, + "grad_norm": 1.195901103286782, + "learning_rate": 5.934743445661326e-06, + "loss": 0.243, + "step": 8132 + }, + { + "epoch": 0.6443256090314914, + "grad_norm": 1.4971039839426732, + "learning_rate": 5.932399151917507e-06, + "loss": 0.2666, + "step": 8133 + }, + { + "epoch": 0.6444048326401267, + "grad_norm": 1.292911705123545, + "learning_rate": 5.93005512600563e-06, + "loss": 0.2669, + "step": 8134 + }, + { + "epoch": 0.6444840562487621, + "grad_norm": 1.28628281896135, + "learning_rate": 5.92771136808004e-06, + "loss": 0.2629, + "step": 8135 + }, + { + "epoch": 0.6445632798573975, + "grad_norm": 1.3083046895397594, + "learning_rate": 5.925367878295063e-06, + "loss": 0.2365, + "step": 8136 + }, + { + "epoch": 0.6446425034660329, + "grad_norm": 1.4778166867537974, + "learning_rate": 5.9230246568050035e-06, + "loss": 0.2244, + "step": 8137 + }, + { + "epoch": 0.6447217270746682, + "grad_norm": 1.1986066038734173, + "learning_rate": 5.920681703764153e-06, + "loss": 0.1907, + "step": 8138 + }, + { + "epoch": 0.6448009506833037, + "grad_norm": 1.4055921060539385, + "learning_rate": 5.918339019326789e-06, + "loss": 0.2382, + "step": 8139 + }, + { + "epoch": 0.644880174291939, + "grad_norm": 1.3909759227487568, + "learning_rate": 5.915996603647157e-06, + "loss": 0.2749, + "step": 8140 + }, + { + "epoch": 0.6449593979005743, + "grad_norm": 1.6910052400495772, + "learning_rate": 5.913654456879496e-06, + "loss": 0.3476, + "step": 8141 + }, + { + "epoch": 0.6450386215092098, + "grad_norm": 1.4956247990378597, + "learning_rate": 5.911312579178028e-06, + "loss": 0.321, + "step": 8142 + }, + { + "epoch": 0.6451178451178451, + "grad_norm": 1.3955388581533108, + "learning_rate": 5.908970970696955e-06, + "loss": 0.2963, + "step": 8143 + }, + { + "epoch": 0.6451970687264805, + "grad_norm": 1.2812517471902314, + "learning_rate": 5.906629631590457e-06, + "loss": 0.1798, + "step": 8144 + }, + { + "epoch": 0.6452762923351159, + "grad_norm": 1.3023495715682363, + "learning_rate": 5.904288562012703e-06, + "loss": 0.1976, + "step": 8145 + }, + { + "epoch": 0.6453555159437513, + "grad_norm": 1.1957466887544972, + "learning_rate": 5.901947762117838e-06, + "loss": 0.2409, + "step": 8146 + }, + { + "epoch": 0.6454347395523866, + "grad_norm": 1.2181002182020895, + "learning_rate": 5.899607232059994e-06, + "loss": 0.2302, + "step": 8147 + }, + { + "epoch": 0.6455139631610219, + "grad_norm": 1.3383646556688498, + "learning_rate": 5.897266971993286e-06, + "loss": 0.3127, + "step": 8148 + }, + { + "epoch": 0.6455931867696574, + "grad_norm": 1.3013278544632247, + "learning_rate": 5.894926982071805e-06, + "loss": 0.2451, + "step": 8149 + }, + { + "epoch": 0.6456724103782927, + "grad_norm": 1.455135644873426, + "learning_rate": 5.892587262449631e-06, + "loss": 0.2684, + "step": 8150 + }, + { + "epoch": 0.6457516339869281, + "grad_norm": 1.3949670248958077, + "learning_rate": 5.890247813280822e-06, + "loss": 0.2396, + "step": 8151 + }, + { + "epoch": 0.6458308575955635, + "grad_norm": 1.4654976448807884, + "learning_rate": 5.8879086347194196e-06, + "loss": 0.2395, + "step": 8152 + }, + { + "epoch": 0.6459100812041989, + "grad_norm": 1.2225795709262621, + "learning_rate": 5.885569726919449e-06, + "loss": 0.2041, + "step": 8153 + }, + { + "epoch": 0.6459893048128342, + "grad_norm": 1.2257139244742783, + "learning_rate": 5.883231090034911e-06, + "loss": 0.2311, + "step": 8154 + }, + { + "epoch": 0.6460685284214696, + "grad_norm": 1.5088672370952754, + "learning_rate": 5.8808927242197984e-06, + "loss": 0.3058, + "step": 8155 + }, + { + "epoch": 0.646147752030105, + "grad_norm": 1.2286795373313426, + "learning_rate": 5.878554629628081e-06, + "loss": 0.2665, + "step": 8156 + }, + { + "epoch": 0.6462269756387403, + "grad_norm": 1.1509846666770909, + "learning_rate": 5.87621680641371e-06, + "loss": 0.2457, + "step": 8157 + }, + { + "epoch": 0.6463061992473758, + "grad_norm": 1.249818567896011, + "learning_rate": 5.873879254730621e-06, + "loss": 0.179, + "step": 8158 + }, + { + "epoch": 0.6463854228560111, + "grad_norm": 1.2229891677882212, + "learning_rate": 5.871541974732727e-06, + "loss": 0.1548, + "step": 8159 + }, + { + "epoch": 0.6464646464646465, + "grad_norm": 1.3046319716154064, + "learning_rate": 5.869204966573929e-06, + "loss": 0.2272, + "step": 8160 + }, + { + "epoch": 0.6465438700732818, + "grad_norm": 1.4202659233052892, + "learning_rate": 5.866868230408111e-06, + "loss": 0.246, + "step": 8161 + }, + { + "epoch": 0.6466230936819172, + "grad_norm": 1.5452830360718968, + "learning_rate": 5.86453176638913e-06, + "loss": 0.2401, + "step": 8162 + }, + { + "epoch": 0.6467023172905526, + "grad_norm": 1.4251130783891992, + "learning_rate": 5.862195574670834e-06, + "loss": 0.2375, + "step": 8163 + }, + { + "epoch": 0.6467815408991879, + "grad_norm": 1.4304510655614806, + "learning_rate": 5.85985965540705e-06, + "loss": 0.3316, + "step": 8164 + }, + { + "epoch": 0.6468607645078234, + "grad_norm": 1.3000918366011986, + "learning_rate": 5.857524008751586e-06, + "loss": 0.186, + "step": 8165 + }, + { + "epoch": 0.6469399881164587, + "grad_norm": 1.382534595387525, + "learning_rate": 5.855188634858235e-06, + "loss": 0.3204, + "step": 8166 + }, + { + "epoch": 0.647019211725094, + "grad_norm": 1.3098944138557873, + "learning_rate": 5.852853533880768e-06, + "loss": 0.2021, + "step": 8167 + }, + { + "epoch": 0.6470984353337295, + "grad_norm": 1.384742778393619, + "learning_rate": 5.850518705972941e-06, + "loss": 0.2784, + "step": 8168 + }, + { + "epoch": 0.6471776589423648, + "grad_norm": 1.2976537575609843, + "learning_rate": 5.848184151288492e-06, + "loss": 0.2723, + "step": 8169 + }, + { + "epoch": 0.6472568825510002, + "grad_norm": 1.2891180624230896, + "learning_rate": 5.845849869981137e-06, + "loss": 0.2111, + "step": 8170 + }, + { + "epoch": 0.6473361061596355, + "grad_norm": 1.742309750335544, + "learning_rate": 5.843515862204581e-06, + "loss": 0.3064, + "step": 8171 + }, + { + "epoch": 0.647415329768271, + "grad_norm": 1.3321958610973492, + "learning_rate": 5.841182128112506e-06, + "loss": 0.2421, + "step": 8172 + }, + { + "epoch": 0.6474945533769063, + "grad_norm": 1.220068248213159, + "learning_rate": 5.838848667858577e-06, + "loss": 0.2585, + "step": 8173 + }, + { + "epoch": 0.6475737769855416, + "grad_norm": 1.1799814246518199, + "learning_rate": 5.83651548159644e-06, + "loss": 0.1991, + "step": 8174 + }, + { + "epoch": 0.6476530005941771, + "grad_norm": 1.4795636129522518, + "learning_rate": 5.834182569479727e-06, + "loss": 0.2515, + "step": 8175 + }, + { + "epoch": 0.6477322242028124, + "grad_norm": 0.991585093191464, + "learning_rate": 5.831849931662047e-06, + "loss": 0.1368, + "step": 8176 + }, + { + "epoch": 0.6478114478114478, + "grad_norm": 1.1468145182399327, + "learning_rate": 5.829517568296989e-06, + "loss": 0.1973, + "step": 8177 + }, + { + "epoch": 0.6478906714200832, + "grad_norm": 1.5411334228179303, + "learning_rate": 5.827185479538138e-06, + "loss": 0.3021, + "step": 8178 + }, + { + "epoch": 0.6479698950287186, + "grad_norm": 1.3809742617805387, + "learning_rate": 5.824853665539043e-06, + "loss": 0.2363, + "step": 8179 + }, + { + "epoch": 0.6480491186373539, + "grad_norm": 1.3884944220143203, + "learning_rate": 5.82252212645324e-06, + "loss": 0.2015, + "step": 8180 + }, + { + "epoch": 0.6481283422459893, + "grad_norm": 1.3504039071335587, + "learning_rate": 5.820190862434259e-06, + "loss": 0.2755, + "step": 8181 + }, + { + "epoch": 0.6482075658546247, + "grad_norm": 1.5062850129582808, + "learning_rate": 5.8178598736355985e-06, + "loss": 0.2207, + "step": 8182 + }, + { + "epoch": 0.64828678946326, + "grad_norm": 1.3468532382164187, + "learning_rate": 5.815529160210738e-06, + "loss": 0.2389, + "step": 8183 + }, + { + "epoch": 0.6483660130718955, + "grad_norm": 1.2873584372407643, + "learning_rate": 5.813198722313151e-06, + "loss": 0.2538, + "step": 8184 + }, + { + "epoch": 0.6484452366805308, + "grad_norm": 1.2147192141188234, + "learning_rate": 5.810868560096283e-06, + "loss": 0.2408, + "step": 8185 + }, + { + "epoch": 0.6485244602891662, + "grad_norm": 1.150210873959303, + "learning_rate": 5.808538673713564e-06, + "loss": 0.179, + "step": 8186 + }, + { + "epoch": 0.6486036838978015, + "grad_norm": 1.5514435555091965, + "learning_rate": 5.8062090633184e-06, + "loss": 0.2602, + "step": 8187 + }, + { + "epoch": 0.6486829075064369, + "grad_norm": 1.3612607417387983, + "learning_rate": 5.803879729064195e-06, + "loss": 0.1671, + "step": 8188 + }, + { + "epoch": 0.6487621311150723, + "grad_norm": 1.5891191175421429, + "learning_rate": 5.801550671104319e-06, + "loss": 0.3012, + "step": 8189 + }, + { + "epoch": 0.6488413547237076, + "grad_norm": 0.9579452311161174, + "learning_rate": 5.7992218895921256e-06, + "loss": 0.1441, + "step": 8190 + }, + { + "epoch": 0.6489205783323431, + "grad_norm": 1.2491284816231087, + "learning_rate": 5.796893384680964e-06, + "loss": 0.2324, + "step": 8191 + }, + { + "epoch": 0.6489998019409784, + "grad_norm": 1.152755292113233, + "learning_rate": 5.7945651565241455e-06, + "loss": 0.1983, + "step": 8192 + }, + { + "epoch": 0.6490790255496138, + "grad_norm": 1.4206019671704502, + "learning_rate": 5.792237205274974e-06, + "loss": 0.2531, + "step": 8193 + }, + { + "epoch": 0.6491582491582492, + "grad_norm": 1.4044823750296693, + "learning_rate": 5.789909531086741e-06, + "loss": 0.3426, + "step": 8194 + }, + { + "epoch": 0.6492374727668845, + "grad_norm": 1.4364212142703516, + "learning_rate": 5.787582134112706e-06, + "loss": 0.2413, + "step": 8195 + }, + { + "epoch": 0.6493166963755199, + "grad_norm": 1.064031031591425, + "learning_rate": 5.785255014506115e-06, + "loss": 0.2006, + "step": 8196 + }, + { + "epoch": 0.6493959199841552, + "grad_norm": 1.371714606810431, + "learning_rate": 5.782928172420206e-06, + "loss": 0.2549, + "step": 8197 + }, + { + "epoch": 0.6494751435927907, + "grad_norm": 1.4441606154563482, + "learning_rate": 5.780601608008185e-06, + "loss": 0.2654, + "step": 8198 + }, + { + "epoch": 0.649554367201426, + "grad_norm": 1.4719931528968992, + "learning_rate": 5.778275321423241e-06, + "loss": 0.3326, + "step": 8199 + }, + { + "epoch": 0.6496335908100614, + "grad_norm": 1.4809493585096092, + "learning_rate": 5.7759493128185584e-06, + "loss": 0.2896, + "step": 8200 + }, + { + "epoch": 0.6497128144186968, + "grad_norm": 1.659886742655029, + "learning_rate": 5.773623582347289e-06, + "loss": 0.2966, + "step": 8201 + }, + { + "epoch": 0.6497920380273321, + "grad_norm": 1.4284371556711388, + "learning_rate": 5.77129813016257e-06, + "loss": 0.3075, + "step": 8202 + }, + { + "epoch": 0.6498712616359675, + "grad_norm": 1.3929331099693574, + "learning_rate": 5.768972956417518e-06, + "loss": 0.2894, + "step": 8203 + }, + { + "epoch": 0.6499504852446029, + "grad_norm": 1.229699544949765, + "learning_rate": 5.766648061265242e-06, + "loss": 0.1881, + "step": 8204 + }, + { + "epoch": 0.6500297088532383, + "grad_norm": 1.4991288856567153, + "learning_rate": 5.764323444858823e-06, + "loss": 0.2548, + "step": 8205 + }, + { + "epoch": 0.6501089324618736, + "grad_norm": 1.4754548644404095, + "learning_rate": 5.761999107351319e-06, + "loss": 0.2941, + "step": 8206 + }, + { + "epoch": 0.6501881560705091, + "grad_norm": 1.288205072146522, + "learning_rate": 5.759675048895785e-06, + "loss": 0.2971, + "step": 8207 + }, + { + "epoch": 0.6502673796791444, + "grad_norm": 1.7057389019166083, + "learning_rate": 5.757351269645248e-06, + "loss": 0.4096, + "step": 8208 + }, + { + "epoch": 0.6503466032877797, + "grad_norm": 1.1658150089316675, + "learning_rate": 5.75502776975271e-06, + "loss": 0.2161, + "step": 8209 + }, + { + "epoch": 0.6504258268964151, + "grad_norm": 1.3783310309691115, + "learning_rate": 5.752704549371173e-06, + "loss": 0.2188, + "step": 8210 + }, + { + "epoch": 0.6505050505050505, + "grad_norm": 1.4202501898947362, + "learning_rate": 5.750381608653605e-06, + "loss": 0.2834, + "step": 8211 + }, + { + "epoch": 0.6505842741136859, + "grad_norm": 1.1136144699359627, + "learning_rate": 5.748058947752955e-06, + "loss": 0.1744, + "step": 8212 + }, + { + "epoch": 0.6506634977223212, + "grad_norm": 1.1337237879831854, + "learning_rate": 5.745736566822169e-06, + "loss": 0.2105, + "step": 8213 + }, + { + "epoch": 0.6507427213309567, + "grad_norm": 1.4534454354028474, + "learning_rate": 5.743414466014159e-06, + "loss": 0.3502, + "step": 8214 + }, + { + "epoch": 0.650821944939592, + "grad_norm": 1.4728076953042555, + "learning_rate": 5.7410926454818265e-06, + "loss": 0.3483, + "step": 8215 + }, + { + "epoch": 0.6509011685482273, + "grad_norm": 1.2762477065054285, + "learning_rate": 5.738771105378046e-06, + "loss": 0.2245, + "step": 8216 + }, + { + "epoch": 0.6509803921568628, + "grad_norm": 1.6185604962932965, + "learning_rate": 5.7364498458556914e-06, + "loss": 0.3277, + "step": 8217 + }, + { + "epoch": 0.6510596157654981, + "grad_norm": 1.3256756646747319, + "learning_rate": 5.734128867067593e-06, + "loss": 0.2721, + "step": 8218 + }, + { + "epoch": 0.6511388393741335, + "grad_norm": 1.2585430603388206, + "learning_rate": 5.731808169166586e-06, + "loss": 0.23, + "step": 8219 + }, + { + "epoch": 0.6512180629827689, + "grad_norm": 1.4410735344608805, + "learning_rate": 5.7294877523054735e-06, + "loss": 0.2193, + "step": 8220 + }, + { + "epoch": 0.6512972865914043, + "grad_norm": 1.3540027586279855, + "learning_rate": 5.727167616637042e-06, + "loss": 0.2816, + "step": 8221 + }, + { + "epoch": 0.6513765102000396, + "grad_norm": 1.5948557006961954, + "learning_rate": 5.7248477623140655e-06, + "loss": 0.2955, + "step": 8222 + }, + { + "epoch": 0.6514557338086749, + "grad_norm": 1.1882125275000315, + "learning_rate": 5.722528189489294e-06, + "loss": 0.2069, + "step": 8223 + }, + { + "epoch": 0.6515349574173104, + "grad_norm": 1.319626426826207, + "learning_rate": 5.720208898315454e-06, + "loss": 0.3017, + "step": 8224 + }, + { + "epoch": 0.6516141810259457, + "grad_norm": 1.6088370067367068, + "learning_rate": 5.717889888945271e-06, + "loss": 0.3623, + "step": 8225 + }, + { + "epoch": 0.6516934046345811, + "grad_norm": 1.2051490578485706, + "learning_rate": 5.715571161531433e-06, + "loss": 0.1816, + "step": 8226 + }, + { + "epoch": 0.6517726282432165, + "grad_norm": 1.5428819965220586, + "learning_rate": 5.7132527162266194e-06, + "loss": 0.282, + "step": 8227 + }, + { + "epoch": 0.6518518518518519, + "grad_norm": 1.3229346718964534, + "learning_rate": 5.710934553183484e-06, + "loss": 0.2727, + "step": 8228 + }, + { + "epoch": 0.6519310754604872, + "grad_norm": 1.3644065855357472, + "learning_rate": 5.708616672554675e-06, + "loss": 0.2808, + "step": 8229 + }, + { + "epoch": 0.6520102990691226, + "grad_norm": 1.3604704436586774, + "learning_rate": 5.7062990744928086e-06, + "loss": 0.2885, + "step": 8230 + }, + { + "epoch": 0.652089522677758, + "grad_norm": 1.437622917063151, + "learning_rate": 5.703981759150483e-06, + "loss": 0.2306, + "step": 8231 + }, + { + "epoch": 0.6521687462863933, + "grad_norm": 1.2195711467253147, + "learning_rate": 5.701664726680294e-06, + "loss": 0.1796, + "step": 8232 + }, + { + "epoch": 0.6522479698950288, + "grad_norm": 1.3599793326001144, + "learning_rate": 5.699347977234799e-06, + "loss": 0.2303, + "step": 8233 + }, + { + "epoch": 0.6523271935036641, + "grad_norm": 1.1689218302714448, + "learning_rate": 5.697031510966542e-06, + "loss": 0.175, + "step": 8234 + }, + { + "epoch": 0.6524064171122995, + "grad_norm": 1.2743345105512172, + "learning_rate": 5.69471532802806e-06, + "loss": 0.2434, + "step": 8235 + }, + { + "epoch": 0.6524856407209348, + "grad_norm": 1.8280711414999011, + "learning_rate": 5.692399428571857e-06, + "loss": 0.2843, + "step": 8236 + }, + { + "epoch": 0.6525648643295702, + "grad_norm": 1.4282173217074114, + "learning_rate": 5.690083812750422e-06, + "loss": 0.2156, + "step": 8237 + }, + { + "epoch": 0.6526440879382056, + "grad_norm": 1.4012797356770035, + "learning_rate": 5.687768480716233e-06, + "loss": 0.2789, + "step": 8238 + }, + { + "epoch": 0.6527233115468409, + "grad_norm": 1.3066002093093614, + "learning_rate": 5.685453432621741e-06, + "loss": 0.2154, + "step": 8239 + }, + { + "epoch": 0.6528025351554764, + "grad_norm": 1.3782246275048002, + "learning_rate": 5.683138668619381e-06, + "loss": 0.2415, + "step": 8240 + }, + { + "epoch": 0.6528817587641117, + "grad_norm": 1.2795835626579697, + "learning_rate": 5.680824188861564e-06, + "loss": 0.1666, + "step": 8241 + }, + { + "epoch": 0.6529609823727471, + "grad_norm": 1.4334796424175882, + "learning_rate": 5.678509993500695e-06, + "loss": 0.1831, + "step": 8242 + }, + { + "epoch": 0.6530402059813825, + "grad_norm": 1.4289170971199112, + "learning_rate": 5.676196082689149e-06, + "loss": 0.2407, + "step": 8243 + }, + { + "epoch": 0.6531194295900178, + "grad_norm": 1.3171045771506669, + "learning_rate": 5.673882456579282e-06, + "loss": 0.2051, + "step": 8244 + }, + { + "epoch": 0.6531986531986532, + "grad_norm": 1.4045195840365614, + "learning_rate": 5.6715691153234445e-06, + "loss": 0.269, + "step": 8245 + }, + { + "epoch": 0.6532778768072885, + "grad_norm": 1.1058765581272658, + "learning_rate": 5.669256059073953e-06, + "loss": 0.1613, + "step": 8246 + }, + { + "epoch": 0.653357100415924, + "grad_norm": 1.4565447769680877, + "learning_rate": 5.666943287983106e-06, + "loss": 0.2709, + "step": 8247 + }, + { + "epoch": 0.6534363240245593, + "grad_norm": 1.9498937160185494, + "learning_rate": 5.664630802203201e-06, + "loss": 0.3357, + "step": 8248 + }, + { + "epoch": 0.6535155476331946, + "grad_norm": 1.5119685588075111, + "learning_rate": 5.662318601886496e-06, + "loss": 0.3351, + "step": 8249 + }, + { + "epoch": 0.6535947712418301, + "grad_norm": 1.5925238934924428, + "learning_rate": 5.660006687185235e-06, + "loss": 0.2522, + "step": 8250 + }, + { + "epoch": 0.6536739948504654, + "grad_norm": 1.3812899894915327, + "learning_rate": 5.657695058251656e-06, + "loss": 0.3437, + "step": 8251 + }, + { + "epoch": 0.6537532184591008, + "grad_norm": 1.4443810885064245, + "learning_rate": 5.655383715237963e-06, + "loss": 0.2827, + "step": 8252 + }, + { + "epoch": 0.6538324420677362, + "grad_norm": 1.2048313068771106, + "learning_rate": 5.653072658296344e-06, + "loss": 0.1971, + "step": 8253 + }, + { + "epoch": 0.6539116656763716, + "grad_norm": 1.4143378162351634, + "learning_rate": 5.650761887578977e-06, + "loss": 0.3018, + "step": 8254 + }, + { + "epoch": 0.6539908892850069, + "grad_norm": 1.4622225499434423, + "learning_rate": 5.648451403238013e-06, + "loss": 0.3305, + "step": 8255 + }, + { + "epoch": 0.6540701128936423, + "grad_norm": 1.3939297464312732, + "learning_rate": 5.646141205425586e-06, + "loss": 0.2254, + "step": 8256 + }, + { + "epoch": 0.6541493365022777, + "grad_norm": 1.3600801342035775, + "learning_rate": 5.643831294293808e-06, + "loss": 0.236, + "step": 8257 + }, + { + "epoch": 0.654228560110913, + "grad_norm": 1.2500107183974192, + "learning_rate": 5.641521669994782e-06, + "loss": 0.2517, + "step": 8258 + }, + { + "epoch": 0.6543077837195485, + "grad_norm": 1.5466957951513571, + "learning_rate": 5.639212332680581e-06, + "loss": 0.2372, + "step": 8259 + }, + { + "epoch": 0.6543870073281838, + "grad_norm": 1.4120252459455545, + "learning_rate": 5.636903282503263e-06, + "loss": 0.2378, + "step": 8260 + }, + { + "epoch": 0.6544662309368192, + "grad_norm": 1.5833833191874824, + "learning_rate": 5.6345945196148734e-06, + "loss": 0.3072, + "step": 8261 + }, + { + "epoch": 0.6545454545454545, + "grad_norm": 0.9104005117978915, + "learning_rate": 5.63228604416743e-06, + "loss": 0.1209, + "step": 8262 + }, + { + "epoch": 0.6546246781540899, + "grad_norm": 1.40885313207492, + "learning_rate": 5.62997785631293e-06, + "loss": 0.2315, + "step": 8263 + }, + { + "epoch": 0.6547039017627253, + "grad_norm": 1.2514514813918491, + "learning_rate": 5.627669956203365e-06, + "loss": 0.1825, + "step": 8264 + }, + { + "epoch": 0.6547831253713606, + "grad_norm": 1.591887297539188, + "learning_rate": 5.6253623439906955e-06, + "loss": 0.3877, + "step": 8265 + }, + { + "epoch": 0.6548623489799961, + "grad_norm": 1.3277121617300123, + "learning_rate": 5.623055019826862e-06, + "loss": 0.3057, + "step": 8266 + }, + { + "epoch": 0.6549415725886314, + "grad_norm": 1.5458358195134896, + "learning_rate": 5.6207479838637995e-06, + "loss": 0.2262, + "step": 8267 + }, + { + "epoch": 0.6550207961972668, + "grad_norm": 1.3775270832751523, + "learning_rate": 5.618441236253411e-06, + "loss": 0.2259, + "step": 8268 + }, + { + "epoch": 0.6551000198059022, + "grad_norm": 1.593305268993249, + "learning_rate": 5.616134777147578e-06, + "loss": 0.2855, + "step": 8269 + }, + { + "epoch": 0.6551792434145375, + "grad_norm": 1.4931850181518935, + "learning_rate": 5.6138286066981815e-06, + "loss": 0.2566, + "step": 8270 + }, + { + "epoch": 0.6552584670231729, + "grad_norm": 1.6341990950191314, + "learning_rate": 5.611522725057067e-06, + "loss": 0.2683, + "step": 8271 + }, + { + "epoch": 0.6553376906318082, + "grad_norm": 1.2327587732323917, + "learning_rate": 5.6092171323760635e-06, + "loss": 0.2829, + "step": 8272 + }, + { + "epoch": 0.6554169142404437, + "grad_norm": 1.143406831715001, + "learning_rate": 5.6069118288069824e-06, + "loss": 0.1866, + "step": 8273 + }, + { + "epoch": 0.655496137849079, + "grad_norm": 1.4306590530622671, + "learning_rate": 5.604606814501623e-06, + "loss": 0.2277, + "step": 8274 + }, + { + "epoch": 0.6555753614577144, + "grad_norm": 0.9951344606511768, + "learning_rate": 5.602302089611755e-06, + "loss": 0.1471, + "step": 8275 + }, + { + "epoch": 0.6556545850663498, + "grad_norm": 1.1532442807584862, + "learning_rate": 5.599997654289129e-06, + "loss": 0.2038, + "step": 8276 + }, + { + "epoch": 0.6557338086749851, + "grad_norm": 1.511912539074862, + "learning_rate": 5.5976935086854914e-06, + "loss": 0.3291, + "step": 8277 + }, + { + "epoch": 0.6558130322836205, + "grad_norm": 1.2813473369591573, + "learning_rate": 5.595389652952555e-06, + "loss": 0.2494, + "step": 8278 + }, + { + "epoch": 0.6558922558922559, + "grad_norm": 1.3428850586847336, + "learning_rate": 5.59308608724201e-06, + "loss": 0.2035, + "step": 8279 + }, + { + "epoch": 0.6559714795008913, + "grad_norm": 1.5236277624601837, + "learning_rate": 5.590782811705547e-06, + "loss": 0.2166, + "step": 8280 + }, + { + "epoch": 0.6560507031095266, + "grad_norm": 1.5965910841898305, + "learning_rate": 5.588479826494817e-06, + "loss": 0.2721, + "step": 8281 + }, + { + "epoch": 0.6561299267181621, + "grad_norm": 1.1829408889037407, + "learning_rate": 5.5861771317614624e-06, + "loss": 0.2173, + "step": 8282 + }, + { + "epoch": 0.6562091503267974, + "grad_norm": 1.2316774437155615, + "learning_rate": 5.583874727657109e-06, + "loss": 0.2144, + "step": 8283 + }, + { + "epoch": 0.6562883739354327, + "grad_norm": 1.1893141187767418, + "learning_rate": 5.581572614333356e-06, + "loss": 0.1748, + "step": 8284 + }, + { + "epoch": 0.6563675975440681, + "grad_norm": 1.2242846043315372, + "learning_rate": 5.579270791941787e-06, + "loss": 0.1655, + "step": 8285 + }, + { + "epoch": 0.6564468211527035, + "grad_norm": 1.709248591628549, + "learning_rate": 5.5769692606339584e-06, + "loss": 0.295, + "step": 8286 + }, + { + "epoch": 0.6565260447613389, + "grad_norm": 1.4099422681992102, + "learning_rate": 5.574668020561428e-06, + "loss": 0.2489, + "step": 8287 + }, + { + "epoch": 0.6566052683699742, + "grad_norm": 1.3778342992923602, + "learning_rate": 5.572367071875715e-06, + "loss": 0.2679, + "step": 8288 + }, + { + "epoch": 0.6566844919786097, + "grad_norm": 1.8174632950361576, + "learning_rate": 5.570066414728321e-06, + "loss": 0.3384, + "step": 8289 + }, + { + "epoch": 0.656763715587245, + "grad_norm": 1.0790279673517058, + "learning_rate": 5.567766049270742e-06, + "loss": 0.1812, + "step": 8290 + }, + { + "epoch": 0.6568429391958803, + "grad_norm": 1.1235633883750664, + "learning_rate": 5.5654659756544425e-06, + "loss": 0.2207, + "step": 8291 + }, + { + "epoch": 0.6569221628045158, + "grad_norm": 1.322570757406409, + "learning_rate": 5.563166194030868e-06, + "loss": 0.2504, + "step": 8292 + }, + { + "epoch": 0.6570013864131511, + "grad_norm": 1.9260048581567604, + "learning_rate": 5.560866704551454e-06, + "loss": 0.3836, + "step": 8293 + }, + { + "epoch": 0.6570806100217865, + "grad_norm": 1.1553126660387465, + "learning_rate": 5.5585675073676085e-06, + "loss": 0.1587, + "step": 8294 + }, + { + "epoch": 0.6571598336304219, + "grad_norm": 1.2434067681545933, + "learning_rate": 5.556268602630721e-06, + "loss": 0.2561, + "step": 8295 + }, + { + "epoch": 0.6572390572390573, + "grad_norm": 1.0898522372539647, + "learning_rate": 5.553969990492164e-06, + "loss": 0.2022, + "step": 8296 + }, + { + "epoch": 0.6573182808476926, + "grad_norm": 1.2522349290649752, + "learning_rate": 5.5516716711032906e-06, + "loss": 0.2396, + "step": 8297 + }, + { + "epoch": 0.6573975044563279, + "grad_norm": 1.4848079093922917, + "learning_rate": 5.54937364461543e-06, + "loss": 0.3321, + "step": 8298 + }, + { + "epoch": 0.6574767280649634, + "grad_norm": 1.4566944120284944, + "learning_rate": 5.547075911179902e-06, + "loss": 0.2531, + "step": 8299 + }, + { + "epoch": 0.6575559516735987, + "grad_norm": 1.271216319498662, + "learning_rate": 5.544778470948001e-06, + "loss": 0.2168, + "step": 8300 + }, + { + "epoch": 0.6576351752822341, + "grad_norm": 1.6472376808299591, + "learning_rate": 5.542481324070996e-06, + "loss": 0.3467, + "step": 8301 + }, + { + "epoch": 0.6577143988908695, + "grad_norm": 1.2443022079359343, + "learning_rate": 5.540184470700152e-06, + "loss": 0.218, + "step": 8302 + }, + { + "epoch": 0.6577936224995049, + "grad_norm": 1.3790539371654038, + "learning_rate": 5.537887910986701e-06, + "loss": 0.2933, + "step": 8303 + }, + { + "epoch": 0.6578728461081402, + "grad_norm": 1.576153855481717, + "learning_rate": 5.535591645081857e-06, + "loss": 0.3258, + "step": 8304 + }, + { + "epoch": 0.6579520697167756, + "grad_norm": 1.2823367458262571, + "learning_rate": 5.5332956731368245e-06, + "loss": 0.2333, + "step": 8305 + }, + { + "epoch": 0.658031293325411, + "grad_norm": 1.2228559518003623, + "learning_rate": 5.530999995302781e-06, + "loss": 0.2224, + "step": 8306 + }, + { + "epoch": 0.6581105169340463, + "grad_norm": 1.1215705590993594, + "learning_rate": 5.528704611730879e-06, + "loss": 0.1965, + "step": 8307 + }, + { + "epoch": 0.6581897405426818, + "grad_norm": 1.2812794941391847, + "learning_rate": 5.5264095225722705e-06, + "loss": 0.2487, + "step": 8308 + }, + { + "epoch": 0.6582689641513171, + "grad_norm": 1.2293533067062077, + "learning_rate": 5.524114727978067e-06, + "loss": 0.2174, + "step": 8309 + }, + { + "epoch": 0.6583481877599525, + "grad_norm": 0.9242983580092674, + "learning_rate": 5.5218202280993725e-06, + "loss": 0.1394, + "step": 8310 + }, + { + "epoch": 0.6584274113685878, + "grad_norm": 1.2580389178541742, + "learning_rate": 5.519526023087265e-06, + "loss": 0.1566, + "step": 8311 + }, + { + "epoch": 0.6585066349772232, + "grad_norm": 1.221147943462261, + "learning_rate": 5.517232113092814e-06, + "loss": 0.2364, + "step": 8312 + }, + { + "epoch": 0.6585858585858586, + "grad_norm": 1.4221653778866195, + "learning_rate": 5.5149384982670585e-06, + "loss": 0.2911, + "step": 8313 + }, + { + "epoch": 0.6586650821944939, + "grad_norm": 1.2091627067742483, + "learning_rate": 5.512645178761018e-06, + "loss": 0.21, + "step": 8314 + }, + { + "epoch": 0.6587443058031294, + "grad_norm": 1.5999950611040303, + "learning_rate": 5.5103521547257045e-06, + "loss": 0.256, + "step": 8315 + }, + { + "epoch": 0.6588235294117647, + "grad_norm": 1.206254322324537, + "learning_rate": 5.508059426312099e-06, + "loss": 0.238, + "step": 8316 + }, + { + "epoch": 0.6589027530204001, + "grad_norm": 1.2901609319620173, + "learning_rate": 5.5057669936711625e-06, + "loss": 0.1795, + "step": 8317 + }, + { + "epoch": 0.6589819766290355, + "grad_norm": 1.3063836660918773, + "learning_rate": 5.503474856953849e-06, + "loss": 0.2549, + "step": 8318 + }, + { + "epoch": 0.6590612002376708, + "grad_norm": 1.2839605847742221, + "learning_rate": 5.50118301631108e-06, + "loss": 0.2291, + "step": 8319 + }, + { + "epoch": 0.6591404238463062, + "grad_norm": 1.4042232323395845, + "learning_rate": 5.498891471893758e-06, + "loss": 0.2637, + "step": 8320 + }, + { + "epoch": 0.6592196474549415, + "grad_norm": 1.3900295040393047, + "learning_rate": 5.49660022385278e-06, + "loss": 0.2781, + "step": 8321 + }, + { + "epoch": 0.659298871063577, + "grad_norm": 1.4135468750059055, + "learning_rate": 5.494309272339007e-06, + "loss": 0.2366, + "step": 8322 + }, + { + "epoch": 0.6593780946722123, + "grad_norm": 0.9572690044823501, + "learning_rate": 5.492018617503284e-06, + "loss": 0.1343, + "step": 8323 + }, + { + "epoch": 0.6594573182808476, + "grad_norm": 1.1413248322814105, + "learning_rate": 5.48972825949645e-06, + "loss": 0.2179, + "step": 8324 + }, + { + "epoch": 0.6595365418894831, + "grad_norm": 1.486728618626868, + "learning_rate": 5.487438198469306e-06, + "loss": 0.3243, + "step": 8325 + }, + { + "epoch": 0.6596157654981184, + "grad_norm": 1.5748786044733105, + "learning_rate": 5.485148434572645e-06, + "loss": 0.3075, + "step": 8326 + }, + { + "epoch": 0.6596949891067538, + "grad_norm": 1.579162638486017, + "learning_rate": 5.48285896795723e-06, + "loss": 0.4096, + "step": 8327 + }, + { + "epoch": 0.6597742127153892, + "grad_norm": 1.6213997059223582, + "learning_rate": 5.480569798773822e-06, + "loss": 0.308, + "step": 8328 + }, + { + "epoch": 0.6598534363240246, + "grad_norm": 1.264210585124539, + "learning_rate": 5.478280927173145e-06, + "loss": 0.2154, + "step": 8329 + }, + { + "epoch": 0.6599326599326599, + "grad_norm": 1.2948944287049189, + "learning_rate": 5.4759923533059105e-06, + "loss": 0.3472, + "step": 8330 + }, + { + "epoch": 0.6600118835412953, + "grad_norm": 1.326143600656642, + "learning_rate": 5.473704077322814e-06, + "loss": 0.2294, + "step": 8331 + }, + { + "epoch": 0.6600911071499307, + "grad_norm": 1.2643029076313272, + "learning_rate": 5.471416099374525e-06, + "loss": 0.2127, + "step": 8332 + }, + { + "epoch": 0.660170330758566, + "grad_norm": 1.396813226002153, + "learning_rate": 5.469128419611691e-06, + "loss": 0.2167, + "step": 8333 + }, + { + "epoch": 0.6602495543672015, + "grad_norm": 1.3223163825626678, + "learning_rate": 5.466841038184954e-06, + "loss": 0.1849, + "step": 8334 + }, + { + "epoch": 0.6603287779758368, + "grad_norm": 1.1877953739738731, + "learning_rate": 5.464553955244922e-06, + "loss": 0.197, + "step": 8335 + }, + { + "epoch": 0.6604080015844722, + "grad_norm": 1.5085980914038413, + "learning_rate": 5.4622671709421856e-06, + "loss": 0.2754, + "step": 8336 + }, + { + "epoch": 0.6604872251931075, + "grad_norm": 1.3688318101684382, + "learning_rate": 5.459980685427326e-06, + "loss": 0.2589, + "step": 8337 + }, + { + "epoch": 0.6605664488017429, + "grad_norm": 1.2919678212384955, + "learning_rate": 5.457694498850892e-06, + "loss": 0.2215, + "step": 8338 + }, + { + "epoch": 0.6606456724103783, + "grad_norm": 1.3112073508953472, + "learning_rate": 5.455408611363416e-06, + "loss": 0.2623, + "step": 8339 + }, + { + "epoch": 0.6607248960190136, + "grad_norm": 1.2517730206784308, + "learning_rate": 5.45312302311542e-06, + "loss": 0.237, + "step": 8340 + }, + { + "epoch": 0.6608041196276491, + "grad_norm": 1.400009166504189, + "learning_rate": 5.450837734257395e-06, + "loss": 0.2322, + "step": 8341 + }, + { + "epoch": 0.6608833432362844, + "grad_norm": 1.0102956686638216, + "learning_rate": 5.448552744939815e-06, + "loss": 0.2114, + "step": 8342 + }, + { + "epoch": 0.6609625668449198, + "grad_norm": 1.1463414346435077, + "learning_rate": 5.446268055313132e-06, + "loss": 0.2032, + "step": 8343 + }, + { + "epoch": 0.6610417904535552, + "grad_norm": 1.2156987153703278, + "learning_rate": 5.443983665527792e-06, + "loss": 0.2609, + "step": 8344 + }, + { + "epoch": 0.6611210140621905, + "grad_norm": 1.260749758893954, + "learning_rate": 5.441699575734204e-06, + "loss": 0.2674, + "step": 8345 + }, + { + "epoch": 0.6612002376708259, + "grad_norm": 1.2613301971594093, + "learning_rate": 5.439415786082762e-06, + "loss": 0.2166, + "step": 8346 + }, + { + "epoch": 0.6612794612794612, + "grad_norm": 1.3285489438114124, + "learning_rate": 5.437132296723852e-06, + "loss": 0.1998, + "step": 8347 + }, + { + "epoch": 0.6613586848880967, + "grad_norm": 1.1987701287684924, + "learning_rate": 5.434849107807823e-06, + "loss": 0.1647, + "step": 8348 + }, + { + "epoch": 0.661437908496732, + "grad_norm": 1.3031800953705288, + "learning_rate": 5.432566219485012e-06, + "loss": 0.2656, + "step": 8349 + }, + { + "epoch": 0.6615171321053674, + "grad_norm": 1.2078192042504405, + "learning_rate": 5.430283631905742e-06, + "loss": 0.2589, + "step": 8350 + }, + { + "epoch": 0.6615963557140028, + "grad_norm": 1.1896796883324154, + "learning_rate": 5.428001345220306e-06, + "loss": 0.1696, + "step": 8351 + }, + { + "epoch": 0.6616755793226381, + "grad_norm": 1.1528873650765146, + "learning_rate": 5.425719359578978e-06, + "loss": 0.2196, + "step": 8352 + }, + { + "epoch": 0.6617548029312735, + "grad_norm": 1.5532941954778245, + "learning_rate": 5.423437675132025e-06, + "loss": 0.346, + "step": 8353 + }, + { + "epoch": 0.6618340265399089, + "grad_norm": 1.3119490666397386, + "learning_rate": 5.42115629202968e-06, + "loss": 0.2696, + "step": 8354 + }, + { + "epoch": 0.6619132501485443, + "grad_norm": 1.235671770418397, + "learning_rate": 5.4188752104221565e-06, + "loss": 0.2668, + "step": 8355 + }, + { + "epoch": 0.6619924737571796, + "grad_norm": 1.2449852544970257, + "learning_rate": 5.416594430459663e-06, + "loss": 0.2882, + "step": 8356 + }, + { + "epoch": 0.6620716973658151, + "grad_norm": 1.4161869570843637, + "learning_rate": 5.41431395229237e-06, + "loss": 0.3182, + "step": 8357 + }, + { + "epoch": 0.6621509209744504, + "grad_norm": 1.2546617566662572, + "learning_rate": 5.41203377607044e-06, + "loss": 0.2123, + "step": 8358 + }, + { + "epoch": 0.6622301445830857, + "grad_norm": 1.3723082212726574, + "learning_rate": 5.409753901944006e-06, + "loss": 0.2785, + "step": 8359 + }, + { + "epoch": 0.6623093681917211, + "grad_norm": 1.4318874831162889, + "learning_rate": 5.407474330063194e-06, + "loss": 0.2453, + "step": 8360 + }, + { + "epoch": 0.6623885918003565, + "grad_norm": 1.368128744646739, + "learning_rate": 5.4051950605781e-06, + "loss": 0.2587, + "step": 8361 + }, + { + "epoch": 0.6624678154089919, + "grad_norm": 1.4443501090486048, + "learning_rate": 5.402916093638798e-06, + "loss": 0.2618, + "step": 8362 + }, + { + "epoch": 0.6625470390176272, + "grad_norm": 1.4678922342274117, + "learning_rate": 5.400637429395357e-06, + "loss": 0.2644, + "step": 8363 + }, + { + "epoch": 0.6626262626262627, + "grad_norm": 1.366951558813349, + "learning_rate": 5.398359067997808e-06, + "loss": 0.2818, + "step": 8364 + }, + { + "epoch": 0.662705486234898, + "grad_norm": 1.3309913539033384, + "learning_rate": 5.3960810095961705e-06, + "loss": 0.26, + "step": 8365 + }, + { + "epoch": 0.6627847098435333, + "grad_norm": 1.4347339554316367, + "learning_rate": 5.39380325434045e-06, + "loss": 0.2139, + "step": 8366 + }, + { + "epoch": 0.6628639334521688, + "grad_norm": 1.3133136544607424, + "learning_rate": 5.3915258023806195e-06, + "loss": 0.284, + "step": 8367 + }, + { + "epoch": 0.6629431570608041, + "grad_norm": 1.471385836672491, + "learning_rate": 5.3892486538666386e-06, + "loss": 0.1892, + "step": 8368 + }, + { + "epoch": 0.6630223806694395, + "grad_norm": 1.4931572519025, + "learning_rate": 5.386971808948451e-06, + "loss": 0.2538, + "step": 8369 + }, + { + "epoch": 0.6631016042780749, + "grad_norm": 2.995854586788855, + "learning_rate": 5.384695267775975e-06, + "loss": 0.319, + "step": 8370 + }, + { + "epoch": 0.6631808278867103, + "grad_norm": 1.7268331743191982, + "learning_rate": 5.382419030499107e-06, + "loss": 0.3081, + "step": 8371 + }, + { + "epoch": 0.6632600514953456, + "grad_norm": 1.2224559884035995, + "learning_rate": 5.380143097267723e-06, + "loss": 0.2135, + "step": 8372 + }, + { + "epoch": 0.6633392751039809, + "grad_norm": 1.4065575245339281, + "learning_rate": 5.377867468231695e-06, + "loss": 0.2067, + "step": 8373 + }, + { + "epoch": 0.6634184987126164, + "grad_norm": 1.1803841719814607, + "learning_rate": 5.3755921435408464e-06, + "loss": 0.1844, + "step": 8374 + }, + { + "epoch": 0.6634977223212517, + "grad_norm": 1.2172472524865847, + "learning_rate": 5.373317123345008e-06, + "loss": 0.2221, + "step": 8375 + }, + { + "epoch": 0.6635769459298871, + "grad_norm": 1.2480401821511662, + "learning_rate": 5.371042407793974e-06, + "loss": 0.1877, + "step": 8376 + }, + { + "epoch": 0.6636561695385225, + "grad_norm": 1.4717036508368897, + "learning_rate": 5.368767997037521e-06, + "loss": 0.2942, + "step": 8377 + }, + { + "epoch": 0.6637353931471579, + "grad_norm": 1.0134401571970157, + "learning_rate": 5.366493891225415e-06, + "loss": 0.1901, + "step": 8378 + }, + { + "epoch": 0.6638146167557932, + "grad_norm": 1.1112482262365826, + "learning_rate": 5.3642200905073914e-06, + "loss": 0.2169, + "step": 8379 + }, + { + "epoch": 0.6638938403644286, + "grad_norm": 1.1695907281471865, + "learning_rate": 5.361946595033165e-06, + "loss": 0.2204, + "step": 8380 + }, + { + "epoch": 0.663973063973064, + "grad_norm": 1.3739125590736159, + "learning_rate": 5.359673404952442e-06, + "loss": 0.2422, + "step": 8381 + }, + { + "epoch": 0.6640522875816993, + "grad_norm": 1.195805585087894, + "learning_rate": 5.357400520414898e-06, + "loss": 0.2693, + "step": 8382 + }, + { + "epoch": 0.6641315111903348, + "grad_norm": 1.2661165772777006, + "learning_rate": 5.355127941570191e-06, + "loss": 0.244, + "step": 8383 + }, + { + "epoch": 0.6642107347989701, + "grad_norm": 0.9370240991230251, + "learning_rate": 5.352855668567956e-06, + "loss": 0.1232, + "step": 8384 + }, + { + "epoch": 0.6642899584076055, + "grad_norm": 1.6172551702759002, + "learning_rate": 5.350583701557816e-06, + "loss": 0.2761, + "step": 8385 + }, + { + "epoch": 0.6643691820162408, + "grad_norm": 1.293160169426191, + "learning_rate": 5.348312040689369e-06, + "loss": 0.1908, + "step": 8386 + }, + { + "epoch": 0.6644484056248762, + "grad_norm": 1.3561173474545039, + "learning_rate": 5.346040686112189e-06, + "loss": 0.2291, + "step": 8387 + }, + { + "epoch": 0.6645276292335116, + "grad_norm": 1.2176060804054414, + "learning_rate": 5.34376963797584e-06, + "loss": 0.2455, + "step": 8388 + }, + { + "epoch": 0.6646068528421469, + "grad_norm": 1.3198655701131181, + "learning_rate": 5.3414988964298555e-06, + "loss": 0.2862, + "step": 8389 + }, + { + "epoch": 0.6646860764507824, + "grad_norm": 1.1604702791360113, + "learning_rate": 5.3392284616237486e-06, + "loss": 0.204, + "step": 8390 + }, + { + "epoch": 0.6647653000594177, + "grad_norm": 1.2216233457999, + "learning_rate": 5.336958333707026e-06, + "loss": 0.1788, + "step": 8391 + }, + { + "epoch": 0.6648445236680531, + "grad_norm": 1.6633590522851645, + "learning_rate": 5.33468851282916e-06, + "loss": 0.2555, + "step": 8392 + }, + { + "epoch": 0.6649237472766885, + "grad_norm": 1.616510431791573, + "learning_rate": 5.332418999139604e-06, + "loss": 0.2635, + "step": 8393 + }, + { + "epoch": 0.6650029708853238, + "grad_norm": 1.3381350250591473, + "learning_rate": 5.330149792787801e-06, + "loss": 0.2218, + "step": 8394 + }, + { + "epoch": 0.6650821944939592, + "grad_norm": 1.9189265524650396, + "learning_rate": 5.3278808939231654e-06, + "loss": 0.3083, + "step": 8395 + }, + { + "epoch": 0.6651614181025945, + "grad_norm": 1.6407692076974278, + "learning_rate": 5.32561230269509e-06, + "loss": 0.2311, + "step": 8396 + }, + { + "epoch": 0.66524064171123, + "grad_norm": 1.4622716419975412, + "learning_rate": 5.32334401925295e-06, + "loss": 0.2655, + "step": 8397 + }, + { + "epoch": 0.6653198653198653, + "grad_norm": 1.410505805108147, + "learning_rate": 5.321076043746108e-06, + "loss": 0.3128, + "step": 8398 + }, + { + "epoch": 0.6653990889285007, + "grad_norm": 1.3380589622612908, + "learning_rate": 5.318808376323895e-06, + "loss": 0.3093, + "step": 8399 + }, + { + "epoch": 0.6654783125371361, + "grad_norm": 1.4032477156111962, + "learning_rate": 5.316541017135622e-06, + "loss": 0.2241, + "step": 8400 + }, + { + "epoch": 0.6655575361457714, + "grad_norm": 1.441332382555078, + "learning_rate": 5.314273966330591e-06, + "loss": 0.2432, + "step": 8401 + }, + { + "epoch": 0.6656367597544068, + "grad_norm": 1.645489629587223, + "learning_rate": 5.3120072240580735e-06, + "loss": 0.3441, + "step": 8402 + }, + { + "epoch": 0.6657159833630422, + "grad_norm": 1.2124897845237796, + "learning_rate": 5.309740790467319e-06, + "loss": 0.2366, + "step": 8403 + }, + { + "epoch": 0.6657952069716776, + "grad_norm": 1.4168855422162083, + "learning_rate": 5.307474665707569e-06, + "loss": 0.2054, + "step": 8404 + }, + { + "epoch": 0.6658744305803129, + "grad_norm": 1.2560444462070148, + "learning_rate": 5.305208849928034e-06, + "loss": 0.1564, + "step": 8405 + }, + { + "epoch": 0.6659536541889483, + "grad_norm": 1.5681134696838033, + "learning_rate": 5.302943343277902e-06, + "loss": 0.2732, + "step": 8406 + }, + { + "epoch": 0.6660328777975837, + "grad_norm": 1.7513691102008173, + "learning_rate": 5.300678145906354e-06, + "loss": 0.3366, + "step": 8407 + }, + { + "epoch": 0.666112101406219, + "grad_norm": 1.8098170248833743, + "learning_rate": 5.298413257962538e-06, + "loss": 0.2996, + "step": 8408 + }, + { + "epoch": 0.6661913250148545, + "grad_norm": 1.2707327230635488, + "learning_rate": 5.296148679595583e-06, + "loss": 0.2315, + "step": 8409 + }, + { + "epoch": 0.6662705486234898, + "grad_norm": 1.2813082736197279, + "learning_rate": 5.293884410954608e-06, + "loss": 0.2243, + "step": 8410 + }, + { + "epoch": 0.6663497722321252, + "grad_norm": 1.14942739380983, + "learning_rate": 5.291620452188699e-06, + "loss": 0.1942, + "step": 8411 + }, + { + "epoch": 0.6664289958407605, + "grad_norm": 1.168954136334741, + "learning_rate": 5.28935680344693e-06, + "loss": 0.166, + "step": 8412 + }, + { + "epoch": 0.6665082194493959, + "grad_norm": 1.326096717153951, + "learning_rate": 5.287093464878343e-06, + "loss": 0.2357, + "step": 8413 + }, + { + "epoch": 0.6665874430580313, + "grad_norm": 1.4431652099713854, + "learning_rate": 5.28483043663198e-06, + "loss": 0.2854, + "step": 8414 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.2459451609829055, + "learning_rate": 5.282567718856845e-06, + "loss": 0.2501, + "step": 8415 + }, + { + "epoch": 0.6667458902753021, + "grad_norm": 1.822981619676857, + "learning_rate": 5.280305311701921e-06, + "loss": 0.3417, + "step": 8416 + }, + { + "epoch": 0.6668251138839374, + "grad_norm": 1.520409681572383, + "learning_rate": 5.278043215316189e-06, + "loss": 0.2515, + "step": 8417 + }, + { + "epoch": 0.6669043374925728, + "grad_norm": 1.52910510635187, + "learning_rate": 5.275781429848589e-06, + "loss": 0.3594, + "step": 8418 + }, + { + "epoch": 0.6669835611012082, + "grad_norm": 1.274697958119952, + "learning_rate": 5.273519955448047e-06, + "loss": 0.1938, + "step": 8419 + }, + { + "epoch": 0.6670627847098435, + "grad_norm": 1.3571398947702835, + "learning_rate": 5.271258792263476e-06, + "loss": 0.1616, + "step": 8420 + }, + { + "epoch": 0.6671420083184789, + "grad_norm": 1.1099998093198988, + "learning_rate": 5.268997940443762e-06, + "loss": 0.2173, + "step": 8421 + }, + { + "epoch": 0.6672212319271142, + "grad_norm": 1.3393583314098787, + "learning_rate": 5.266737400137765e-06, + "loss": 0.2252, + "step": 8422 + }, + { + "epoch": 0.6673004555357497, + "grad_norm": 1.4986042238431063, + "learning_rate": 5.26447717149434e-06, + "loss": 0.3519, + "step": 8423 + }, + { + "epoch": 0.667379679144385, + "grad_norm": 1.515978760049668, + "learning_rate": 5.2622172546623055e-06, + "loss": 0.3013, + "step": 8424 + }, + { + "epoch": 0.6674589027530204, + "grad_norm": 1.1733334031683973, + "learning_rate": 5.259957649790466e-06, + "loss": 0.2071, + "step": 8425 + }, + { + "epoch": 0.6675381263616558, + "grad_norm": 1.4020459253389366, + "learning_rate": 5.257698357027609e-06, + "loss": 0.248, + "step": 8426 + }, + { + "epoch": 0.6676173499702911, + "grad_norm": 1.405320425980236, + "learning_rate": 5.2554393765225e-06, + "loss": 0.2485, + "step": 8427 + }, + { + "epoch": 0.6676965735789265, + "grad_norm": 1.1675120456773727, + "learning_rate": 5.253180708423877e-06, + "loss": 0.2084, + "step": 8428 + }, + { + "epoch": 0.6677757971875619, + "grad_norm": 1.6621527545587165, + "learning_rate": 5.25092235288046e-06, + "loss": 0.3226, + "step": 8429 + }, + { + "epoch": 0.6678550207961973, + "grad_norm": 1.833572965974962, + "learning_rate": 5.248664310040958e-06, + "loss": 0.3796, + "step": 8430 + }, + { + "epoch": 0.6679342444048326, + "grad_norm": 1.5756434244848083, + "learning_rate": 5.246406580054051e-06, + "loss": 0.2665, + "step": 8431 + }, + { + "epoch": 0.6680134680134681, + "grad_norm": 1.0922043325905952, + "learning_rate": 5.244149163068394e-06, + "loss": 0.2668, + "step": 8432 + }, + { + "epoch": 0.6680926916221034, + "grad_norm": 1.5790725364950091, + "learning_rate": 5.241892059232634e-06, + "loss": 0.3399, + "step": 8433 + }, + { + "epoch": 0.6681719152307387, + "grad_norm": 1.2691597457055435, + "learning_rate": 5.239635268695386e-06, + "loss": 0.3114, + "step": 8434 + }, + { + "epoch": 0.6682511388393741, + "grad_norm": 1.124685674957571, + "learning_rate": 5.237378791605249e-06, + "loss": 0.1443, + "step": 8435 + }, + { + "epoch": 0.6683303624480095, + "grad_norm": 1.4088100322452433, + "learning_rate": 5.235122628110805e-06, + "loss": 0.2443, + "step": 8436 + }, + { + "epoch": 0.6684095860566449, + "grad_norm": 1.506275338203759, + "learning_rate": 5.232866778360608e-06, + "loss": 0.225, + "step": 8437 + }, + { + "epoch": 0.6684888096652802, + "grad_norm": 1.2302394320978156, + "learning_rate": 5.230611242503193e-06, + "loss": 0.2566, + "step": 8438 + }, + { + "epoch": 0.6685680332739157, + "grad_norm": 1.3367305292439307, + "learning_rate": 5.228356020687082e-06, + "loss": 0.1728, + "step": 8439 + }, + { + "epoch": 0.668647256882551, + "grad_norm": 1.570880041465479, + "learning_rate": 5.226101113060769e-06, + "loss": 0.4568, + "step": 8440 + }, + { + "epoch": 0.6687264804911863, + "grad_norm": 1.2146470855416167, + "learning_rate": 5.223846519772722e-06, + "loss": 0.1659, + "step": 8441 + }, + { + "epoch": 0.6688057040998218, + "grad_norm": 0.9793511826724763, + "learning_rate": 5.221592240971403e-06, + "loss": 0.1179, + "step": 8442 + }, + { + "epoch": 0.6688849277084571, + "grad_norm": 1.311119656394066, + "learning_rate": 5.219338276805243e-06, + "loss": 0.2936, + "step": 8443 + }, + { + "epoch": 0.6689641513170925, + "grad_norm": 1.264479814063757, + "learning_rate": 5.217084627422656e-06, + "loss": 0.2176, + "step": 8444 + }, + { + "epoch": 0.6690433749257279, + "grad_norm": 1.559064233739909, + "learning_rate": 5.214831292972027e-06, + "loss": 0.3054, + "step": 8445 + }, + { + "epoch": 0.6691225985343633, + "grad_norm": 1.469417472785673, + "learning_rate": 5.212578273601738e-06, + "loss": 0.2489, + "step": 8446 + }, + { + "epoch": 0.6692018221429986, + "grad_norm": 1.4144467094226918, + "learning_rate": 5.210325569460133e-06, + "loss": 0.2049, + "step": 8447 + }, + { + "epoch": 0.6692810457516339, + "grad_norm": 1.7113882056992418, + "learning_rate": 5.208073180695538e-06, + "loss": 0.283, + "step": 8448 + }, + { + "epoch": 0.6693602693602694, + "grad_norm": 1.4925757654922105, + "learning_rate": 5.205821107456273e-06, + "loss": 0.2645, + "step": 8449 + }, + { + "epoch": 0.6694394929689047, + "grad_norm": 1.3284292865441338, + "learning_rate": 5.203569349890618e-06, + "loss": 0.2357, + "step": 8450 + }, + { + "epoch": 0.6695187165775401, + "grad_norm": 1.490567371265332, + "learning_rate": 5.201317908146843e-06, + "loss": 0.2021, + "step": 8451 + }, + { + "epoch": 0.6695979401861755, + "grad_norm": 1.139763631216647, + "learning_rate": 5.199066782373194e-06, + "loss": 0.1641, + "step": 8452 + }, + { + "epoch": 0.6696771637948109, + "grad_norm": 1.4704643530286972, + "learning_rate": 5.196815972717897e-06, + "loss": 0.2812, + "step": 8453 + }, + { + "epoch": 0.6697563874034462, + "grad_norm": 1.188518352415322, + "learning_rate": 5.194565479329154e-06, + "loss": 0.2318, + "step": 8454 + }, + { + "epoch": 0.6698356110120816, + "grad_norm": 1.157039522355783, + "learning_rate": 5.192315302355153e-06, + "loss": 0.1978, + "step": 8455 + }, + { + "epoch": 0.669914834620717, + "grad_norm": 1.6029673775771003, + "learning_rate": 5.190065441944059e-06, + "loss": 0.2648, + "step": 8456 + }, + { + "epoch": 0.6699940582293523, + "grad_norm": 1.4121598554047372, + "learning_rate": 5.187815898244006e-06, + "loss": 0.236, + "step": 8457 + }, + { + "epoch": 0.6700732818379878, + "grad_norm": 1.6087640823315905, + "learning_rate": 5.185566671403126e-06, + "loss": 0.3141, + "step": 8458 + }, + { + "epoch": 0.6701525054466231, + "grad_norm": 1.7917968674289064, + "learning_rate": 5.183317761569515e-06, + "loss": 0.3191, + "step": 8459 + }, + { + "epoch": 0.6702317290552585, + "grad_norm": 1.3066643526897765, + "learning_rate": 5.181069168891248e-06, + "loss": 0.2384, + "step": 8460 + }, + { + "epoch": 0.6703109526638938, + "grad_norm": 1.0940547286913052, + "learning_rate": 5.178820893516394e-06, + "loss": 0.1534, + "step": 8461 + }, + { + "epoch": 0.6703901762725292, + "grad_norm": 1.5425968513334931, + "learning_rate": 5.176572935592986e-06, + "loss": 0.2228, + "step": 8462 + }, + { + "epoch": 0.6704693998811646, + "grad_norm": 1.5085774417439075, + "learning_rate": 5.1743252952690385e-06, + "loss": 0.3214, + "step": 8463 + }, + { + "epoch": 0.6705486234897999, + "grad_norm": 1.2742431060320412, + "learning_rate": 5.172077972692553e-06, + "loss": 0.2724, + "step": 8464 + }, + { + "epoch": 0.6706278470984354, + "grad_norm": 1.3093260273582867, + "learning_rate": 5.1698309680115024e-06, + "loss": 0.2752, + "step": 8465 + }, + { + "epoch": 0.6707070707070707, + "grad_norm": 1.544112622530947, + "learning_rate": 5.167584281373838e-06, + "loss": 0.3363, + "step": 8466 + }, + { + "epoch": 0.6707862943157061, + "grad_norm": 0.858740691762967, + "learning_rate": 5.165337912927502e-06, + "loss": 0.1133, + "step": 8467 + }, + { + "epoch": 0.6708655179243415, + "grad_norm": 1.0593484135962687, + "learning_rate": 5.1630918628204e-06, + "loss": 0.1754, + "step": 8468 + }, + { + "epoch": 0.6709447415329768, + "grad_norm": 1.286337851740026, + "learning_rate": 5.1608461312004245e-06, + "loss": 0.221, + "step": 8469 + }, + { + "epoch": 0.6710239651416122, + "grad_norm": 1.1830134553335772, + "learning_rate": 5.158600718215443e-06, + "loss": 0.2016, + "step": 8470 + }, + { + "epoch": 0.6711031887502475, + "grad_norm": 1.3487162268666644, + "learning_rate": 5.156355624013314e-06, + "loss": 0.3195, + "step": 8471 + }, + { + "epoch": 0.671182412358883, + "grad_norm": 1.4888128495542008, + "learning_rate": 5.15411084874186e-06, + "loss": 0.3181, + "step": 8472 + }, + { + "epoch": 0.6712616359675183, + "grad_norm": 1.6446068645907317, + "learning_rate": 5.151866392548886e-06, + "loss": 0.2726, + "step": 8473 + }, + { + "epoch": 0.6713408595761537, + "grad_norm": 1.1877490702384044, + "learning_rate": 5.149622255582185e-06, + "loss": 0.2221, + "step": 8474 + }, + { + "epoch": 0.6714200831847891, + "grad_norm": 1.0990522472709088, + "learning_rate": 5.147378437989522e-06, + "loss": 0.1913, + "step": 8475 + }, + { + "epoch": 0.6714993067934244, + "grad_norm": 1.2740815935402738, + "learning_rate": 5.145134939918634e-06, + "loss": 0.1948, + "step": 8476 + }, + { + "epoch": 0.6715785304020598, + "grad_norm": 1.3417832980468165, + "learning_rate": 5.1428917615172555e-06, + "loss": 0.2577, + "step": 8477 + }, + { + "epoch": 0.6716577540106952, + "grad_norm": 1.4687130162282487, + "learning_rate": 5.140648902933083e-06, + "loss": 0.2008, + "step": 8478 + }, + { + "epoch": 0.6717369776193306, + "grad_norm": 1.121392047181105, + "learning_rate": 5.138406364313795e-06, + "loss": 0.1886, + "step": 8479 + }, + { + "epoch": 0.6718162012279659, + "grad_norm": 1.2229166373422677, + "learning_rate": 5.136164145807059e-06, + "loss": 0.2089, + "step": 8480 + }, + { + "epoch": 0.6718954248366014, + "grad_norm": 1.7232786237188593, + "learning_rate": 5.13392224756051e-06, + "loss": 0.3236, + "step": 8481 + }, + { + "epoch": 0.6719746484452367, + "grad_norm": 1.627451441971832, + "learning_rate": 5.131680669721768e-06, + "loss": 0.2498, + "step": 8482 + }, + { + "epoch": 0.672053872053872, + "grad_norm": 1.4868196604619808, + "learning_rate": 5.129439412438424e-06, + "loss": 0.296, + "step": 8483 + }, + { + "epoch": 0.6721330956625075, + "grad_norm": 2.1039698134866898, + "learning_rate": 5.127198475858064e-06, + "loss": 0.4189, + "step": 8484 + }, + { + "epoch": 0.6722123192711428, + "grad_norm": 1.3308319363700254, + "learning_rate": 5.124957860128237e-06, + "loss": 0.2114, + "step": 8485 + }, + { + "epoch": 0.6722915428797782, + "grad_norm": 1.3765890639853047, + "learning_rate": 5.122717565396474e-06, + "loss": 0.2948, + "step": 8486 + }, + { + "epoch": 0.6723707664884135, + "grad_norm": 1.0239417494022371, + "learning_rate": 5.1204775918102955e-06, + "loss": 0.1926, + "step": 8487 + }, + { + "epoch": 0.6724499900970489, + "grad_norm": 1.1905686610768917, + "learning_rate": 5.11823793951719e-06, + "loss": 0.1737, + "step": 8488 + }, + { + "epoch": 0.6725292137056843, + "grad_norm": 1.8023486808047542, + "learning_rate": 5.115998608664621e-06, + "loss": 0.348, + "step": 8489 + }, + { + "epoch": 0.6726084373143196, + "grad_norm": 0.9987481576320517, + "learning_rate": 5.1137595994000475e-06, + "loss": 0.158, + "step": 8490 + }, + { + "epoch": 0.6726876609229551, + "grad_norm": 1.444126904666399, + "learning_rate": 5.111520911870894e-06, + "loss": 0.2027, + "step": 8491 + }, + { + "epoch": 0.6727668845315904, + "grad_norm": 1.2909806308018084, + "learning_rate": 5.109282546224563e-06, + "loss": 0.2702, + "step": 8492 + }, + { + "epoch": 0.6728461081402258, + "grad_norm": 1.6389571173709172, + "learning_rate": 5.107044502608447e-06, + "loss": 0.3497, + "step": 8493 + }, + { + "epoch": 0.6729253317488612, + "grad_norm": 1.2757763688490689, + "learning_rate": 5.104806781169906e-06, + "loss": 0.2155, + "step": 8494 + }, + { + "epoch": 0.6730045553574965, + "grad_norm": 1.5524669891540126, + "learning_rate": 5.102569382056281e-06, + "loss": 0.2848, + "step": 8495 + }, + { + "epoch": 0.6730837789661319, + "grad_norm": 1.005628449136803, + "learning_rate": 5.100332305414902e-06, + "loss": 0.1861, + "step": 8496 + }, + { + "epoch": 0.6731630025747672, + "grad_norm": 1.1559717672010146, + "learning_rate": 5.098095551393066e-06, + "loss": 0.2316, + "step": 8497 + }, + { + "epoch": 0.6732422261834027, + "grad_norm": 1.4256732174835267, + "learning_rate": 5.095859120138049e-06, + "loss": 0.2419, + "step": 8498 + }, + { + "epoch": 0.673321449792038, + "grad_norm": 1.3173134431924085, + "learning_rate": 5.093623011797108e-06, + "loss": 0.1876, + "step": 8499 + }, + { + "epoch": 0.6734006734006734, + "grad_norm": 1.5911699341757466, + "learning_rate": 5.091387226517489e-06, + "loss": 0.2915, + "step": 8500 + }, + { + "epoch": 0.6734798970093088, + "grad_norm": 1.1139462187101312, + "learning_rate": 5.089151764446403e-06, + "loss": 0.1811, + "step": 8501 + }, + { + "epoch": 0.6735591206179441, + "grad_norm": 1.1686913602798101, + "learning_rate": 5.086916625731038e-06, + "loss": 0.1525, + "step": 8502 + }, + { + "epoch": 0.6736383442265795, + "grad_norm": 1.5299993645370331, + "learning_rate": 5.084681810518577e-06, + "loss": 0.2097, + "step": 8503 + }, + { + "epoch": 0.6737175678352149, + "grad_norm": 1.3585993864843418, + "learning_rate": 5.0824473189561695e-06, + "loss": 0.2968, + "step": 8504 + }, + { + "epoch": 0.6737967914438503, + "grad_norm": 1.27432862901369, + "learning_rate": 5.080213151190938e-06, + "loss": 0.2051, + "step": 8505 + }, + { + "epoch": 0.6738760150524856, + "grad_norm": 1.5630370579833923, + "learning_rate": 5.077979307370004e-06, + "loss": 0.2414, + "step": 8506 + }, + { + "epoch": 0.6739552386611211, + "grad_norm": 1.6661789343586069, + "learning_rate": 5.075745787640448e-06, + "loss": 0.3777, + "step": 8507 + }, + { + "epoch": 0.6740344622697564, + "grad_norm": 1.3845123026878738, + "learning_rate": 5.073512592149334e-06, + "loss": 0.249, + "step": 8508 + }, + { + "epoch": 0.6741136858783917, + "grad_norm": 1.174362597711559, + "learning_rate": 5.071279721043716e-06, + "loss": 0.2074, + "step": 8509 + }, + { + "epoch": 0.6741929094870271, + "grad_norm": 1.7705932258648158, + "learning_rate": 5.069047174470613e-06, + "loss": 0.2935, + "step": 8510 + }, + { + "epoch": 0.6742721330956625, + "grad_norm": 1.4229093657661485, + "learning_rate": 5.066814952577021e-06, + "loss": 0.307, + "step": 8511 + }, + { + "epoch": 0.6743513567042979, + "grad_norm": 1.6769480762142728, + "learning_rate": 5.064583055509935e-06, + "loss": 0.3272, + "step": 8512 + }, + { + "epoch": 0.6744305803129332, + "grad_norm": 1.3281396162767891, + "learning_rate": 5.062351483416304e-06, + "loss": 0.2595, + "step": 8513 + }, + { + "epoch": 0.6745098039215687, + "grad_norm": 1.201852454644897, + "learning_rate": 5.060120236443071e-06, + "loss": 0.2006, + "step": 8514 + }, + { + "epoch": 0.674589027530204, + "grad_norm": 1.2356393449355945, + "learning_rate": 5.057889314737148e-06, + "loss": 0.2186, + "step": 8515 + }, + { + "epoch": 0.6746682511388393, + "grad_norm": 1.3155436934862497, + "learning_rate": 5.055658718445435e-06, + "loss": 0.2524, + "step": 8516 + }, + { + "epoch": 0.6747474747474748, + "grad_norm": 1.7750986944987535, + "learning_rate": 5.053428447714806e-06, + "loss": 0.379, + "step": 8517 + }, + { + "epoch": 0.6748266983561101, + "grad_norm": 1.2559600000096462, + "learning_rate": 5.05119850269211e-06, + "loss": 0.2072, + "step": 8518 + }, + { + "epoch": 0.6749059219647455, + "grad_norm": 1.1358965470143332, + "learning_rate": 5.048968883524182e-06, + "loss": 0.1779, + "step": 8519 + }, + { + "epoch": 0.6749851455733809, + "grad_norm": 1.1901867645170272, + "learning_rate": 5.046739590357832e-06, + "loss": 0.2308, + "step": 8520 + }, + { + "epoch": 0.6750643691820163, + "grad_norm": 1.3546082273199074, + "learning_rate": 5.044510623339842e-06, + "loss": 0.2334, + "step": 8521 + }, + { + "epoch": 0.6751435927906516, + "grad_norm": 1.7713516653502106, + "learning_rate": 5.042281982616986e-06, + "loss": 0.3492, + "step": 8522 + }, + { + "epoch": 0.6752228163992869, + "grad_norm": 1.2446057612552865, + "learning_rate": 5.0400536683360064e-06, + "loss": 0.1683, + "step": 8523 + }, + { + "epoch": 0.6753020400079224, + "grad_norm": 1.3075030122277054, + "learning_rate": 5.037825680643624e-06, + "loss": 0.1911, + "step": 8524 + }, + { + "epoch": 0.6753812636165577, + "grad_norm": 1.4412739645827133, + "learning_rate": 5.035598019686549e-06, + "loss": 0.251, + "step": 8525 + }, + { + "epoch": 0.6754604872251931, + "grad_norm": 1.3244581510206603, + "learning_rate": 5.033370685611456e-06, + "loss": 0.2223, + "step": 8526 + }, + { + "epoch": 0.6755397108338285, + "grad_norm": 1.556723114017837, + "learning_rate": 5.031143678565005e-06, + "loss": 0.3142, + "step": 8527 + }, + { + "epoch": 0.6756189344424639, + "grad_norm": 1.4241638309888554, + "learning_rate": 5.028916998693831e-06, + "loss": 0.2512, + "step": 8528 + }, + { + "epoch": 0.6756981580510992, + "grad_norm": 1.2360192743296314, + "learning_rate": 5.02669064614456e-06, + "loss": 0.1853, + "step": 8529 + }, + { + "epoch": 0.6757773816597346, + "grad_norm": 1.5868442034821189, + "learning_rate": 5.024464621063773e-06, + "loss": 0.2948, + "step": 8530 + }, + { + "epoch": 0.67585660526837, + "grad_norm": 1.283333770338844, + "learning_rate": 5.022238923598055e-06, + "loss": 0.2354, + "step": 8531 + }, + { + "epoch": 0.6759358288770053, + "grad_norm": 1.2101424870751536, + "learning_rate": 5.020013553893952e-06, + "loss": 0.2214, + "step": 8532 + }, + { + "epoch": 0.6760150524856408, + "grad_norm": 1.4364586890311886, + "learning_rate": 5.017788512097989e-06, + "loss": 0.2765, + "step": 8533 + }, + { + "epoch": 0.6760942760942761, + "grad_norm": 1.4230165155876449, + "learning_rate": 5.015563798356684e-06, + "loss": 0.2996, + "step": 8534 + }, + { + "epoch": 0.6761734997029115, + "grad_norm": 1.2452297731025281, + "learning_rate": 5.0133394128165204e-06, + "loss": 0.1962, + "step": 8535 + }, + { + "epoch": 0.6762527233115468, + "grad_norm": 1.3670093631658324, + "learning_rate": 5.011115355623957e-06, + "loss": 0.2449, + "step": 8536 + }, + { + "epoch": 0.6763319469201822, + "grad_norm": 1.2923012252307138, + "learning_rate": 5.008891626925447e-06, + "loss": 0.245, + "step": 8537 + }, + { + "epoch": 0.6764111705288176, + "grad_norm": 1.1383179692423497, + "learning_rate": 5.006668226867407e-06, + "loss": 0.1846, + "step": 8538 + }, + { + "epoch": 0.6764903941374529, + "grad_norm": 1.3503419657400249, + "learning_rate": 5.004445155596238e-06, + "loss": 0.2412, + "step": 8539 + }, + { + "epoch": 0.6765696177460884, + "grad_norm": 1.0356937427480393, + "learning_rate": 5.0022224132583154e-06, + "loss": 0.1732, + "step": 8540 + }, + { + "epoch": 0.6766488413547237, + "grad_norm": 1.0524578307785075, + "learning_rate": 5.000000000000003e-06, + "loss": 0.1456, + "step": 8541 + }, + { + "epoch": 0.6767280649633591, + "grad_norm": 1.2741921087485921, + "learning_rate": 4.997777915967631e-06, + "loss": 0.2426, + "step": 8542 + }, + { + "epoch": 0.6768072885719945, + "grad_norm": 1.4206165948309513, + "learning_rate": 4.995556161307511e-06, + "loss": 0.3036, + "step": 8543 + }, + { + "epoch": 0.6768865121806298, + "grad_norm": 1.2446505785218223, + "learning_rate": 4.993334736165941e-06, + "loss": 0.2099, + "step": 8544 + }, + { + "epoch": 0.6769657357892652, + "grad_norm": 1.346416870547465, + "learning_rate": 4.991113640689189e-06, + "loss": 0.2301, + "step": 8545 + }, + { + "epoch": 0.6770449593979005, + "grad_norm": 1.4051032845380176, + "learning_rate": 4.988892875023499e-06, + "loss": 0.3313, + "step": 8546 + }, + { + "epoch": 0.677124183006536, + "grad_norm": 1.5692723343691821, + "learning_rate": 4.9866724393151044e-06, + "loss": 0.2838, + "step": 8547 + }, + { + "epoch": 0.6772034066151713, + "grad_norm": 1.081993396857605, + "learning_rate": 4.984452333710207e-06, + "loss": 0.1436, + "step": 8548 + }, + { + "epoch": 0.6772826302238067, + "grad_norm": 1.3878648851649706, + "learning_rate": 4.982232558354986e-06, + "loss": 0.2102, + "step": 8549 + }, + { + "epoch": 0.6773618538324421, + "grad_norm": 1.0320329047141141, + "learning_rate": 4.980013113395612e-06, + "loss": 0.1597, + "step": 8550 + }, + { + "epoch": 0.6774410774410774, + "grad_norm": 1.496138733532675, + "learning_rate": 4.9777939989782185e-06, + "loss": 0.3216, + "step": 8551 + }, + { + "epoch": 0.6775203010497128, + "grad_norm": 1.4446178105813183, + "learning_rate": 4.975575215248926e-06, + "loss": 0.2606, + "step": 8552 + }, + { + "epoch": 0.6775995246583482, + "grad_norm": 1.71587294719814, + "learning_rate": 4.9733567623538245e-06, + "loss": 0.3823, + "step": 8553 + }, + { + "epoch": 0.6776787482669836, + "grad_norm": 1.3290831786811594, + "learning_rate": 4.9711386404389995e-06, + "loss": 0.1807, + "step": 8554 + }, + { + "epoch": 0.6777579718756189, + "grad_norm": 1.8292531658663334, + "learning_rate": 4.968920849650496e-06, + "loss": 0.291, + "step": 8555 + }, + { + "epoch": 0.6778371954842544, + "grad_norm": 1.4476667885143453, + "learning_rate": 4.966703390134343e-06, + "loss": 0.2544, + "step": 8556 + }, + { + "epoch": 0.6779164190928897, + "grad_norm": 1.2217352397274268, + "learning_rate": 4.964486262036557e-06, + "loss": 0.2115, + "step": 8557 + }, + { + "epoch": 0.677995642701525, + "grad_norm": 1.2727769476316784, + "learning_rate": 4.962269465503121e-06, + "loss": 0.2694, + "step": 8558 + }, + { + "epoch": 0.6780748663101605, + "grad_norm": 1.3004398328897144, + "learning_rate": 4.960053000679997e-06, + "loss": 0.281, + "step": 8559 + }, + { + "epoch": 0.6781540899187958, + "grad_norm": 1.0766637861645403, + "learning_rate": 4.957836867713138e-06, + "loss": 0.2118, + "step": 8560 + }, + { + "epoch": 0.6782333135274312, + "grad_norm": 0.9833972780603426, + "learning_rate": 4.955621066748457e-06, + "loss": 0.1567, + "step": 8561 + }, + { + "epoch": 0.6783125371360665, + "grad_norm": 1.272943105731747, + "learning_rate": 4.953405597931854e-06, + "loss": 0.2609, + "step": 8562 + }, + { + "epoch": 0.6783917607447019, + "grad_norm": 1.34055412456491, + "learning_rate": 4.951190461409214e-06, + "loss": 0.3105, + "step": 8563 + }, + { + "epoch": 0.6784709843533373, + "grad_norm": 1.0597316522501141, + "learning_rate": 4.948975657326388e-06, + "loss": 0.1565, + "step": 8564 + }, + { + "epoch": 0.6785502079619726, + "grad_norm": 1.2370891530969725, + "learning_rate": 4.946761185829208e-06, + "loss": 0.2557, + "step": 8565 + }, + { + "epoch": 0.6786294315706081, + "grad_norm": 1.251515660269242, + "learning_rate": 4.944547047063493e-06, + "loss": 0.2265, + "step": 8566 + }, + { + "epoch": 0.6787086551792434, + "grad_norm": 1.267939094041213, + "learning_rate": 4.942333241175029e-06, + "loss": 0.2647, + "step": 8567 + }, + { + "epoch": 0.6787878787878788, + "grad_norm": 1.2703385903366424, + "learning_rate": 4.940119768309585e-06, + "loss": 0.3006, + "step": 8568 + }, + { + "epoch": 0.6788671023965142, + "grad_norm": 1.641252186872801, + "learning_rate": 4.937906628612905e-06, + "loss": 0.2762, + "step": 8569 + }, + { + "epoch": 0.6789463260051495, + "grad_norm": 1.1852139783363618, + "learning_rate": 4.93569382223072e-06, + "loss": 0.1936, + "step": 8570 + }, + { + "epoch": 0.6790255496137849, + "grad_norm": 1.1237934425917269, + "learning_rate": 4.933481349308728e-06, + "loss": 0.1646, + "step": 8571 + }, + { + "epoch": 0.6791047732224202, + "grad_norm": 1.2792892159284825, + "learning_rate": 4.931269209992607e-06, + "loss": 0.1964, + "step": 8572 + }, + { + "epoch": 0.6791839968310557, + "grad_norm": 1.3535169506301947, + "learning_rate": 4.929057404428023e-06, + "loss": 0.229, + "step": 8573 + }, + { + "epoch": 0.679263220439691, + "grad_norm": 1.270914990598429, + "learning_rate": 4.926845932760609e-06, + "loss": 0.236, + "step": 8574 + }, + { + "epoch": 0.6793424440483264, + "grad_norm": 1.4601992781719328, + "learning_rate": 4.924634795135976e-06, + "loss": 0.2838, + "step": 8575 + }, + { + "epoch": 0.6794216676569618, + "grad_norm": 1.0870324693730369, + "learning_rate": 4.922423991699725e-06, + "loss": 0.2215, + "step": 8576 + }, + { + "epoch": 0.6795008912655971, + "grad_norm": 1.2837534706134157, + "learning_rate": 4.920213522597422e-06, + "loss": 0.1865, + "step": 8577 + }, + { + "epoch": 0.6795801148742325, + "grad_norm": 1.5147009069352122, + "learning_rate": 4.918003387974614e-06, + "loss": 0.2342, + "step": 8578 + }, + { + "epoch": 0.6796593384828679, + "grad_norm": 1.4185544406192958, + "learning_rate": 4.915793587976832e-06, + "loss": 0.2612, + "step": 8579 + }, + { + "epoch": 0.6797385620915033, + "grad_norm": 1.230529519506329, + "learning_rate": 4.913584122749578e-06, + "loss": 0.217, + "step": 8580 + }, + { + "epoch": 0.6798177857001386, + "grad_norm": 1.2563515486373664, + "learning_rate": 4.911374992438334e-06, + "loss": 0.2081, + "step": 8581 + }, + { + "epoch": 0.6798970093087741, + "grad_norm": 1.278051742881388, + "learning_rate": 4.909166197188563e-06, + "loss": 0.2414, + "step": 8582 + }, + { + "epoch": 0.6799762329174094, + "grad_norm": 1.5299302325629525, + "learning_rate": 4.906957737145703e-06, + "loss": 0.2439, + "step": 8583 + }, + { + "epoch": 0.6800554565260447, + "grad_norm": 1.4409084299545565, + "learning_rate": 4.904749612455171e-06, + "loss": 0.2634, + "step": 8584 + }, + { + "epoch": 0.6801346801346801, + "grad_norm": 1.388945805481724, + "learning_rate": 4.902541823262356e-06, + "loss": 0.2138, + "step": 8585 + }, + { + "epoch": 0.6802139037433155, + "grad_norm": 1.4827560577832648, + "learning_rate": 4.900334369712637e-06, + "loss": 0.2761, + "step": 8586 + }, + { + "epoch": 0.6802931273519509, + "grad_norm": 1.4864774984070275, + "learning_rate": 4.898127251951363e-06, + "loss": 0.2993, + "step": 8587 + }, + { + "epoch": 0.6803723509605862, + "grad_norm": 1.3524092702526227, + "learning_rate": 4.895920470123857e-06, + "loss": 0.2232, + "step": 8588 + }, + { + "epoch": 0.6804515745692217, + "grad_norm": 1.4298295528186709, + "learning_rate": 4.893714024375432e-06, + "loss": 0.2615, + "step": 8589 + }, + { + "epoch": 0.680530798177857, + "grad_norm": 1.3163579260613203, + "learning_rate": 4.89150791485137e-06, + "loss": 0.2048, + "step": 8590 + }, + { + "epoch": 0.6806100217864923, + "grad_norm": 1.5571368110245922, + "learning_rate": 4.889302141696925e-06, + "loss": 0.2324, + "step": 8591 + }, + { + "epoch": 0.6806892453951278, + "grad_norm": 1.1372708376378766, + "learning_rate": 4.88709670505735e-06, + "loss": 0.2016, + "step": 8592 + }, + { + "epoch": 0.6807684690037631, + "grad_norm": 1.3949320182730078, + "learning_rate": 4.884891605077853e-06, + "loss": 0.2366, + "step": 8593 + }, + { + "epoch": 0.6808476926123985, + "grad_norm": 1.4697455827538954, + "learning_rate": 4.882686841903627e-06, + "loss": 0.2734, + "step": 8594 + }, + { + "epoch": 0.6809269162210339, + "grad_norm": 1.1974708909381966, + "learning_rate": 4.8804824156798544e-06, + "loss": 0.2149, + "step": 8595 + }, + { + "epoch": 0.6810061398296693, + "grad_norm": 1.334623519574587, + "learning_rate": 4.878278326551682e-06, + "loss": 0.2597, + "step": 8596 + }, + { + "epoch": 0.6810853634383046, + "grad_norm": 1.041702276895194, + "learning_rate": 4.876074574664232e-06, + "loss": 0.1712, + "step": 8597 + }, + { + "epoch": 0.6811645870469399, + "grad_norm": 1.2225275755342317, + "learning_rate": 4.873871160162622e-06, + "loss": 0.1966, + "step": 8598 + }, + { + "epoch": 0.6812438106555754, + "grad_norm": 1.32063055974464, + "learning_rate": 4.871668083191931e-06, + "loss": 0.2505, + "step": 8599 + }, + { + "epoch": 0.6813230342642107, + "grad_norm": 1.5434533961499866, + "learning_rate": 4.8694653438972195e-06, + "loss": 0.2377, + "step": 8600 + }, + { + "epoch": 0.6814022578728461, + "grad_norm": 1.259061429644916, + "learning_rate": 4.867262942423525e-06, + "loss": 0.2205, + "step": 8601 + }, + { + "epoch": 0.6814814814814815, + "grad_norm": 1.3001787816459887, + "learning_rate": 4.865060878915873e-06, + "loss": 0.2244, + "step": 8602 + }, + { + "epoch": 0.6815607050901169, + "grad_norm": 1.5300656722630785, + "learning_rate": 4.862859153519252e-06, + "loss": 0.3558, + "step": 8603 + }, + { + "epoch": 0.6816399286987522, + "grad_norm": 1.3489123113385095, + "learning_rate": 4.860657766378637e-06, + "loss": 0.2644, + "step": 8604 + }, + { + "epoch": 0.6817191523073876, + "grad_norm": 1.1164682734736109, + "learning_rate": 4.858456717638981e-06, + "loss": 0.1658, + "step": 8605 + }, + { + "epoch": 0.681798375916023, + "grad_norm": 1.1368699256579105, + "learning_rate": 4.856256007445211e-06, + "loss": 0.1782, + "step": 8606 + }, + { + "epoch": 0.6818775995246583, + "grad_norm": 1.117845241395544, + "learning_rate": 4.8540556359422335e-06, + "loss": 0.2178, + "step": 8607 + }, + { + "epoch": 0.6819568231332938, + "grad_norm": 0.9665361599182319, + "learning_rate": 4.85185560327493e-06, + "loss": 0.1887, + "step": 8608 + }, + { + "epoch": 0.6820360467419291, + "grad_norm": 1.117663236029897, + "learning_rate": 4.849655909588165e-06, + "loss": 0.2507, + "step": 8609 + }, + { + "epoch": 0.6821152703505645, + "grad_norm": 1.6921984046164105, + "learning_rate": 4.847456555026773e-06, + "loss": 0.3705, + "step": 8610 + }, + { + "epoch": 0.6821944939591998, + "grad_norm": 1.1210723898425137, + "learning_rate": 4.845257539735577e-06, + "loss": 0.1664, + "step": 8611 + }, + { + "epoch": 0.6822737175678352, + "grad_norm": 1.415947074903872, + "learning_rate": 4.843058863859369e-06, + "loss": 0.2803, + "step": 8612 + }, + { + "epoch": 0.6823529411764706, + "grad_norm": 1.24483140501919, + "learning_rate": 4.840860527542919e-06, + "loss": 0.205, + "step": 8613 + }, + { + "epoch": 0.6824321647851059, + "grad_norm": 1.2923913122358617, + "learning_rate": 4.838662530930981e-06, + "loss": 0.1862, + "step": 8614 + }, + { + "epoch": 0.6825113883937414, + "grad_norm": 1.3062788589390226, + "learning_rate": 4.836464874168282e-06, + "loss": 0.2518, + "step": 8615 + }, + { + "epoch": 0.6825906120023767, + "grad_norm": 1.347790117500209, + "learning_rate": 4.834267557399521e-06, + "loss": 0.2, + "step": 8616 + }, + { + "epoch": 0.6826698356110121, + "grad_norm": 1.2176999549205083, + "learning_rate": 4.832070580769389e-06, + "loss": 0.234, + "step": 8617 + }, + { + "epoch": 0.6827490592196475, + "grad_norm": 1.3712042554632222, + "learning_rate": 4.829873944422544e-06, + "loss": 0.2095, + "step": 8618 + }, + { + "epoch": 0.6828282828282828, + "grad_norm": 1.5183410923960414, + "learning_rate": 4.8276776485036185e-06, + "loss": 0.2868, + "step": 8619 + }, + { + "epoch": 0.6829075064369182, + "grad_norm": 1.4883322989058514, + "learning_rate": 4.825481693157235e-06, + "loss": 0.2597, + "step": 8620 + }, + { + "epoch": 0.6829867300455535, + "grad_norm": 1.145485347967497, + "learning_rate": 4.823286078527984e-06, + "loss": 0.1854, + "step": 8621 + }, + { + "epoch": 0.683065953654189, + "grad_norm": 1.6704907213039475, + "learning_rate": 4.8210908047604336e-06, + "loss": 0.2967, + "step": 8622 + }, + { + "epoch": 0.6831451772628243, + "grad_norm": 1.3095787412204616, + "learning_rate": 4.818895871999136e-06, + "loss": 0.2335, + "step": 8623 + }, + { + "epoch": 0.6832244008714597, + "grad_norm": 1.1791792651962514, + "learning_rate": 4.816701280388617e-06, + "loss": 0.1793, + "step": 8624 + }, + { + "epoch": 0.6833036244800951, + "grad_norm": 1.3638562304683728, + "learning_rate": 4.814507030073377e-06, + "loss": 0.2062, + "step": 8625 + }, + { + "epoch": 0.6833828480887304, + "grad_norm": 1.6530399676521763, + "learning_rate": 4.812313121197896e-06, + "loss": 0.2895, + "step": 8626 + }, + { + "epoch": 0.6834620716973658, + "grad_norm": 1.7617160150098754, + "learning_rate": 4.810119553906637e-06, + "loss": 0.3051, + "step": 8627 + }, + { + "epoch": 0.6835412953060012, + "grad_norm": 1.2416989542473023, + "learning_rate": 4.807926328344033e-06, + "loss": 0.1912, + "step": 8628 + }, + { + "epoch": 0.6836205189146366, + "grad_norm": 1.323281728304749, + "learning_rate": 4.805733444654496e-06, + "loss": 0.2318, + "step": 8629 + }, + { + "epoch": 0.6836997425232719, + "grad_norm": 1.1683418309452485, + "learning_rate": 4.8035409029824195e-06, + "loss": 0.218, + "step": 8630 + }, + { + "epoch": 0.6837789661319074, + "grad_norm": 1.3856863044127457, + "learning_rate": 4.801348703472173e-06, + "loss": 0.2756, + "step": 8631 + }, + { + "epoch": 0.6838581897405427, + "grad_norm": 1.9462143901227849, + "learning_rate": 4.7991568462680945e-06, + "loss": 0.2219, + "step": 8632 + }, + { + "epoch": 0.683937413349178, + "grad_norm": 1.1454987336134659, + "learning_rate": 4.796965331514517e-06, + "loss": 0.157, + "step": 8633 + }, + { + "epoch": 0.6840166369578135, + "grad_norm": 1.1352715343075432, + "learning_rate": 4.794774159355737e-06, + "loss": 0.1711, + "step": 8634 + }, + { + "epoch": 0.6840958605664488, + "grad_norm": 1.2226293572108666, + "learning_rate": 4.79258332993603e-06, + "loss": 0.2928, + "step": 8635 + }, + { + "epoch": 0.6841750841750842, + "grad_norm": 1.2016806855619928, + "learning_rate": 4.7903928433996576e-06, + "loss": 0.1816, + "step": 8636 + }, + { + "epoch": 0.6842543077837195, + "grad_norm": 1.1763682111881113, + "learning_rate": 4.788202699890848e-06, + "loss": 0.2076, + "step": 8637 + }, + { + "epoch": 0.684333531392355, + "grad_norm": 1.7213378312112722, + "learning_rate": 4.786012899553815e-06, + "loss": 0.3442, + "step": 8638 + }, + { + "epoch": 0.6844127550009903, + "grad_norm": 1.2935252698058537, + "learning_rate": 4.783823442532739e-06, + "loss": 0.2891, + "step": 8639 + }, + { + "epoch": 0.6844919786096256, + "grad_norm": 1.2061771396523617, + "learning_rate": 4.781634328971796e-06, + "loss": 0.2254, + "step": 8640 + }, + { + "epoch": 0.6845712022182611, + "grad_norm": 1.3499445483141876, + "learning_rate": 4.779445559015122e-06, + "loss": 0.2311, + "step": 8641 + }, + { + "epoch": 0.6846504258268964, + "grad_norm": 1.5390461832226396, + "learning_rate": 4.777257132806835e-06, + "loss": 0.262, + "step": 8642 + }, + { + "epoch": 0.6847296494355318, + "grad_norm": 1.154896922438916, + "learning_rate": 4.775069050491039e-06, + "loss": 0.1816, + "step": 8643 + }, + { + "epoch": 0.6848088730441672, + "grad_norm": 1.3634878824321564, + "learning_rate": 4.772881312211805e-06, + "loss": 0.2733, + "step": 8644 + }, + { + "epoch": 0.6848880966528025, + "grad_norm": 1.3856473474547926, + "learning_rate": 4.770693918113183e-06, + "loss": 0.2377, + "step": 8645 + }, + { + "epoch": 0.6849673202614379, + "grad_norm": 1.128114675175529, + "learning_rate": 4.768506868339206e-06, + "loss": 0.1942, + "step": 8646 + }, + { + "epoch": 0.6850465438700732, + "grad_norm": 1.3358746521230798, + "learning_rate": 4.766320163033882e-06, + "loss": 0.2585, + "step": 8647 + }, + { + "epoch": 0.6851257674787087, + "grad_norm": 1.2952707079162062, + "learning_rate": 4.764133802341188e-06, + "loss": 0.2014, + "step": 8648 + }, + { + "epoch": 0.685204991087344, + "grad_norm": 1.758093567423146, + "learning_rate": 4.761947786405092e-06, + "loss": 0.3067, + "step": 8649 + }, + { + "epoch": 0.6852842146959794, + "grad_norm": 1.2860424786279239, + "learning_rate": 4.759762115369531e-06, + "loss": 0.2163, + "step": 8650 + }, + { + "epoch": 0.6853634383046148, + "grad_norm": 1.5019812376462904, + "learning_rate": 4.7575767893784174e-06, + "loss": 0.2491, + "step": 8651 + }, + { + "epoch": 0.6854426619132501, + "grad_norm": 1.2064717851010112, + "learning_rate": 4.755391808575651e-06, + "loss": 0.1965, + "step": 8652 + }, + { + "epoch": 0.6855218855218855, + "grad_norm": 1.727082139943516, + "learning_rate": 4.7532071731050975e-06, + "loss": 0.343, + "step": 8653 + }, + { + "epoch": 0.6856011091305209, + "grad_norm": 1.0309992307517994, + "learning_rate": 4.7510228831106064e-06, + "loss": 0.1793, + "step": 8654 + }, + { + "epoch": 0.6856803327391563, + "grad_norm": 1.3969742025231717, + "learning_rate": 4.748838938735999e-06, + "loss": 0.2725, + "step": 8655 + }, + { + "epoch": 0.6857595563477916, + "grad_norm": 1.190236086142712, + "learning_rate": 4.746655340125082e-06, + "loss": 0.2112, + "step": 8656 + }, + { + "epoch": 0.6858387799564271, + "grad_norm": 1.1543977751939545, + "learning_rate": 4.744472087421635e-06, + "loss": 0.1994, + "step": 8657 + }, + { + "epoch": 0.6859180035650624, + "grad_norm": 1.1914671245124866, + "learning_rate": 4.74228918076941e-06, + "loss": 0.1743, + "step": 8658 + }, + { + "epoch": 0.6859972271736977, + "grad_norm": 1.3011104136012606, + "learning_rate": 4.740106620312147e-06, + "loss": 0.2404, + "step": 8659 + }, + { + "epoch": 0.6860764507823331, + "grad_norm": 1.2144773451360018, + "learning_rate": 4.737924406193554e-06, + "loss": 0.2079, + "step": 8660 + }, + { + "epoch": 0.6861556743909685, + "grad_norm": 1.1849785985586108, + "learning_rate": 4.735742538557316e-06, + "loss": 0.2054, + "step": 8661 + }, + { + "epoch": 0.6862348979996039, + "grad_norm": 1.3869187978249207, + "learning_rate": 4.733561017547104e-06, + "loss": 0.281, + "step": 8662 + }, + { + "epoch": 0.6863141216082392, + "grad_norm": 1.2918667619297424, + "learning_rate": 4.73137984330656e-06, + "loss": 0.2742, + "step": 8663 + }, + { + "epoch": 0.6863933452168747, + "grad_norm": 1.0196352551807941, + "learning_rate": 4.729199015979298e-06, + "loss": 0.1816, + "step": 8664 + }, + { + "epoch": 0.68647256882551, + "grad_norm": 1.2178115970693308, + "learning_rate": 4.727018535708922e-06, + "loss": 0.172, + "step": 8665 + }, + { + "epoch": 0.6865517924341453, + "grad_norm": 1.5066039014437018, + "learning_rate": 4.724838402639006e-06, + "loss": 0.3132, + "step": 8666 + }, + { + "epoch": 0.6866310160427808, + "grad_norm": 1.1456257437859685, + "learning_rate": 4.7226586169130925e-06, + "loss": 0.2016, + "step": 8667 + }, + { + "epoch": 0.6867102396514161, + "grad_norm": 1.2602859007210447, + "learning_rate": 4.7204791786747215e-06, + "loss": 0.2405, + "step": 8668 + }, + { + "epoch": 0.6867894632600515, + "grad_norm": 1.4354888054579482, + "learning_rate": 4.718300088067392e-06, + "loss": 0.2455, + "step": 8669 + }, + { + "epoch": 0.6868686868686869, + "grad_norm": 1.3860828594033565, + "learning_rate": 4.716121345234589e-06, + "loss": 0.2227, + "step": 8670 + }, + { + "epoch": 0.6869479104773223, + "grad_norm": 1.7746613521586, + "learning_rate": 4.713942950319767e-06, + "loss": 0.2306, + "step": 8671 + }, + { + "epoch": 0.6870271340859576, + "grad_norm": 1.3516699810515453, + "learning_rate": 4.71176490346637e-06, + "loss": 0.2236, + "step": 8672 + }, + { + "epoch": 0.6871063576945929, + "grad_norm": 1.195409475282975, + "learning_rate": 4.709587204817809e-06, + "loss": 0.146, + "step": 8673 + }, + { + "epoch": 0.6871855813032284, + "grad_norm": 1.221400101328681, + "learning_rate": 4.707409854517471e-06, + "loss": 0.2292, + "step": 8674 + }, + { + "epoch": 0.6872648049118637, + "grad_norm": 1.65413198229928, + "learning_rate": 4.705232852708732e-06, + "loss": 0.3104, + "step": 8675 + }, + { + "epoch": 0.6873440285204991, + "grad_norm": 1.5063706668556864, + "learning_rate": 4.703056199534933e-06, + "loss": 0.2406, + "step": 8676 + }, + { + "epoch": 0.6874232521291345, + "grad_norm": 1.1346410709801327, + "learning_rate": 4.700879895139391e-06, + "loss": 0.181, + "step": 8677 + }, + { + "epoch": 0.6875024757377699, + "grad_norm": 1.3318105281097143, + "learning_rate": 4.698703939665414e-06, + "loss": 0.2183, + "step": 8678 + }, + { + "epoch": 0.6875816993464052, + "grad_norm": 1.4999602492663062, + "learning_rate": 4.696528333256275e-06, + "loss": 0.2284, + "step": 8679 + }, + { + "epoch": 0.6876609229550406, + "grad_norm": 1.358231022935779, + "learning_rate": 4.694353076055222e-06, + "loss": 0.2322, + "step": 8680 + }, + { + "epoch": 0.687740146563676, + "grad_norm": 1.039892398264782, + "learning_rate": 4.6921781682054954e-06, + "loss": 0.1552, + "step": 8681 + }, + { + "epoch": 0.6878193701723113, + "grad_norm": 1.208743790314156, + "learning_rate": 4.6900036098502956e-06, + "loss": 0.19, + "step": 8682 + }, + { + "epoch": 0.6878985937809468, + "grad_norm": 1.3307110870309475, + "learning_rate": 4.687829401132804e-06, + "loss": 0.2304, + "step": 8683 + }, + { + "epoch": 0.6879778173895821, + "grad_norm": 1.0195743451844448, + "learning_rate": 4.685655542196194e-06, + "loss": 0.1875, + "step": 8684 + }, + { + "epoch": 0.6880570409982175, + "grad_norm": 1.2583843440268834, + "learning_rate": 4.6834820331835915e-06, + "loss": 0.2802, + "step": 8685 + }, + { + "epoch": 0.6881362646068528, + "grad_norm": 0.84717147414347, + "learning_rate": 4.681308874238112e-06, + "loss": 0.0957, + "step": 8686 + }, + { + "epoch": 0.6882154882154882, + "grad_norm": 1.5709025061132116, + "learning_rate": 4.679136065502855e-06, + "loss": 0.281, + "step": 8687 + }, + { + "epoch": 0.6882947118241236, + "grad_norm": 1.3563168476709715, + "learning_rate": 4.676963607120886e-06, + "loss": 0.2647, + "step": 8688 + }, + { + "epoch": 0.6883739354327589, + "grad_norm": 1.4025748676997238, + "learning_rate": 4.674791499235246e-06, + "loss": 0.3222, + "step": 8689 + }, + { + "epoch": 0.6884531590413944, + "grad_norm": 1.27085195451568, + "learning_rate": 4.672619741988966e-06, + "loss": 0.2055, + "step": 8690 + }, + { + "epoch": 0.6885323826500297, + "grad_norm": 1.1802933538323492, + "learning_rate": 4.670448335525043e-06, + "loss": 0.2221, + "step": 8691 + }, + { + "epoch": 0.6886116062586651, + "grad_norm": 1.0575423849188226, + "learning_rate": 4.66827727998645e-06, + "loss": 0.1819, + "step": 8692 + }, + { + "epoch": 0.6886908298673005, + "grad_norm": 1.4898335149535489, + "learning_rate": 4.666106575516146e-06, + "loss": 0.3458, + "step": 8693 + }, + { + "epoch": 0.6887700534759358, + "grad_norm": 1.187573317501121, + "learning_rate": 4.663936222257059e-06, + "loss": 0.1664, + "step": 8694 + }, + { + "epoch": 0.6888492770845712, + "grad_norm": 1.7699323738081327, + "learning_rate": 4.661766220352098e-06, + "loss": 0.2722, + "step": 8695 + }, + { + "epoch": 0.6889285006932065, + "grad_norm": 1.285605046750008, + "learning_rate": 4.659596569944139e-06, + "loss": 0.2059, + "step": 8696 + }, + { + "epoch": 0.689007724301842, + "grad_norm": 1.3943419936684098, + "learning_rate": 4.657427271176055e-06, + "loss": 0.2118, + "step": 8697 + }, + { + "epoch": 0.6890869479104773, + "grad_norm": 1.7547872137325948, + "learning_rate": 4.655258324190678e-06, + "loss": 0.4043, + "step": 8698 + }, + { + "epoch": 0.6891661715191127, + "grad_norm": 1.4085062101251735, + "learning_rate": 4.65308972913082e-06, + "loss": 0.3605, + "step": 8699 + }, + { + "epoch": 0.6892453951277481, + "grad_norm": 1.2008387531312104, + "learning_rate": 4.6509214861392785e-06, + "loss": 0.2962, + "step": 8700 + }, + { + "epoch": 0.6893246187363834, + "grad_norm": 1.2940143376387314, + "learning_rate": 4.648753595358818e-06, + "loss": 0.2615, + "step": 8701 + }, + { + "epoch": 0.6894038423450188, + "grad_norm": 1.2277870085967522, + "learning_rate": 4.646586056932183e-06, + "loss": 0.2294, + "step": 8702 + }, + { + "epoch": 0.6894830659536542, + "grad_norm": 1.3006327385280338, + "learning_rate": 4.6444188710021e-06, + "loss": 0.2783, + "step": 8703 + }, + { + "epoch": 0.6895622895622896, + "grad_norm": 1.3089766592434473, + "learning_rate": 4.6422520377112646e-06, + "loss": 0.2622, + "step": 8704 + }, + { + "epoch": 0.6896415131709249, + "grad_norm": 1.1188427806612686, + "learning_rate": 4.640085557202349e-06, + "loss": 0.1849, + "step": 8705 + }, + { + "epoch": 0.6897207367795604, + "grad_norm": 1.337276875350201, + "learning_rate": 4.637919429618014e-06, + "loss": 0.2179, + "step": 8706 + }, + { + "epoch": 0.6897999603881957, + "grad_norm": 0.944755643224885, + "learning_rate": 4.635753655100883e-06, + "loss": 0.1451, + "step": 8707 + }, + { + "epoch": 0.689879183996831, + "grad_norm": 1.2254631560513103, + "learning_rate": 4.633588233793559e-06, + "loss": 0.2135, + "step": 8708 + }, + { + "epoch": 0.6899584076054665, + "grad_norm": 0.9947477879968354, + "learning_rate": 4.631423165838632e-06, + "loss": 0.1134, + "step": 8709 + }, + { + "epoch": 0.6900376312141018, + "grad_norm": 1.509402594511407, + "learning_rate": 4.629258451378658e-06, + "loss": 0.2853, + "step": 8710 + }, + { + "epoch": 0.6901168548227372, + "grad_norm": 1.6917343469637296, + "learning_rate": 4.6270940905561725e-06, + "loss": 0.2887, + "step": 8711 + }, + { + "epoch": 0.6901960784313725, + "grad_norm": 1.0878979515818141, + "learning_rate": 4.624930083513684e-06, + "loss": 0.1687, + "step": 8712 + }, + { + "epoch": 0.690275302040008, + "grad_norm": 1.262003386822182, + "learning_rate": 4.62276643039369e-06, + "loss": 0.2617, + "step": 8713 + }, + { + "epoch": 0.6903545256486433, + "grad_norm": 1.3320596668585638, + "learning_rate": 4.620603131338655e-06, + "loss": 0.2516, + "step": 8714 + }, + { + "epoch": 0.6904337492572786, + "grad_norm": 1.483981182453267, + "learning_rate": 4.6184401864910136e-06, + "loss": 0.2622, + "step": 8715 + }, + { + "epoch": 0.6905129728659141, + "grad_norm": 1.3631144009995455, + "learning_rate": 4.616277595993196e-06, + "loss": 0.2521, + "step": 8716 + }, + { + "epoch": 0.6905921964745494, + "grad_norm": 1.4478497043496281, + "learning_rate": 4.614115359987595e-06, + "loss": 0.209, + "step": 8717 + }, + { + "epoch": 0.6906714200831848, + "grad_norm": 1.2430777705725242, + "learning_rate": 4.6119534786165765e-06, + "loss": 0.192, + "step": 8718 + }, + { + "epoch": 0.6907506436918202, + "grad_norm": 1.384934937383765, + "learning_rate": 4.609791952022501e-06, + "loss": 0.2355, + "step": 8719 + }, + { + "epoch": 0.6908298673004556, + "grad_norm": 1.7022029348205618, + "learning_rate": 4.607630780347689e-06, + "loss": 0.3012, + "step": 8720 + }, + { + "epoch": 0.6909090909090909, + "grad_norm": 1.3707122641620741, + "learning_rate": 4.60546996373444e-06, + "loss": 0.2147, + "step": 8721 + }, + { + "epoch": 0.6909883145177262, + "grad_norm": 1.4744656815451067, + "learning_rate": 4.603309502325041e-06, + "loss": 0.2657, + "step": 8722 + }, + { + "epoch": 0.6910675381263617, + "grad_norm": 1.6494406511377315, + "learning_rate": 4.601149396261744e-06, + "loss": 0.2953, + "step": 8723 + }, + { + "epoch": 0.691146761734997, + "grad_norm": 1.1806655749821362, + "learning_rate": 4.598989645686782e-06, + "loss": 0.1276, + "step": 8724 + }, + { + "epoch": 0.6912259853436324, + "grad_norm": 1.7525038805497757, + "learning_rate": 4.596830250742359e-06, + "loss": 0.2903, + "step": 8725 + }, + { + "epoch": 0.6913052089522678, + "grad_norm": 1.0571604790375972, + "learning_rate": 4.594671211570671e-06, + "loss": 0.1928, + "step": 8726 + }, + { + "epoch": 0.6913844325609031, + "grad_norm": 1.3804828341951167, + "learning_rate": 4.592512528313874e-06, + "loss": 0.2436, + "step": 8727 + }, + { + "epoch": 0.6914636561695385, + "grad_norm": 1.6509485856087844, + "learning_rate": 4.590354201114103e-06, + "loss": 0.3109, + "step": 8728 + }, + { + "epoch": 0.6915428797781739, + "grad_norm": 1.6471259632083965, + "learning_rate": 4.588196230113483e-06, + "loss": 0.2787, + "step": 8729 + }, + { + "epoch": 0.6916221033868093, + "grad_norm": 1.3425949322709636, + "learning_rate": 4.586038615454102e-06, + "loss": 0.2169, + "step": 8730 + }, + { + "epoch": 0.6917013269954446, + "grad_norm": 1.2102864100087452, + "learning_rate": 4.583881357278023e-06, + "loss": 0.2457, + "step": 8731 + }, + { + "epoch": 0.6917805506040801, + "grad_norm": 1.4576335760363308, + "learning_rate": 4.5817244557273e-06, + "loss": 0.2565, + "step": 8732 + }, + { + "epoch": 0.6918597742127154, + "grad_norm": 1.0434994412803442, + "learning_rate": 4.5795679109439505e-06, + "loss": 0.2039, + "step": 8733 + }, + { + "epoch": 0.6919389978213507, + "grad_norm": 1.420031312849618, + "learning_rate": 4.57741172306997e-06, + "loss": 0.3024, + "step": 8734 + }, + { + "epoch": 0.6920182214299861, + "grad_norm": 1.566416117201726, + "learning_rate": 4.5752558922473376e-06, + "loss": 0.3626, + "step": 8735 + }, + { + "epoch": 0.6920974450386215, + "grad_norm": 1.4186331093927886, + "learning_rate": 4.573100418618004e-06, + "loss": 0.2393, + "step": 8736 + }, + { + "epoch": 0.6921766686472569, + "grad_norm": 1.52562327832964, + "learning_rate": 4.57094530232389e-06, + "loss": 0.3389, + "step": 8737 + }, + { + "epoch": 0.6922558922558922, + "grad_norm": 1.0635909690795307, + "learning_rate": 4.5687905435069106e-06, + "loss": 0.193, + "step": 8738 + }, + { + "epoch": 0.6923351158645277, + "grad_norm": 1.348352342256825, + "learning_rate": 4.566636142308939e-06, + "loss": 0.1924, + "step": 8739 + }, + { + "epoch": 0.692414339473163, + "grad_norm": 1.994858954210715, + "learning_rate": 4.564482098871834e-06, + "loss": 0.2743, + "step": 8740 + }, + { + "epoch": 0.6924935630817983, + "grad_norm": 1.5937476550343863, + "learning_rate": 4.562328413337426e-06, + "loss": 0.1965, + "step": 8741 + }, + { + "epoch": 0.6925727866904338, + "grad_norm": 1.1209159642424666, + "learning_rate": 4.56017508584753e-06, + "loss": 0.1776, + "step": 8742 + }, + { + "epoch": 0.6926520102990691, + "grad_norm": 1.4812936938993486, + "learning_rate": 4.558022116543931e-06, + "loss": 0.215, + "step": 8743 + }, + { + "epoch": 0.6927312339077045, + "grad_norm": 1.2060702534011785, + "learning_rate": 4.555869505568386e-06, + "loss": 0.2142, + "step": 8744 + }, + { + "epoch": 0.6928104575163399, + "grad_norm": 1.365700120754585, + "learning_rate": 4.553717253062643e-06, + "loss": 0.2796, + "step": 8745 + }, + { + "epoch": 0.6928896811249753, + "grad_norm": 1.2835470597011995, + "learning_rate": 4.551565359168411e-06, + "loss": 0.251, + "step": 8746 + }, + { + "epoch": 0.6929689047336106, + "grad_norm": 1.155391005013538, + "learning_rate": 4.549413824027382e-06, + "loss": 0.2516, + "step": 8747 + }, + { + "epoch": 0.6930481283422459, + "grad_norm": 1.7505118271213098, + "learning_rate": 4.54726264778123e-06, + "loss": 0.2689, + "step": 8748 + }, + { + "epoch": 0.6931273519508814, + "grad_norm": 1.526987493710666, + "learning_rate": 4.5451118305715954e-06, + "loss": 0.2771, + "step": 8749 + }, + { + "epoch": 0.6932065755595167, + "grad_norm": 1.3722794258858013, + "learning_rate": 4.542961372540096e-06, + "loss": 0.223, + "step": 8750 + }, + { + "epoch": 0.6932857991681521, + "grad_norm": 1.504889370782872, + "learning_rate": 4.540811273828336e-06, + "loss": 0.247, + "step": 8751 + }, + { + "epoch": 0.6933650227767875, + "grad_norm": 1.2212461020321752, + "learning_rate": 4.538661534577886e-06, + "loss": 0.2314, + "step": 8752 + }, + { + "epoch": 0.6934442463854229, + "grad_norm": 1.3492761550397696, + "learning_rate": 4.5365121549302916e-06, + "loss": 0.2101, + "step": 8753 + }, + { + "epoch": 0.6935234699940582, + "grad_norm": 1.257409896677816, + "learning_rate": 4.534363135027086e-06, + "loss": 0.1682, + "step": 8754 + }, + { + "epoch": 0.6936026936026936, + "grad_norm": 1.304270526398794, + "learning_rate": 4.532214475009771e-06, + "loss": 0.2333, + "step": 8755 + }, + { + "epoch": 0.693681917211329, + "grad_norm": 1.647257818436006, + "learning_rate": 4.530066175019823e-06, + "loss": 0.2754, + "step": 8756 + }, + { + "epoch": 0.6937611408199643, + "grad_norm": 1.1709194583314289, + "learning_rate": 4.527918235198692e-06, + "loss": 0.2048, + "step": 8757 + }, + { + "epoch": 0.6938403644285998, + "grad_norm": 1.3617125210134675, + "learning_rate": 4.525770655687821e-06, + "loss": 0.2177, + "step": 8758 + }, + { + "epoch": 0.6939195880372351, + "grad_norm": 1.5215139691324813, + "learning_rate": 4.523623436628611e-06, + "loss": 0.253, + "step": 8759 + }, + { + "epoch": 0.6939988116458705, + "grad_norm": 1.2925865333906732, + "learning_rate": 4.521476578162445e-06, + "loss": 0.2122, + "step": 8760 + }, + { + "epoch": 0.6940780352545058, + "grad_norm": 1.3929779419343227, + "learning_rate": 4.519330080430687e-06, + "loss": 0.2246, + "step": 8761 + }, + { + "epoch": 0.6941572588631412, + "grad_norm": 1.3636242206882143, + "learning_rate": 4.517183943574673e-06, + "loss": 0.172, + "step": 8762 + }, + { + "epoch": 0.6942364824717766, + "grad_norm": 1.3772760187609399, + "learning_rate": 4.515038167735715e-06, + "loss": 0.302, + "step": 8763 + }, + { + "epoch": 0.6943157060804119, + "grad_norm": 1.4778475696989446, + "learning_rate": 4.5128927530551e-06, + "loss": 0.231, + "step": 8764 + }, + { + "epoch": 0.6943949296890474, + "grad_norm": 0.8099686810018009, + "learning_rate": 4.510747699674096e-06, + "loss": 0.0897, + "step": 8765 + }, + { + "epoch": 0.6944741532976827, + "grad_norm": 1.4415160462821115, + "learning_rate": 4.50860300773394e-06, + "loss": 0.2337, + "step": 8766 + }, + { + "epoch": 0.6945533769063181, + "grad_norm": 1.5181661334498546, + "learning_rate": 4.506458677375856e-06, + "loss": 0.3006, + "step": 8767 + }, + { + "epoch": 0.6946326005149535, + "grad_norm": 1.3778953504845226, + "learning_rate": 4.504314708741037e-06, + "loss": 0.2874, + "step": 8768 + }, + { + "epoch": 0.6947118241235888, + "grad_norm": 1.2925908182928736, + "learning_rate": 4.502171101970645e-06, + "loss": 0.1839, + "step": 8769 + }, + { + "epoch": 0.6947910477322242, + "grad_norm": 1.2884098423161836, + "learning_rate": 4.5000278572058365e-06, + "loss": 0.1839, + "step": 8770 + }, + { + "epoch": 0.6948702713408595, + "grad_norm": 1.2785623006265123, + "learning_rate": 4.497884974587729e-06, + "loss": 0.2067, + "step": 8771 + }, + { + "epoch": 0.694949494949495, + "grad_norm": 1.4554851056660039, + "learning_rate": 4.495742454257418e-06, + "loss": 0.238, + "step": 8772 + }, + { + "epoch": 0.6950287185581303, + "grad_norm": 1.447509004133046, + "learning_rate": 4.493600296355986e-06, + "loss": 0.2989, + "step": 8773 + }, + { + "epoch": 0.6951079421667657, + "grad_norm": 1.3943162340471837, + "learning_rate": 4.491458501024479e-06, + "loss": 0.2883, + "step": 8774 + }, + { + "epoch": 0.6951871657754011, + "grad_norm": 1.2076899978167548, + "learning_rate": 4.489317068403919e-06, + "loss": 0.187, + "step": 8775 + }, + { + "epoch": 0.6952663893840364, + "grad_norm": 1.4105980297316858, + "learning_rate": 4.487175998635319e-06, + "loss": 0.2658, + "step": 8776 + }, + { + "epoch": 0.6953456129926718, + "grad_norm": 1.416529640210077, + "learning_rate": 4.485035291859654e-06, + "loss": 0.2537, + "step": 8777 + }, + { + "epoch": 0.6954248366013072, + "grad_norm": 1.1543317782707463, + "learning_rate": 4.482894948217875e-06, + "loss": 0.1839, + "step": 8778 + }, + { + "epoch": 0.6955040602099426, + "grad_norm": 1.0478537328893316, + "learning_rate": 4.48075496785092e-06, + "loss": 0.1744, + "step": 8779 + }, + { + "epoch": 0.6955832838185779, + "grad_norm": 1.642981889691146, + "learning_rate": 4.4786153508996944e-06, + "loss": 0.2143, + "step": 8780 + }, + { + "epoch": 0.6956625074272134, + "grad_norm": 1.4862071057729505, + "learning_rate": 4.47647609750508e-06, + "loss": 0.2416, + "step": 8781 + }, + { + "epoch": 0.6957417310358487, + "grad_norm": 1.2940984396724406, + "learning_rate": 4.4743372078079335e-06, + "loss": 0.2544, + "step": 8782 + }, + { + "epoch": 0.695820954644484, + "grad_norm": 1.4054540989895064, + "learning_rate": 4.472198681949098e-06, + "loss": 0.2561, + "step": 8783 + }, + { + "epoch": 0.6959001782531195, + "grad_norm": 1.5158475747576985, + "learning_rate": 4.470060520069381e-06, + "loss": 0.2757, + "step": 8784 + }, + { + "epoch": 0.6959794018617548, + "grad_norm": 1.2560370940733032, + "learning_rate": 4.467922722309567e-06, + "loss": 0.2924, + "step": 8785 + }, + { + "epoch": 0.6960586254703902, + "grad_norm": 1.4106584520775325, + "learning_rate": 4.465785288810427e-06, + "loss": 0.2588, + "step": 8786 + }, + { + "epoch": 0.6961378490790255, + "grad_norm": 1.4586344429334912, + "learning_rate": 4.4636482197126965e-06, + "loss": 0.2467, + "step": 8787 + }, + { + "epoch": 0.696217072687661, + "grad_norm": 1.1379517706083984, + "learning_rate": 4.461511515157087e-06, + "loss": 0.1914, + "step": 8788 + }, + { + "epoch": 0.6962962962962963, + "grad_norm": 1.385806669472669, + "learning_rate": 4.459375175284299e-06, + "loss": 0.2247, + "step": 8789 + }, + { + "epoch": 0.6963755199049316, + "grad_norm": 1.4387901667417589, + "learning_rate": 4.457239200234996e-06, + "loss": 0.2696, + "step": 8790 + }, + { + "epoch": 0.6964547435135671, + "grad_norm": 1.0836588377750644, + "learning_rate": 4.4551035901498186e-06, + "loss": 0.2048, + "step": 8791 + }, + { + "epoch": 0.6965339671222024, + "grad_norm": 1.270436749011163, + "learning_rate": 4.4529683451693916e-06, + "loss": 0.2197, + "step": 8792 + }, + { + "epoch": 0.6966131907308378, + "grad_norm": 1.5154897511294212, + "learning_rate": 4.45083346543431e-06, + "loss": 0.2377, + "step": 8793 + }, + { + "epoch": 0.6966924143394732, + "grad_norm": 1.3066526828623988, + "learning_rate": 4.448698951085143e-06, + "loss": 0.2356, + "step": 8794 + }, + { + "epoch": 0.6967716379481086, + "grad_norm": 1.2470028333404923, + "learning_rate": 4.446564802262435e-06, + "loss": 0.1113, + "step": 8795 + }, + { + "epoch": 0.6968508615567439, + "grad_norm": 1.386025383460799, + "learning_rate": 4.444431019106718e-06, + "loss": 0.1977, + "step": 8796 + }, + { + "epoch": 0.6969300851653792, + "grad_norm": 1.5959343072064138, + "learning_rate": 4.4422976017584866e-06, + "loss": 0.3356, + "step": 8797 + }, + { + "epoch": 0.6970093087740147, + "grad_norm": 1.2099062541547179, + "learning_rate": 4.440164550358212e-06, + "loss": 0.2219, + "step": 8798 + }, + { + "epoch": 0.69708853238265, + "grad_norm": 1.243105102253641, + "learning_rate": 4.438031865046353e-06, + "loss": 0.1937, + "step": 8799 + }, + { + "epoch": 0.6971677559912854, + "grad_norm": 1.3659841787956428, + "learning_rate": 4.435899545963333e-06, + "loss": 0.2334, + "step": 8800 + }, + { + "epoch": 0.6972469795999208, + "grad_norm": 1.133632684945457, + "learning_rate": 4.4337675932495515e-06, + "loss": 0.167, + "step": 8801 + }, + { + "epoch": 0.6973262032085561, + "grad_norm": 1.2762805432610065, + "learning_rate": 4.431636007045396e-06, + "loss": 0.1663, + "step": 8802 + }, + { + "epoch": 0.6974054268171915, + "grad_norm": 1.3279277988480715, + "learning_rate": 4.429504787491214e-06, + "loss": 0.2183, + "step": 8803 + }, + { + "epoch": 0.6974846504258269, + "grad_norm": 1.5680494355985402, + "learning_rate": 4.427373934727337e-06, + "loss": 0.3935, + "step": 8804 + }, + { + "epoch": 0.6975638740344623, + "grad_norm": 1.0921913552068514, + "learning_rate": 4.425243448894074e-06, + "loss": 0.1628, + "step": 8805 + }, + { + "epoch": 0.6976430976430976, + "grad_norm": 1.2228603767237676, + "learning_rate": 4.423113330131708e-06, + "loss": 0.2317, + "step": 8806 + }, + { + "epoch": 0.6977223212517331, + "grad_norm": 1.6127320662513773, + "learning_rate": 4.42098357858049e-06, + "loss": 0.3141, + "step": 8807 + }, + { + "epoch": 0.6978015448603684, + "grad_norm": 1.2127393656729695, + "learning_rate": 4.418854194380663e-06, + "loss": 0.251, + "step": 8808 + }, + { + "epoch": 0.6978807684690037, + "grad_norm": 1.0391332710304808, + "learning_rate": 4.416725177672432e-06, + "loss": 0.1568, + "step": 8809 + }, + { + "epoch": 0.6979599920776391, + "grad_norm": 1.2022421904540985, + "learning_rate": 4.4145965285959836e-06, + "loss": 0.2347, + "step": 8810 + }, + { + "epoch": 0.6980392156862745, + "grad_norm": 1.2580558238458839, + "learning_rate": 4.412468247291474e-06, + "loss": 0.2178, + "step": 8811 + }, + { + "epoch": 0.6981184392949099, + "grad_norm": 1.2965298613678125, + "learning_rate": 4.410340333899049e-06, + "loss": 0.2408, + "step": 8812 + }, + { + "epoch": 0.6981976629035452, + "grad_norm": 1.5271511246798966, + "learning_rate": 4.408212788558818e-06, + "loss": 0.271, + "step": 8813 + }, + { + "epoch": 0.6982768865121807, + "grad_norm": 1.5411104285903274, + "learning_rate": 4.406085611410864e-06, + "loss": 0.3036, + "step": 8814 + }, + { + "epoch": 0.698356110120816, + "grad_norm": 1.4777125680054977, + "learning_rate": 4.403958802595261e-06, + "loss": 0.3012, + "step": 8815 + }, + { + "epoch": 0.6984353337294513, + "grad_norm": 1.4592262357239205, + "learning_rate": 4.401832362252044e-06, + "loss": 0.2601, + "step": 8816 + }, + { + "epoch": 0.6985145573380868, + "grad_norm": 1.2754787670211158, + "learning_rate": 4.399706290521225e-06, + "loss": 0.1998, + "step": 8817 + }, + { + "epoch": 0.6985937809467221, + "grad_norm": 1.374528557590866, + "learning_rate": 4.397580587542805e-06, + "loss": 0.2236, + "step": 8818 + }, + { + "epoch": 0.6986730045553575, + "grad_norm": 1.2649836213993118, + "learning_rate": 4.3954552534567455e-06, + "loss": 0.2244, + "step": 8819 + }, + { + "epoch": 0.6987522281639929, + "grad_norm": 1.3787977130235152, + "learning_rate": 4.393330288402986e-06, + "loss": 0.29, + "step": 8820 + }, + { + "epoch": 0.6988314517726283, + "grad_norm": 1.573556956514344, + "learning_rate": 4.391205692521453e-06, + "loss": 0.2901, + "step": 8821 + }, + { + "epoch": 0.6989106753812636, + "grad_norm": 1.4970212671112728, + "learning_rate": 4.389081465952039e-06, + "loss": 0.2867, + "step": 8822 + }, + { + "epoch": 0.6989898989898989, + "grad_norm": 1.1030741586174122, + "learning_rate": 4.386957608834607e-06, + "loss": 0.1685, + "step": 8823 + }, + { + "epoch": 0.6990691225985344, + "grad_norm": 1.7454605754173802, + "learning_rate": 4.384834121309013e-06, + "loss": 0.3222, + "step": 8824 + }, + { + "epoch": 0.6991483462071697, + "grad_norm": 1.4506190724576338, + "learning_rate": 4.382711003515072e-06, + "loss": 0.2763, + "step": 8825 + }, + { + "epoch": 0.6992275698158051, + "grad_norm": 1.3550989101118316, + "learning_rate": 4.3805882555925846e-06, + "loss": 0.1947, + "step": 8826 + }, + { + "epoch": 0.6993067934244405, + "grad_norm": 1.3405342202457475, + "learning_rate": 4.378465877681317e-06, + "loss": 0.2151, + "step": 8827 + }, + { + "epoch": 0.6993860170330759, + "grad_norm": 1.3577711192679864, + "learning_rate": 4.376343869921027e-06, + "loss": 0.2469, + "step": 8828 + }, + { + "epoch": 0.6994652406417112, + "grad_norm": 1.4776447890626994, + "learning_rate": 4.374222232451433e-06, + "loss": 0.3133, + "step": 8829 + }, + { + "epoch": 0.6995444642503466, + "grad_norm": 1.2475273626649297, + "learning_rate": 4.3721009654122315e-06, + "loss": 0.2606, + "step": 8830 + }, + { + "epoch": 0.699623687858982, + "grad_norm": 1.0841324754200954, + "learning_rate": 4.369980068943106e-06, + "loss": 0.166, + "step": 8831 + }, + { + "epoch": 0.6997029114676173, + "grad_norm": 1.2375589900776478, + "learning_rate": 4.367859543183702e-06, + "loss": 0.2523, + "step": 8832 + }, + { + "epoch": 0.6997821350762528, + "grad_norm": 1.150124650157148, + "learning_rate": 4.3657393882736456e-06, + "loss": 0.1865, + "step": 8833 + }, + { + "epoch": 0.6998613586848881, + "grad_norm": 1.396953914316769, + "learning_rate": 4.3636196043525415e-06, + "loss": 0.2713, + "step": 8834 + }, + { + "epoch": 0.6999405822935235, + "grad_norm": 1.1166195642514403, + "learning_rate": 4.361500191559967e-06, + "loss": 0.218, + "step": 8835 + }, + { + "epoch": 0.7000198059021588, + "grad_norm": 1.2507957612721141, + "learning_rate": 4.35938115003547e-06, + "loss": 0.2655, + "step": 8836 + }, + { + "epoch": 0.7000990295107942, + "grad_norm": 1.2561618669072205, + "learning_rate": 4.357262479918587e-06, + "loss": 0.1881, + "step": 8837 + }, + { + "epoch": 0.7001782531194296, + "grad_norm": 1.1331421191175617, + "learning_rate": 4.355144181348819e-06, + "loss": 0.2144, + "step": 8838 + }, + { + "epoch": 0.7002574767280649, + "grad_norm": 1.329150152318647, + "learning_rate": 4.353026254465642e-06, + "loss": 0.2442, + "step": 8839 + }, + { + "epoch": 0.7003367003367004, + "grad_norm": 1.3699642315685037, + "learning_rate": 4.350908699408521e-06, + "loss": 0.2486, + "step": 8840 + }, + { + "epoch": 0.7004159239453357, + "grad_norm": 1.3698029164621957, + "learning_rate": 4.348791516316878e-06, + "loss": 0.2504, + "step": 8841 + }, + { + "epoch": 0.7004951475539711, + "grad_norm": 1.4660385399250195, + "learning_rate": 4.346674705330117e-06, + "loss": 0.2627, + "step": 8842 + }, + { + "epoch": 0.7005743711626065, + "grad_norm": 1.1877712750901945, + "learning_rate": 4.344558266587628e-06, + "loss": 0.2314, + "step": 8843 + }, + { + "epoch": 0.7006535947712418, + "grad_norm": 1.0924659198478786, + "learning_rate": 4.342442200228766e-06, + "loss": 0.1674, + "step": 8844 + }, + { + "epoch": 0.7007328183798772, + "grad_norm": 1.2391572113579326, + "learning_rate": 4.340326506392859e-06, + "loss": 0.1906, + "step": 8845 + }, + { + "epoch": 0.7008120419885125, + "grad_norm": 1.2861973452650104, + "learning_rate": 4.338211185219222e-06, + "loss": 0.2424, + "step": 8846 + }, + { + "epoch": 0.700891265597148, + "grad_norm": 1.4967423782653613, + "learning_rate": 4.336096236847136e-06, + "loss": 0.2651, + "step": 8847 + }, + { + "epoch": 0.7009704892057833, + "grad_norm": 1.106477185134246, + "learning_rate": 4.333981661415856e-06, + "loss": 0.1363, + "step": 8848 + }, + { + "epoch": 0.7010497128144187, + "grad_norm": 1.3555705126951718, + "learning_rate": 4.331867459064623e-06, + "loss": 0.2175, + "step": 8849 + }, + { + "epoch": 0.7011289364230541, + "grad_norm": 1.4114150946018673, + "learning_rate": 4.329753629932646e-06, + "loss": 0.2885, + "step": 8850 + }, + { + "epoch": 0.7012081600316894, + "grad_norm": 1.3218864577591556, + "learning_rate": 4.327640174159109e-06, + "loss": 0.2847, + "step": 8851 + }, + { + "epoch": 0.7012873836403248, + "grad_norm": 1.1831231914701292, + "learning_rate": 4.325527091883168e-06, + "loss": 0.1438, + "step": 8852 + }, + { + "epoch": 0.7013666072489602, + "grad_norm": 1.5070682114586984, + "learning_rate": 4.323414383243969e-06, + "loss": 0.2667, + "step": 8853 + }, + { + "epoch": 0.7014458308575956, + "grad_norm": 1.2902207942796624, + "learning_rate": 4.321302048380619e-06, + "loss": 0.2427, + "step": 8854 + }, + { + "epoch": 0.7015250544662309, + "grad_norm": 1.6337809821668723, + "learning_rate": 4.319190087432201e-06, + "loss": 0.2736, + "step": 8855 + }, + { + "epoch": 0.7016042780748664, + "grad_norm": 1.2964343606334223, + "learning_rate": 4.317078500537785e-06, + "loss": 0.3022, + "step": 8856 + }, + { + "epoch": 0.7016835016835017, + "grad_norm": 1.407194086118684, + "learning_rate": 4.314967287836405e-06, + "loss": 0.2619, + "step": 8857 + }, + { + "epoch": 0.701762725292137, + "grad_norm": 1.5025381898005565, + "learning_rate": 4.3128564494670715e-06, + "loss": 0.2507, + "step": 8858 + }, + { + "epoch": 0.7018419489007724, + "grad_norm": 1.5473653448593045, + "learning_rate": 4.310745985568779e-06, + "loss": 0.263, + "step": 8859 + }, + { + "epoch": 0.7019211725094078, + "grad_norm": 1.5279687347061008, + "learning_rate": 4.3086358962804885e-06, + "loss": 0.2252, + "step": 8860 + }, + { + "epoch": 0.7020003961180432, + "grad_norm": 1.248919945168579, + "learning_rate": 4.306526181741135e-06, + "loss": 0.2516, + "step": 8861 + }, + { + "epoch": 0.7020796197266785, + "grad_norm": 1.4363142308416414, + "learning_rate": 4.304416842089641e-06, + "loss": 0.2416, + "step": 8862 + }, + { + "epoch": 0.702158843335314, + "grad_norm": 1.5856464484106991, + "learning_rate": 4.302307877464893e-06, + "loss": 0.2462, + "step": 8863 + }, + { + "epoch": 0.7022380669439493, + "grad_norm": 1.2228645686668598, + "learning_rate": 4.300199288005753e-06, + "loss": 0.2222, + "step": 8864 + }, + { + "epoch": 0.7023172905525846, + "grad_norm": 1.1943693296604285, + "learning_rate": 4.298091073851066e-06, + "loss": 0.1743, + "step": 8865 + }, + { + "epoch": 0.7023965141612201, + "grad_norm": 0.9489549208756705, + "learning_rate": 4.295983235139647e-06, + "loss": 0.1473, + "step": 8866 + }, + { + "epoch": 0.7024757377698554, + "grad_norm": 1.6134968380053571, + "learning_rate": 4.293875772010287e-06, + "loss": 0.2231, + "step": 8867 + }, + { + "epoch": 0.7025549613784908, + "grad_norm": 1.5126120617975245, + "learning_rate": 4.291768684601746e-06, + "loss": 0.2647, + "step": 8868 + }, + { + "epoch": 0.7026341849871262, + "grad_norm": 1.234192928214084, + "learning_rate": 4.289661973052774e-06, + "loss": 0.2126, + "step": 8869 + }, + { + "epoch": 0.7027134085957616, + "grad_norm": 1.3349316479660362, + "learning_rate": 4.287555637502086e-06, + "loss": 0.2585, + "step": 8870 + }, + { + "epoch": 0.7027926322043969, + "grad_norm": 1.5783766651416529, + "learning_rate": 4.285449678088369e-06, + "loss": 0.2997, + "step": 8871 + }, + { + "epoch": 0.7028718558130322, + "grad_norm": 1.2489360253198072, + "learning_rate": 4.283344094950297e-06, + "loss": 0.2515, + "step": 8872 + }, + { + "epoch": 0.7029510794216677, + "grad_norm": 1.4003730092696396, + "learning_rate": 4.2812388882265095e-06, + "loss": 0.2608, + "step": 8873 + }, + { + "epoch": 0.703030303030303, + "grad_norm": 1.2116296877476789, + "learning_rate": 4.279134058055622e-06, + "loss": 0.2208, + "step": 8874 + }, + { + "epoch": 0.7031095266389384, + "grad_norm": 1.5623281862728544, + "learning_rate": 4.2770296045762315e-06, + "loss": 0.2656, + "step": 8875 + }, + { + "epoch": 0.7031887502475738, + "grad_norm": 1.1472092646930603, + "learning_rate": 4.274925527926907e-06, + "loss": 0.2592, + "step": 8876 + }, + { + "epoch": 0.7032679738562092, + "grad_norm": 1.0869502010717735, + "learning_rate": 4.272821828246183e-06, + "loss": 0.159, + "step": 8877 + }, + { + "epoch": 0.7033471974648445, + "grad_norm": 1.4178843089143776, + "learning_rate": 4.270718505672588e-06, + "loss": 0.2633, + "step": 8878 + }, + { + "epoch": 0.7034264210734799, + "grad_norm": 1.24786991171341, + "learning_rate": 4.2686155603446134e-06, + "loss": 0.2291, + "step": 8879 + }, + { + "epoch": 0.7035056446821153, + "grad_norm": 1.5405362632685256, + "learning_rate": 4.266512992400726e-06, + "loss": 0.3126, + "step": 8880 + }, + { + "epoch": 0.7035848682907506, + "grad_norm": 1.2324526643331342, + "learning_rate": 4.2644108019793665e-06, + "loss": 0.2487, + "step": 8881 + }, + { + "epoch": 0.7036640918993861, + "grad_norm": 1.3283865467153089, + "learning_rate": 4.262308989218961e-06, + "loss": 0.2385, + "step": 8882 + }, + { + "epoch": 0.7037433155080214, + "grad_norm": 1.2483571111991931, + "learning_rate": 4.2602075542579e-06, + "loss": 0.224, + "step": 8883 + }, + { + "epoch": 0.7038225391166567, + "grad_norm": 1.303042737802848, + "learning_rate": 4.258106497234551e-06, + "loss": 0.2442, + "step": 8884 + }, + { + "epoch": 0.7039017627252921, + "grad_norm": 1.4292886046190605, + "learning_rate": 4.256005818287265e-06, + "loss": 0.291, + "step": 8885 + }, + { + "epoch": 0.7039809863339275, + "grad_norm": 1.2123406052092123, + "learning_rate": 4.253905517554356e-06, + "loss": 0.2432, + "step": 8886 + }, + { + "epoch": 0.7040602099425629, + "grad_norm": 1.0967983864533966, + "learning_rate": 4.251805595174117e-06, + "loss": 0.1886, + "step": 8887 + }, + { + "epoch": 0.7041394335511982, + "grad_norm": 1.5317372605693833, + "learning_rate": 4.249706051284824e-06, + "loss": 0.2696, + "step": 8888 + }, + { + "epoch": 0.7042186571598337, + "grad_norm": 1.4409124523580885, + "learning_rate": 4.24760688602472e-06, + "loss": 0.3022, + "step": 8889 + }, + { + "epoch": 0.704297880768469, + "grad_norm": 1.4902861652147272, + "learning_rate": 4.245508099532021e-06, + "loss": 0.2617, + "step": 8890 + }, + { + "epoch": 0.7043771043771043, + "grad_norm": 1.4545460296068426, + "learning_rate": 4.243409691944927e-06, + "loss": 0.2749, + "step": 8891 + }, + { + "epoch": 0.7044563279857398, + "grad_norm": 1.3462913922671005, + "learning_rate": 4.241311663401606e-06, + "loss": 0.2506, + "step": 8892 + }, + { + "epoch": 0.7045355515943751, + "grad_norm": 1.4933935746132918, + "learning_rate": 4.2392140140401996e-06, + "loss": 0.2073, + "step": 8893 + }, + { + "epoch": 0.7046147752030105, + "grad_norm": 1.431178771877433, + "learning_rate": 4.237116743998835e-06, + "loss": 0.3002, + "step": 8894 + }, + { + "epoch": 0.7046939988116458, + "grad_norm": 1.1504529671048034, + "learning_rate": 4.235019853415603e-06, + "loss": 0.2432, + "step": 8895 + }, + { + "epoch": 0.7047732224202813, + "grad_norm": 1.0942706664230106, + "learning_rate": 4.232923342428574e-06, + "loss": 0.2122, + "step": 8896 + }, + { + "epoch": 0.7048524460289166, + "grad_norm": 1.2535495924159599, + "learning_rate": 4.230827211175791e-06, + "loss": 0.26, + "step": 8897 + }, + { + "epoch": 0.7049316696375519, + "grad_norm": 1.2115789151055185, + "learning_rate": 4.22873145979528e-06, + "loss": 0.1836, + "step": 8898 + }, + { + "epoch": 0.7050108932461874, + "grad_norm": 1.3120290099293208, + "learning_rate": 4.226636088425033e-06, + "loss": 0.2248, + "step": 8899 + }, + { + "epoch": 0.7050901168548227, + "grad_norm": 1.4289531951763148, + "learning_rate": 4.2245410972030154e-06, + "loss": 0.2398, + "step": 8900 + }, + { + "epoch": 0.7051693404634581, + "grad_norm": 1.2498053136329115, + "learning_rate": 4.222446486267181e-06, + "loss": 0.22, + "step": 8901 + }, + { + "epoch": 0.7052485640720935, + "grad_norm": 1.4639804149280384, + "learning_rate": 4.220352255755445e-06, + "loss": 0.2612, + "step": 8902 + }, + { + "epoch": 0.7053277876807289, + "grad_norm": 1.1922254344167316, + "learning_rate": 4.218258405805701e-06, + "loss": 0.1632, + "step": 8903 + }, + { + "epoch": 0.7054070112893642, + "grad_norm": 1.5265282643117082, + "learning_rate": 4.216164936555823e-06, + "loss": 0.3068, + "step": 8904 + }, + { + "epoch": 0.7054862348979996, + "grad_norm": 1.8747856188475003, + "learning_rate": 4.214071848143655e-06, + "loss": 0.3932, + "step": 8905 + }, + { + "epoch": 0.705565458506635, + "grad_norm": 1.0341393801837642, + "learning_rate": 4.211979140707012e-06, + "loss": 0.1925, + "step": 8906 + }, + { + "epoch": 0.7056446821152703, + "grad_norm": 1.4229911545021316, + "learning_rate": 4.209886814383696e-06, + "loss": 0.3113, + "step": 8907 + }, + { + "epoch": 0.7057239057239058, + "grad_norm": 1.5059989869798511, + "learning_rate": 4.207794869311472e-06, + "loss": 0.2933, + "step": 8908 + }, + { + "epoch": 0.7058031293325411, + "grad_norm": 1.345198624372794, + "learning_rate": 4.205703305628082e-06, + "loss": 0.2746, + "step": 8909 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 1.3773974894062775, + "learning_rate": 4.203612123471254e-06, + "loss": 0.2837, + "step": 8910 + }, + { + "epoch": 0.7059615765498118, + "grad_norm": 1.0800277934005587, + "learning_rate": 4.201521322978677e-06, + "loss": 0.1694, + "step": 8911 + }, + { + "epoch": 0.7060408001584472, + "grad_norm": 1.2959808655530942, + "learning_rate": 4.19943090428802e-06, + "loss": 0.3015, + "step": 8912 + }, + { + "epoch": 0.7061200237670826, + "grad_norm": 1.171000056245596, + "learning_rate": 4.197340867536923e-06, + "loss": 0.1526, + "step": 8913 + }, + { + "epoch": 0.7061992473757179, + "grad_norm": 1.130753341245374, + "learning_rate": 4.195251212863014e-06, + "loss": 0.1913, + "step": 8914 + }, + { + "epoch": 0.7062784709843534, + "grad_norm": 1.8869494786582943, + "learning_rate": 4.193161940403882e-06, + "loss": 0.4051, + "step": 8915 + }, + { + "epoch": 0.7063576945929887, + "grad_norm": 0.9810038352177908, + "learning_rate": 4.191073050297091e-06, + "loss": 0.1223, + "step": 8916 + }, + { + "epoch": 0.7064369182016241, + "grad_norm": 1.6368855284270838, + "learning_rate": 4.188984542680192e-06, + "loss": 0.3014, + "step": 8917 + }, + { + "epoch": 0.7065161418102595, + "grad_norm": 1.3344330523306043, + "learning_rate": 4.186896417690701e-06, + "loss": 0.2324, + "step": 8918 + }, + { + "epoch": 0.7065953654188948, + "grad_norm": 1.4222349404829133, + "learning_rate": 4.18480867546611e-06, + "loss": 0.2174, + "step": 8919 + }, + { + "epoch": 0.7066745890275302, + "grad_norm": 1.530686486745354, + "learning_rate": 4.182721316143888e-06, + "loss": 0.2746, + "step": 8920 + }, + { + "epoch": 0.7067538126361655, + "grad_norm": 1.1611083898133914, + "learning_rate": 4.180634339861474e-06, + "loss": 0.1831, + "step": 8921 + }, + { + "epoch": 0.706833036244801, + "grad_norm": 1.5052280522508614, + "learning_rate": 4.178547746756285e-06, + "loss": 0.289, + "step": 8922 + }, + { + "epoch": 0.7069122598534363, + "grad_norm": 1.187851917689239, + "learning_rate": 4.17646153696572e-06, + "loss": 0.1766, + "step": 8923 + }, + { + "epoch": 0.7069914834620717, + "grad_norm": 1.2496962311411817, + "learning_rate": 4.174375710627141e-06, + "loss": 0.2302, + "step": 8924 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 1.2421867203157149, + "learning_rate": 4.172290267877887e-06, + "loss": 0.2272, + "step": 8925 + }, + { + "epoch": 0.7071499306793424, + "grad_norm": 1.0984019318469123, + "learning_rate": 4.170205208855281e-06, + "loss": 0.1465, + "step": 8926 + }, + { + "epoch": 0.7072291542879778, + "grad_norm": 1.1395597288490344, + "learning_rate": 4.1681205336966115e-06, + "loss": 0.2093, + "step": 8927 + }, + { + "epoch": 0.7073083778966132, + "grad_norm": 1.2487292701753407, + "learning_rate": 4.16603624253914e-06, + "loss": 0.1816, + "step": 8928 + }, + { + "epoch": 0.7073876015052486, + "grad_norm": 1.7038540930200028, + "learning_rate": 4.163952335520114e-06, + "loss": 0.3728, + "step": 8929 + }, + { + "epoch": 0.7074668251138839, + "grad_norm": 0.9933114233267937, + "learning_rate": 4.161868812776746e-06, + "loss": 0.162, + "step": 8930 + }, + { + "epoch": 0.7075460487225194, + "grad_norm": 1.253010524215799, + "learning_rate": 4.15978567444622e-06, + "loss": 0.1933, + "step": 8931 + }, + { + "epoch": 0.7076252723311547, + "grad_norm": 1.2080224557839214, + "learning_rate": 4.157702920665712e-06, + "loss": 0.1833, + "step": 8932 + }, + { + "epoch": 0.70770449593979, + "grad_norm": 1.4994320465947872, + "learning_rate": 4.155620551572354e-06, + "loss": 0.2954, + "step": 8933 + }, + { + "epoch": 0.7077837195484254, + "grad_norm": 1.178940771929154, + "learning_rate": 4.153538567303258e-06, + "loss": 0.1421, + "step": 8934 + }, + { + "epoch": 0.7078629431570608, + "grad_norm": 1.4522507863236107, + "learning_rate": 4.151456967995519e-06, + "loss": 0.2621, + "step": 8935 + }, + { + "epoch": 0.7079421667656962, + "grad_norm": 1.1402514628989178, + "learning_rate": 4.149375753786198e-06, + "loss": 0.1736, + "step": 8936 + }, + { + "epoch": 0.7080213903743315, + "grad_norm": 1.310741945449736, + "learning_rate": 4.147294924812332e-06, + "loss": 0.1997, + "step": 8937 + }, + { + "epoch": 0.708100613982967, + "grad_norm": 1.382323889213825, + "learning_rate": 4.14521448121093e-06, + "loss": 0.2456, + "step": 8938 + }, + { + "epoch": 0.7081798375916023, + "grad_norm": 1.358592866893608, + "learning_rate": 4.143134423118986e-06, + "loss": 0.2317, + "step": 8939 + }, + { + "epoch": 0.7082590612002376, + "grad_norm": 1.3878803532731059, + "learning_rate": 4.14105475067346e-06, + "loss": 0.2804, + "step": 8940 + }, + { + "epoch": 0.7083382848088731, + "grad_norm": 1.5010143660187203, + "learning_rate": 4.138975464011284e-06, + "loss": 0.2855, + "step": 8941 + }, + { + "epoch": 0.7084175084175084, + "grad_norm": 1.4822281166949847, + "learning_rate": 4.136896563269375e-06, + "loss": 0.2604, + "step": 8942 + }, + { + "epoch": 0.7084967320261438, + "grad_norm": 1.7823968046660603, + "learning_rate": 4.1348180485846145e-06, + "loss": 0.2746, + "step": 8943 + }, + { + "epoch": 0.7085759556347792, + "grad_norm": 1.6793395161121027, + "learning_rate": 4.1327399200938625e-06, + "loss": 0.355, + "step": 8944 + }, + { + "epoch": 0.7086551792434146, + "grad_norm": 1.1469810030817083, + "learning_rate": 4.1306621779339585e-06, + "loss": 0.1675, + "step": 8945 + }, + { + "epoch": 0.7087344028520499, + "grad_norm": 1.7302084421352244, + "learning_rate": 4.128584822241708e-06, + "loss": 0.2946, + "step": 8946 + }, + { + "epoch": 0.7088136264606852, + "grad_norm": 1.2352175931965101, + "learning_rate": 4.126507853153891e-06, + "loss": 0.1795, + "step": 8947 + }, + { + "epoch": 0.7088928500693207, + "grad_norm": 1.1199262750356624, + "learning_rate": 4.124431270807277e-06, + "loss": 0.1704, + "step": 8948 + }, + { + "epoch": 0.708972073677956, + "grad_norm": 1.6278931663285423, + "learning_rate": 4.12235507533859e-06, + "loss": 0.2821, + "step": 8949 + }, + { + "epoch": 0.7090512972865914, + "grad_norm": 1.5660534492520206, + "learning_rate": 4.120279266884537e-06, + "loss": 0.2983, + "step": 8950 + }, + { + "epoch": 0.7091305208952268, + "grad_norm": 1.3064092237363494, + "learning_rate": 4.118203845581807e-06, + "loss": 0.2881, + "step": 8951 + }, + { + "epoch": 0.7092097445038622, + "grad_norm": 1.5817005002859825, + "learning_rate": 4.11612881156705e-06, + "loss": 0.2902, + "step": 8952 + }, + { + "epoch": 0.7092889681124975, + "grad_norm": 1.4482026214695563, + "learning_rate": 4.114054164976902e-06, + "loss": 0.2221, + "step": 8953 + }, + { + "epoch": 0.7093681917211329, + "grad_norm": 1.1715292738475596, + "learning_rate": 4.111979905947961e-06, + "loss": 0.2134, + "step": 8954 + }, + { + "epoch": 0.7094474153297683, + "grad_norm": 1.007314089879112, + "learning_rate": 4.109906034616816e-06, + "loss": 0.1743, + "step": 8955 + }, + { + "epoch": 0.7095266389384036, + "grad_norm": 1.330620689314987, + "learning_rate": 4.107832551120017e-06, + "loss": 0.2446, + "step": 8956 + }, + { + "epoch": 0.7096058625470391, + "grad_norm": 1.1128299120690779, + "learning_rate": 4.105759455594091e-06, + "loss": 0.1884, + "step": 8957 + }, + { + "epoch": 0.7096850861556744, + "grad_norm": 1.2270307392421749, + "learning_rate": 4.103686748175545e-06, + "loss": 0.2158, + "step": 8958 + }, + { + "epoch": 0.7097643097643098, + "grad_norm": 1.2344572270629277, + "learning_rate": 4.101614429000857e-06, + "loss": 0.199, + "step": 8959 + }, + { + "epoch": 0.7098435333729451, + "grad_norm": 1.374465883047471, + "learning_rate": 4.099542498206473e-06, + "loss": 0.2417, + "step": 8960 + }, + { + "epoch": 0.7099227569815805, + "grad_norm": 1.4821912590017494, + "learning_rate": 4.0974709559288275e-06, + "loss": 0.288, + "step": 8961 + }, + { + "epoch": 0.7100019805902159, + "grad_norm": 1.2705373231103056, + "learning_rate": 4.095399802304319e-06, + "loss": 0.1886, + "step": 8962 + }, + { + "epoch": 0.7100812041988512, + "grad_norm": 1.8521915691468798, + "learning_rate": 4.093329037469319e-06, + "loss": 0.3131, + "step": 8963 + }, + { + "epoch": 0.7101604278074867, + "grad_norm": 1.464937948944682, + "learning_rate": 4.091258661560184e-06, + "loss": 0.2031, + "step": 8964 + }, + { + "epoch": 0.710239651416122, + "grad_norm": 1.3769760266370796, + "learning_rate": 4.0891886747132356e-06, + "loss": 0.2451, + "step": 8965 + }, + { + "epoch": 0.7103188750247573, + "grad_norm": 1.2849825892640707, + "learning_rate": 4.087119077064772e-06, + "loss": 0.1866, + "step": 8966 + }, + { + "epoch": 0.7103980986333928, + "grad_norm": 1.1656718412010967, + "learning_rate": 4.085049868751062e-06, + "loss": 0.2546, + "step": 8967 + }, + { + "epoch": 0.7104773222420281, + "grad_norm": 1.2174788292014693, + "learning_rate": 4.082981049908362e-06, + "loss": 0.1285, + "step": 8968 + }, + { + "epoch": 0.7105565458506635, + "grad_norm": 1.5469373528723134, + "learning_rate": 4.080912620672888e-06, + "loss": 0.2923, + "step": 8969 + }, + { + "epoch": 0.7106357694592988, + "grad_norm": 1.1328493991348587, + "learning_rate": 4.078844581180833e-06, + "loss": 0.2217, + "step": 8970 + }, + { + "epoch": 0.7107149930679343, + "grad_norm": 1.7402433485228144, + "learning_rate": 4.076776931568376e-06, + "loss": 0.3558, + "step": 8971 + }, + { + "epoch": 0.7107942166765696, + "grad_norm": 1.4668948850751835, + "learning_rate": 4.074709671971657e-06, + "loss": 0.1751, + "step": 8972 + }, + { + "epoch": 0.7108734402852049, + "grad_norm": 1.2441211406329185, + "learning_rate": 4.0726428025267925e-06, + "loss": 0.2068, + "step": 8973 + }, + { + "epoch": 0.7109526638938404, + "grad_norm": 1.7137251639933813, + "learning_rate": 4.070576323369882e-06, + "loss": 0.3064, + "step": 8974 + }, + { + "epoch": 0.7110318875024757, + "grad_norm": 2.1190465614055443, + "learning_rate": 4.06851023463699e-06, + "loss": 0.3796, + "step": 8975 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 1.3114276834040828, + "learning_rate": 4.066444536464155e-06, + "loss": 0.2966, + "step": 8976 + }, + { + "epoch": 0.7111903347197465, + "grad_norm": 1.5385039036910069, + "learning_rate": 4.0643792289874e-06, + "loss": 0.2879, + "step": 8977 + }, + { + "epoch": 0.7112695583283819, + "grad_norm": 1.5517305636079035, + "learning_rate": 4.062314312342712e-06, + "loss": 0.2147, + "step": 8978 + }, + { + "epoch": 0.7113487819370172, + "grad_norm": 1.5057773684239024, + "learning_rate": 4.060249786666054e-06, + "loss": 0.2703, + "step": 8979 + }, + { + "epoch": 0.7114280055456526, + "grad_norm": 1.3201734721898857, + "learning_rate": 4.0581856520933706e-06, + "loss": 0.3002, + "step": 8980 + }, + { + "epoch": 0.711507229154288, + "grad_norm": 1.4272849989351317, + "learning_rate": 4.056121908760571e-06, + "loss": 0.2749, + "step": 8981 + }, + { + "epoch": 0.7115864527629233, + "grad_norm": 1.5035970356560153, + "learning_rate": 4.054058556803544e-06, + "loss": 0.256, + "step": 8982 + }, + { + "epoch": 0.7116656763715588, + "grad_norm": 1.287171503303611, + "learning_rate": 4.051995596358147e-06, + "loss": 0.2919, + "step": 8983 + }, + { + "epoch": 0.7117448999801941, + "grad_norm": 2.5897559008098447, + "learning_rate": 4.049933027560225e-06, + "loss": 0.3718, + "step": 8984 + }, + { + "epoch": 0.7118241235888295, + "grad_norm": 1.5390994862845784, + "learning_rate": 4.047870850545581e-06, + "loss": 0.3018, + "step": 8985 + }, + { + "epoch": 0.7119033471974648, + "grad_norm": 1.1898032086420869, + "learning_rate": 4.045809065449999e-06, + "loss": 0.1878, + "step": 8986 + }, + { + "epoch": 0.7119825708061002, + "grad_norm": 1.328411055156878, + "learning_rate": 4.043747672409245e-06, + "loss": 0.2012, + "step": 8987 + }, + { + "epoch": 0.7120617944147356, + "grad_norm": 1.4234533009263486, + "learning_rate": 4.041686671559046e-06, + "loss": 0.2763, + "step": 8988 + }, + { + "epoch": 0.7121410180233709, + "grad_norm": 1.4652878913669343, + "learning_rate": 4.039626063035107e-06, + "loss": 0.274, + "step": 8989 + }, + { + "epoch": 0.7122202416320064, + "grad_norm": 1.263822406020965, + "learning_rate": 4.0375658469731164e-06, + "loss": 0.2465, + "step": 8990 + }, + { + "epoch": 0.7122994652406417, + "grad_norm": 1.20848533173463, + "learning_rate": 4.035506023508724e-06, + "loss": 0.2154, + "step": 8991 + }, + { + "epoch": 0.7123786888492771, + "grad_norm": 1.2170323054802772, + "learning_rate": 4.033446592777558e-06, + "loss": 0.1773, + "step": 8992 + }, + { + "epoch": 0.7124579124579125, + "grad_norm": 1.0294314339054975, + "learning_rate": 4.031387554915228e-06, + "loss": 0.1985, + "step": 8993 + }, + { + "epoch": 0.7125371360665478, + "grad_norm": 1.193940916802525, + "learning_rate": 4.029328910057308e-06, + "loss": 0.2637, + "step": 8994 + }, + { + "epoch": 0.7126163596751832, + "grad_norm": 1.5566493177144933, + "learning_rate": 4.027270658339347e-06, + "loss": 0.1989, + "step": 8995 + }, + { + "epoch": 0.7126955832838185, + "grad_norm": 1.5295165194469522, + "learning_rate": 4.025212799896881e-06, + "loss": 0.2743, + "step": 8996 + }, + { + "epoch": 0.712774806892454, + "grad_norm": 1.4613446268286052, + "learning_rate": 4.023155334865401e-06, + "loss": 0.2218, + "step": 8997 + }, + { + "epoch": 0.7128540305010893, + "grad_norm": 1.2398734197867007, + "learning_rate": 4.0210982633803784e-06, + "loss": 0.2364, + "step": 8998 + }, + { + "epoch": 0.7129332541097247, + "grad_norm": 1.6925614390069985, + "learning_rate": 4.01904158557727e-06, + "loss": 0.2184, + "step": 8999 + }, + { + "epoch": 0.7130124777183601, + "grad_norm": 1.4202977316688459, + "learning_rate": 4.016985301591496e-06, + "loss": 0.2616, + "step": 9000 + }, + { + "epoch": 0.7130917013269954, + "grad_norm": 1.2506375432868608, + "learning_rate": 4.014929411558447e-06, + "loss": 0.2094, + "step": 9001 + }, + { + "epoch": 0.7131709249356308, + "grad_norm": 1.3054118280473486, + "learning_rate": 4.012873915613501e-06, + "loss": 0.176, + "step": 9002 + }, + { + "epoch": 0.7132501485442662, + "grad_norm": 1.444516804791575, + "learning_rate": 4.010818813892e-06, + "loss": 0.2752, + "step": 9003 + }, + { + "epoch": 0.7133293721529016, + "grad_norm": 1.5373647334029328, + "learning_rate": 4.008764106529259e-06, + "loss": 0.2647, + "step": 9004 + }, + { + "epoch": 0.7134085957615369, + "grad_norm": 1.6671742934707288, + "learning_rate": 4.006709793660577e-06, + "loss": 0.2392, + "step": 9005 + }, + { + "epoch": 0.7134878193701724, + "grad_norm": 1.2668977071434002, + "learning_rate": 4.004655875421217e-06, + "loss": 0.2297, + "step": 9006 + }, + { + "epoch": 0.7135670429788077, + "grad_norm": 1.5688026064058769, + "learning_rate": 4.00260235194642e-06, + "loss": 0.2741, + "step": 9007 + }, + { + "epoch": 0.713646266587443, + "grad_norm": 1.3969267008213884, + "learning_rate": 4.0005492233713964e-06, + "loss": 0.3198, + "step": 9008 + }, + { + "epoch": 0.7137254901960784, + "grad_norm": 1.417873309013034, + "learning_rate": 3.998496489831343e-06, + "loss": 0.2897, + "step": 9009 + }, + { + "epoch": 0.7138047138047138, + "grad_norm": 1.3945581015026076, + "learning_rate": 3.996444151461417e-06, + "loss": 0.2293, + "step": 9010 + }, + { + "epoch": 0.7138839374133492, + "grad_norm": 1.3622241021418544, + "learning_rate": 3.994392208396754e-06, + "loss": 0.2906, + "step": 9011 + }, + { + "epoch": 0.7139631610219845, + "grad_norm": 1.3626724666421084, + "learning_rate": 3.992340660772472e-06, + "loss": 0.1835, + "step": 9012 + }, + { + "epoch": 0.71404238463062, + "grad_norm": 1.50551276746951, + "learning_rate": 3.990289508723648e-06, + "loss": 0.2766, + "step": 9013 + }, + { + "epoch": 0.7141216082392553, + "grad_norm": 0.9556086942653758, + "learning_rate": 3.988238752385341e-06, + "loss": 0.1384, + "step": 9014 + }, + { + "epoch": 0.7142008318478906, + "grad_norm": 2.0880808386667127, + "learning_rate": 3.986188391892587e-06, + "loss": 0.4237, + "step": 9015 + }, + { + "epoch": 0.7142800554565261, + "grad_norm": 1.7198087895852605, + "learning_rate": 3.984138427380393e-06, + "loss": 0.2723, + "step": 9016 + }, + { + "epoch": 0.7143592790651614, + "grad_norm": 1.4521358032938667, + "learning_rate": 3.982088858983733e-06, + "loss": 0.2654, + "step": 9017 + }, + { + "epoch": 0.7144385026737968, + "grad_norm": 1.3883799319391696, + "learning_rate": 3.9800396868375675e-06, + "loss": 0.2716, + "step": 9018 + }, + { + "epoch": 0.7145177262824322, + "grad_norm": 1.0780511348095898, + "learning_rate": 3.977990911076823e-06, + "loss": 0.2129, + "step": 9019 + }, + { + "epoch": 0.7145969498910676, + "grad_norm": 1.5947985654274843, + "learning_rate": 3.975942531836397e-06, + "loss": 0.2882, + "step": 9020 + }, + { + "epoch": 0.7146761734997029, + "grad_norm": 1.95268466018519, + "learning_rate": 3.973894549251175e-06, + "loss": 0.2846, + "step": 9021 + }, + { + "epoch": 0.7147553971083382, + "grad_norm": 1.3787962054997573, + "learning_rate": 3.971846963455999e-06, + "loss": 0.2275, + "step": 9022 + }, + { + "epoch": 0.7148346207169737, + "grad_norm": 1.1991493716763608, + "learning_rate": 3.969799774585696e-06, + "loss": 0.1521, + "step": 9023 + }, + { + "epoch": 0.714913844325609, + "grad_norm": 1.289898908853806, + "learning_rate": 3.967752982775058e-06, + "loss": 0.2519, + "step": 9024 + }, + { + "epoch": 0.7149930679342444, + "grad_norm": 1.544903453103133, + "learning_rate": 3.965706588158865e-06, + "loss": 0.2991, + "step": 9025 + }, + { + "epoch": 0.7150722915428798, + "grad_norm": 1.294177410951474, + "learning_rate": 3.963660590871858e-06, + "loss": 0.2407, + "step": 9026 + }, + { + "epoch": 0.7151515151515152, + "grad_norm": 1.1544579395495873, + "learning_rate": 3.961614991048752e-06, + "loss": 0.2209, + "step": 9027 + }, + { + "epoch": 0.7152307387601505, + "grad_norm": 1.382340644960581, + "learning_rate": 3.959569788824248e-06, + "loss": 0.2418, + "step": 9028 + }, + { + "epoch": 0.7153099623687859, + "grad_norm": 1.2359153387844068, + "learning_rate": 3.957524984333009e-06, + "loss": 0.1819, + "step": 9029 + }, + { + "epoch": 0.7153891859774213, + "grad_norm": 1.690780457319807, + "learning_rate": 3.955480577709672e-06, + "loss": 0.2685, + "step": 9030 + }, + { + "epoch": 0.7154684095860566, + "grad_norm": 1.6204886194279187, + "learning_rate": 3.953436569088856e-06, + "loss": 0.2346, + "step": 9031 + }, + { + "epoch": 0.7155476331946921, + "grad_norm": 1.2425747479937628, + "learning_rate": 3.951392958605149e-06, + "loss": 0.2361, + "step": 9032 + }, + { + "epoch": 0.7156268568033274, + "grad_norm": 1.4058289208351415, + "learning_rate": 3.949349746393108e-06, + "loss": 0.2157, + "step": 9033 + }, + { + "epoch": 0.7157060804119628, + "grad_norm": 1.3169195354893017, + "learning_rate": 3.947306932587277e-06, + "loss": 0.2385, + "step": 9034 + }, + { + "epoch": 0.7157853040205981, + "grad_norm": 1.0951620641319275, + "learning_rate": 3.945264517322159e-06, + "loss": 0.2082, + "step": 9035 + }, + { + "epoch": 0.7158645276292335, + "grad_norm": 1.197357149111598, + "learning_rate": 3.943222500732241e-06, + "loss": 0.2261, + "step": 9036 + }, + { + "epoch": 0.7159437512378689, + "grad_norm": 1.3422019346954976, + "learning_rate": 3.941180882951972e-06, + "loss": 0.2534, + "step": 9037 + }, + { + "epoch": 0.7160229748465042, + "grad_norm": 1.3501328623608448, + "learning_rate": 3.9391396641157945e-06, + "loss": 0.2615, + "step": 9038 + }, + { + "epoch": 0.7161021984551397, + "grad_norm": 0.9942498582722339, + "learning_rate": 3.937098844358106e-06, + "loss": 0.143, + "step": 9039 + }, + { + "epoch": 0.716181422063775, + "grad_norm": 1.4172362741288296, + "learning_rate": 3.935058423813282e-06, + "loss": 0.3341, + "step": 9040 + }, + { + "epoch": 0.7162606456724103, + "grad_norm": 1.2363248904148585, + "learning_rate": 3.933018402615683e-06, + "loss": 0.1997, + "step": 9041 + }, + { + "epoch": 0.7163398692810458, + "grad_norm": 1.3422468309943558, + "learning_rate": 3.9309787808996284e-06, + "loss": 0.2196, + "step": 9042 + }, + { + "epoch": 0.7164190928896811, + "grad_norm": 0.9774040519574582, + "learning_rate": 3.928939558799415e-06, + "loss": 0.1153, + "step": 9043 + }, + { + "epoch": 0.7164983164983165, + "grad_norm": 1.0863916992133649, + "learning_rate": 3.926900736449324e-06, + "loss": 0.2064, + "step": 9044 + }, + { + "epoch": 0.7165775401069518, + "grad_norm": 1.0002593720277957, + "learning_rate": 3.924862313983597e-06, + "loss": 0.1596, + "step": 9045 + }, + { + "epoch": 0.7166567637155873, + "grad_norm": 1.3932472778366485, + "learning_rate": 3.922824291536452e-06, + "loss": 0.2481, + "step": 9046 + }, + { + "epoch": 0.7167359873242226, + "grad_norm": 0.9962748413897758, + "learning_rate": 3.920786669242089e-06, + "loss": 0.1278, + "step": 9047 + }, + { + "epoch": 0.7168152109328579, + "grad_norm": 1.6232137006139786, + "learning_rate": 3.918749447234674e-06, + "loss": 0.2955, + "step": 9048 + }, + { + "epoch": 0.7168944345414934, + "grad_norm": 1.5235827747133797, + "learning_rate": 3.9167126256483415e-06, + "loss": 0.2681, + "step": 9049 + }, + { + "epoch": 0.7169736581501287, + "grad_norm": 1.3367344251523108, + "learning_rate": 3.914676204617216e-06, + "loss": 0.2435, + "step": 9050 + }, + { + "epoch": 0.7170528817587641, + "grad_norm": 1.3680093831329665, + "learning_rate": 3.912640184275381e-06, + "loss": 0.246, + "step": 9051 + }, + { + "epoch": 0.7171321053673995, + "grad_norm": 1.3377380656431945, + "learning_rate": 3.9106045647569005e-06, + "loss": 0.2599, + "step": 9052 + }, + { + "epoch": 0.7172113289760349, + "grad_norm": 1.409688831657273, + "learning_rate": 3.908569346195804e-06, + "loss": 0.2097, + "step": 9053 + }, + { + "epoch": 0.7172905525846702, + "grad_norm": 1.3583176684963718, + "learning_rate": 3.90653452872611e-06, + "loss": 0.2054, + "step": 9054 + }, + { + "epoch": 0.7173697761933056, + "grad_norm": 1.7883433832273263, + "learning_rate": 3.904500112481798e-06, + "loss": 0.2189, + "step": 9055 + }, + { + "epoch": 0.717448999801941, + "grad_norm": 1.232030654344082, + "learning_rate": 3.902466097596821e-06, + "loss": 0.2295, + "step": 9056 + }, + { + "epoch": 0.7175282234105763, + "grad_norm": 1.4565580123232083, + "learning_rate": 3.900432484205115e-06, + "loss": 0.2396, + "step": 9057 + }, + { + "epoch": 0.7176074470192118, + "grad_norm": 1.414573804697823, + "learning_rate": 3.89839927244058e-06, + "loss": 0.273, + "step": 9058 + }, + { + "epoch": 0.7176866706278471, + "grad_norm": 1.0414269176818227, + "learning_rate": 3.89636646243709e-06, + "loss": 0.1537, + "step": 9059 + }, + { + "epoch": 0.7177658942364825, + "grad_norm": 1.2462945110856305, + "learning_rate": 3.894334054328505e-06, + "loss": 0.2506, + "step": 9060 + }, + { + "epoch": 0.7178451178451178, + "grad_norm": 1.1804790382461414, + "learning_rate": 3.892302048248642e-06, + "loss": 0.2585, + "step": 9061 + }, + { + "epoch": 0.7179243414537532, + "grad_norm": 1.4667374757539555, + "learning_rate": 3.890270444331298e-06, + "loss": 0.2645, + "step": 9062 + }, + { + "epoch": 0.7180035650623886, + "grad_norm": 0.9600023172585878, + "learning_rate": 3.888239242710251e-06, + "loss": 0.1723, + "step": 9063 + }, + { + "epoch": 0.7180827886710239, + "grad_norm": 1.01333676930565, + "learning_rate": 3.886208443519242e-06, + "loss": 0.195, + "step": 9064 + }, + { + "epoch": 0.7181620122796594, + "grad_norm": 1.3008146269289977, + "learning_rate": 3.884178046891984e-06, + "loss": 0.2526, + "step": 9065 + }, + { + "epoch": 0.7182412358882947, + "grad_norm": 1.6311986018573938, + "learning_rate": 3.88214805296218e-06, + "loss": 0.2837, + "step": 9066 + }, + { + "epoch": 0.7183204594969301, + "grad_norm": 1.6269981796246011, + "learning_rate": 3.880118461863488e-06, + "loss": 0.2876, + "step": 9067 + }, + { + "epoch": 0.7183996831055655, + "grad_norm": 1.11590203203802, + "learning_rate": 3.878089273729549e-06, + "loss": 0.1671, + "step": 9068 + }, + { + "epoch": 0.7184789067142008, + "grad_norm": 1.5151874495742101, + "learning_rate": 3.876060488693971e-06, + "loss": 0.2842, + "step": 9069 + }, + { + "epoch": 0.7185581303228362, + "grad_norm": 1.3642308576024993, + "learning_rate": 3.874032106890347e-06, + "loss": 0.3031, + "step": 9070 + }, + { + "epoch": 0.7186373539314715, + "grad_norm": 1.2313416433394608, + "learning_rate": 3.872004128452231e-06, + "loss": 0.1562, + "step": 9071 + }, + { + "epoch": 0.718716577540107, + "grad_norm": 1.230306013720861, + "learning_rate": 3.8699765535131565e-06, + "loss": 0.2573, + "step": 9072 + }, + { + "epoch": 0.7187958011487423, + "grad_norm": 1.1694303246651188, + "learning_rate": 3.867949382206632e-06, + "loss": 0.254, + "step": 9073 + }, + { + "epoch": 0.7188750247573777, + "grad_norm": 1.3620704442743587, + "learning_rate": 3.8659226146661344e-06, + "loss": 0.1971, + "step": 9074 + }, + { + "epoch": 0.7189542483660131, + "grad_norm": 1.5346681498097818, + "learning_rate": 3.8638962510251175e-06, + "loss": 0.2883, + "step": 9075 + }, + { + "epoch": 0.7190334719746484, + "grad_norm": 1.4799091518088818, + "learning_rate": 3.861870291417008e-06, + "loss": 0.2909, + "step": 9076 + }, + { + "epoch": 0.7191126955832838, + "grad_norm": 1.2910406950038384, + "learning_rate": 3.859844735975205e-06, + "loss": 0.2187, + "step": 9077 + }, + { + "epoch": 0.7191919191919192, + "grad_norm": 1.2724622355374415, + "learning_rate": 3.857819584833078e-06, + "loss": 0.1962, + "step": 9078 + }, + { + "epoch": 0.7192711428005546, + "grad_norm": 1.3141976310423746, + "learning_rate": 3.855794838123981e-06, + "loss": 0.2358, + "step": 9079 + }, + { + "epoch": 0.7193503664091899, + "grad_norm": 1.2624793606379443, + "learning_rate": 3.85377049598123e-06, + "loss": 0.1814, + "step": 9080 + }, + { + "epoch": 0.7194295900178254, + "grad_norm": 1.1727626485389202, + "learning_rate": 3.851746558538113e-06, + "loss": 0.154, + "step": 9081 + }, + { + "epoch": 0.7195088136264607, + "grad_norm": 1.4935629384103712, + "learning_rate": 3.849723025927907e-06, + "loss": 0.2135, + "step": 9082 + }, + { + "epoch": 0.719588037235096, + "grad_norm": 1.4589405184697402, + "learning_rate": 3.847699898283846e-06, + "loss": 0.2659, + "step": 9083 + }, + { + "epoch": 0.7196672608437314, + "grad_norm": 1.548261058681011, + "learning_rate": 3.84567717573914e-06, + "loss": 0.286, + "step": 9084 + }, + { + "epoch": 0.7197464844523668, + "grad_norm": 1.704379046714596, + "learning_rate": 3.843654858426981e-06, + "loss": 0.2722, + "step": 9085 + }, + { + "epoch": 0.7198257080610022, + "grad_norm": 1.3708585904493755, + "learning_rate": 3.84163294648053e-06, + "loss": 0.2662, + "step": 9086 + }, + { + "epoch": 0.7199049316696375, + "grad_norm": 1.722818761832865, + "learning_rate": 3.839611440032912e-06, + "loss": 0.2947, + "step": 9087 + }, + { + "epoch": 0.719984155278273, + "grad_norm": 1.6219138598153262, + "learning_rate": 3.837590339217243e-06, + "loss": 0.2602, + "step": 9088 + }, + { + "epoch": 0.7200633788869083, + "grad_norm": 1.5719260667691108, + "learning_rate": 3.835569644166599e-06, + "loss": 0.2513, + "step": 9089 + }, + { + "epoch": 0.7201426024955436, + "grad_norm": 1.1547441760721175, + "learning_rate": 3.833549355014028e-06, + "loss": 0.2083, + "step": 9090 + }, + { + "epoch": 0.7202218261041791, + "grad_norm": 1.4110336340101988, + "learning_rate": 3.8315294718925656e-06, + "loss": 0.2201, + "step": 9091 + }, + { + "epoch": 0.7203010497128144, + "grad_norm": 1.598816391263384, + "learning_rate": 3.829509994935206e-06, + "loss": 0.2865, + "step": 9092 + }, + { + "epoch": 0.7203802733214498, + "grad_norm": 1.2538371785748803, + "learning_rate": 3.827490924274922e-06, + "loss": 0.2382, + "step": 9093 + }, + { + "epoch": 0.7204594969300852, + "grad_norm": 1.0873609452603825, + "learning_rate": 3.825472260044658e-06, + "loss": 0.1541, + "step": 9094 + }, + { + "epoch": 0.7205387205387206, + "grad_norm": 1.3573680158424042, + "learning_rate": 3.8234540023773385e-06, + "loss": 0.2475, + "step": 9095 + }, + { + "epoch": 0.7206179441473559, + "grad_norm": 1.4837327294446518, + "learning_rate": 3.821436151405854e-06, + "loss": 0.191, + "step": 9096 + }, + { + "epoch": 0.7206971677559912, + "grad_norm": 1.6275015565994009, + "learning_rate": 3.819418707263065e-06, + "loss": 0.3439, + "step": 9097 + }, + { + "epoch": 0.7207763913646267, + "grad_norm": 1.2794147769301913, + "learning_rate": 3.8174016700818196e-06, + "loss": 0.1915, + "step": 9098 + }, + { + "epoch": 0.720855614973262, + "grad_norm": 1.5547963555298354, + "learning_rate": 3.815385039994925e-06, + "loss": 0.2681, + "step": 9099 + }, + { + "epoch": 0.7209348385818974, + "grad_norm": 1.3879000668113997, + "learning_rate": 3.8133688171351645e-06, + "loss": 0.2446, + "step": 9100 + }, + { + "epoch": 0.7210140621905328, + "grad_norm": 1.1918122028659293, + "learning_rate": 3.811353001635302e-06, + "loss": 0.2133, + "step": 9101 + }, + { + "epoch": 0.7210932857991682, + "grad_norm": 0.9975258595949281, + "learning_rate": 3.8093375936280665e-06, + "loss": 0.1668, + "step": 9102 + }, + { + "epoch": 0.7211725094078035, + "grad_norm": 1.7044578662922938, + "learning_rate": 3.807322593246159e-06, + "loss": 0.3496, + "step": 9103 + }, + { + "epoch": 0.7212517330164389, + "grad_norm": 1.2902476579414237, + "learning_rate": 3.805308000622265e-06, + "loss": 0.204, + "step": 9104 + }, + { + "epoch": 0.7213309566250743, + "grad_norm": 1.3657153123355068, + "learning_rate": 3.8032938158890333e-06, + "loss": 0.2819, + "step": 9105 + }, + { + "epoch": 0.7214101802337096, + "grad_norm": 1.1325856774703331, + "learning_rate": 3.8012800391790814e-06, + "loss": 0.163, + "step": 9106 + }, + { + "epoch": 0.7214894038423451, + "grad_norm": 1.2128913966401265, + "learning_rate": 3.799266670625018e-06, + "loss": 0.1836, + "step": 9107 + }, + { + "epoch": 0.7215686274509804, + "grad_norm": 1.06215324418043, + "learning_rate": 3.797253710359409e-06, + "loss": 0.2255, + "step": 9108 + }, + { + "epoch": 0.7216478510596158, + "grad_norm": 1.7021288409437525, + "learning_rate": 3.7952411585147954e-06, + "loss": 0.3169, + "step": 9109 + }, + { + "epoch": 0.7217270746682511, + "grad_norm": 1.3048481119217095, + "learning_rate": 3.793229015223694e-06, + "loss": 0.2835, + "step": 9110 + }, + { + "epoch": 0.7218062982768865, + "grad_norm": 1.155927614584592, + "learning_rate": 3.7912172806186e-06, + "loss": 0.2117, + "step": 9111 + }, + { + "epoch": 0.7218855218855219, + "grad_norm": 2.1252991957176386, + "learning_rate": 3.7892059548319726e-06, + "loss": 0.1649, + "step": 9112 + }, + { + "epoch": 0.7219647454941572, + "grad_norm": 1.4229190429994243, + "learning_rate": 3.7871950379962463e-06, + "loss": 0.2586, + "step": 9113 + }, + { + "epoch": 0.7220439691027927, + "grad_norm": 1.610405328284298, + "learning_rate": 3.785184530243835e-06, + "loss": 0.3102, + "step": 9114 + }, + { + "epoch": 0.722123192711428, + "grad_norm": 1.4951300508021137, + "learning_rate": 3.7831744317071194e-06, + "loss": 0.2944, + "step": 9115 + }, + { + "epoch": 0.7222024163200634, + "grad_norm": 1.3742903698509503, + "learning_rate": 3.7811647425184508e-06, + "loss": 0.2977, + "step": 9116 + }, + { + "epoch": 0.7222816399286988, + "grad_norm": 1.1168992732742293, + "learning_rate": 3.7791554628101635e-06, + "loss": 0.2328, + "step": 9117 + }, + { + "epoch": 0.7223608635373341, + "grad_norm": 1.2595954596448162, + "learning_rate": 3.777146592714557e-06, + "loss": 0.1812, + "step": 9118 + }, + { + "epoch": 0.7224400871459695, + "grad_norm": 1.4626741318563155, + "learning_rate": 3.7751381323639e-06, + "loss": 0.2663, + "step": 9119 + }, + { + "epoch": 0.7225193107546048, + "grad_norm": 1.3626353026404632, + "learning_rate": 3.7731300818904494e-06, + "loss": 0.2432, + "step": 9120 + }, + { + "epoch": 0.7225985343632403, + "grad_norm": 1.2176623681197087, + "learning_rate": 3.7711224414264216e-06, + "loss": 0.2071, + "step": 9121 + }, + { + "epoch": 0.7226777579718756, + "grad_norm": 1.386022587529955, + "learning_rate": 3.7691152111040087e-06, + "loss": 0.2965, + "step": 9122 + }, + { + "epoch": 0.7227569815805109, + "grad_norm": 1.3087252713515691, + "learning_rate": 3.767108391055374e-06, + "loss": 0.2382, + "step": 9123 + }, + { + "epoch": 0.7228362051891464, + "grad_norm": 1.2507412328548588, + "learning_rate": 3.7651019814126656e-06, + "loss": 0.2175, + "step": 9124 + }, + { + "epoch": 0.7229154287977817, + "grad_norm": 1.496495849835261, + "learning_rate": 3.7630959823079914e-06, + "loss": 0.2511, + "step": 9125 + }, + { + "epoch": 0.7229946524064171, + "grad_norm": 1.0128113093324684, + "learning_rate": 3.761090393873432e-06, + "loss": 0.1491, + "step": 9126 + }, + { + "epoch": 0.7230738760150525, + "grad_norm": 1.10580729200062, + "learning_rate": 3.7590852162410553e-06, + "loss": 0.1461, + "step": 9127 + }, + { + "epoch": 0.7231530996236879, + "grad_norm": 1.1735246489626505, + "learning_rate": 3.757080449542887e-06, + "loss": 0.2685, + "step": 9128 + }, + { + "epoch": 0.7232323232323232, + "grad_norm": 1.5722172581561735, + "learning_rate": 3.7550760939109287e-06, + "loss": 0.3031, + "step": 9129 + }, + { + "epoch": 0.7233115468409586, + "grad_norm": 1.326354747879238, + "learning_rate": 3.7530721494771648e-06, + "loss": 0.2596, + "step": 9130 + }, + { + "epoch": 0.723390770449594, + "grad_norm": 1.4097677300592528, + "learning_rate": 3.751068616373541e-06, + "loss": 0.2374, + "step": 9131 + }, + { + "epoch": 0.7234699940582293, + "grad_norm": 1.2765079594111168, + "learning_rate": 3.749065494731978e-06, + "loss": 0.1791, + "step": 9132 + }, + { + "epoch": 0.7235492176668648, + "grad_norm": 1.2524215207437652, + "learning_rate": 3.747062784684378e-06, + "loss": 0.2361, + "step": 9133 + }, + { + "epoch": 0.7236284412755001, + "grad_norm": 1.496733449333491, + "learning_rate": 3.7450604863626063e-06, + "loss": 0.2117, + "step": 9134 + }, + { + "epoch": 0.7237076648841355, + "grad_norm": 1.5781423855302263, + "learning_rate": 3.7430585998985004e-06, + "loss": 0.3049, + "step": 9135 + }, + { + "epoch": 0.7237868884927708, + "grad_norm": 1.4988720101411532, + "learning_rate": 3.7410571254238835e-06, + "loss": 0.2465, + "step": 9136 + }, + { + "epoch": 0.7238661121014062, + "grad_norm": 1.1902463792591904, + "learning_rate": 3.7390560630705387e-06, + "loss": 0.2148, + "step": 9137 + }, + { + "epoch": 0.7239453357100416, + "grad_norm": 1.1679536509076427, + "learning_rate": 3.7370554129702265e-06, + "loss": 0.1793, + "step": 9138 + }, + { + "epoch": 0.7240245593186769, + "grad_norm": 1.372903686651834, + "learning_rate": 3.735055175254676e-06, + "loss": 0.2671, + "step": 9139 + }, + { + "epoch": 0.7241037829273124, + "grad_norm": 1.2645719699218168, + "learning_rate": 3.733055350055601e-06, + "loss": 0.2085, + "step": 9140 + }, + { + "epoch": 0.7241830065359477, + "grad_norm": 1.2307984472419495, + "learning_rate": 3.7310559375046774e-06, + "loss": 0.2338, + "step": 9141 + }, + { + "epoch": 0.7242622301445831, + "grad_norm": 0.9072967688136768, + "learning_rate": 3.7290569377335517e-06, + "loss": 0.1457, + "step": 9142 + }, + { + "epoch": 0.7243414537532185, + "grad_norm": 1.312261910175203, + "learning_rate": 3.7270583508738565e-06, + "loss": 0.2024, + "step": 9143 + }, + { + "epoch": 0.7244206773618538, + "grad_norm": 1.4524428509186667, + "learning_rate": 3.725060177057185e-06, + "loss": 0.3443, + "step": 9144 + }, + { + "epoch": 0.7244999009704892, + "grad_norm": 1.561340853007366, + "learning_rate": 3.723062416415105e-06, + "loss": 0.266, + "step": 9145 + }, + { + "epoch": 0.7245791245791245, + "grad_norm": 1.169470448614544, + "learning_rate": 3.721065069079165e-06, + "loss": 0.1993, + "step": 9146 + }, + { + "epoch": 0.72465834818776, + "grad_norm": 1.3998710336942795, + "learning_rate": 3.7190681351808778e-06, + "loss": 0.2344, + "step": 9147 + }, + { + "epoch": 0.7247375717963953, + "grad_norm": 1.8655369572716267, + "learning_rate": 3.7170716148517294e-06, + "loss": 0.279, + "step": 9148 + }, + { + "epoch": 0.7248167954050307, + "grad_norm": 1.1494656495790188, + "learning_rate": 3.715075508223187e-06, + "loss": 0.1921, + "step": 9149 + }, + { + "epoch": 0.7248960190136661, + "grad_norm": 1.2025807824181314, + "learning_rate": 3.71307981542668e-06, + "loss": 0.2421, + "step": 9150 + }, + { + "epoch": 0.7249752426223014, + "grad_norm": 1.497224958482197, + "learning_rate": 3.7110845365936144e-06, + "loss": 0.1951, + "step": 9151 + }, + { + "epoch": 0.7250544662309368, + "grad_norm": 1.2831204237393163, + "learning_rate": 3.709089671855378e-06, + "loss": 0.2439, + "step": 9152 + }, + { + "epoch": 0.7251336898395722, + "grad_norm": 1.1272935862878195, + "learning_rate": 3.707095221343313e-06, + "loss": 0.1831, + "step": 9153 + }, + { + "epoch": 0.7252129134482076, + "grad_norm": 1.3080852736183557, + "learning_rate": 3.7051011851887455e-06, + "loss": 0.2622, + "step": 9154 + }, + { + "epoch": 0.7252921370568429, + "grad_norm": 1.4049070830851684, + "learning_rate": 3.7031075635229787e-06, + "loss": 0.2894, + "step": 9155 + }, + { + "epoch": 0.7253713606654784, + "grad_norm": 1.3485045411849925, + "learning_rate": 3.70111435647728e-06, + "loss": 0.18, + "step": 9156 + }, + { + "epoch": 0.7254505842741137, + "grad_norm": 1.1033833265399628, + "learning_rate": 3.6991215641828903e-06, + "loss": 0.2039, + "step": 9157 + }, + { + "epoch": 0.725529807882749, + "grad_norm": 1.8296198157336274, + "learning_rate": 3.6971291867710303e-06, + "loss": 0.3798, + "step": 9158 + }, + { + "epoch": 0.7256090314913844, + "grad_norm": 1.062518835043534, + "learning_rate": 3.6951372243728854e-06, + "loss": 0.1571, + "step": 9159 + }, + { + "epoch": 0.7256882551000198, + "grad_norm": 1.48299692990113, + "learning_rate": 3.693145677119615e-06, + "loss": 0.2854, + "step": 9160 + }, + { + "epoch": 0.7257674787086552, + "grad_norm": 1.6220806568353399, + "learning_rate": 3.691154545142357e-06, + "loss": 0.3071, + "step": 9161 + }, + { + "epoch": 0.7258467023172905, + "grad_norm": 1.5203522106123646, + "learning_rate": 3.6891638285722176e-06, + "loss": 0.227, + "step": 9162 + }, + { + "epoch": 0.725925925925926, + "grad_norm": 1.328898106537588, + "learning_rate": 3.687173527540273e-06, + "loss": 0.279, + "step": 9163 + }, + { + "epoch": 0.7260051495345613, + "grad_norm": 1.7248760988692442, + "learning_rate": 3.6851836421775733e-06, + "loss": 0.3089, + "step": 9164 + }, + { + "epoch": 0.7260843731431966, + "grad_norm": 1.310575111967257, + "learning_rate": 3.683194172615149e-06, + "loss": 0.249, + "step": 9165 + }, + { + "epoch": 0.7261635967518321, + "grad_norm": 1.2477260115881732, + "learning_rate": 3.681205118983995e-06, + "loss": 0.2313, + "step": 9166 + }, + { + "epoch": 0.7262428203604674, + "grad_norm": 1.213549402713103, + "learning_rate": 3.6792164814150756e-06, + "loss": 0.1836, + "step": 9167 + }, + { + "epoch": 0.7263220439691028, + "grad_norm": 1.1400004162296868, + "learning_rate": 3.6772282600393393e-06, + "loss": 0.1793, + "step": 9168 + }, + { + "epoch": 0.7264012675777382, + "grad_norm": 1.2709422135491037, + "learning_rate": 3.675240454987701e-06, + "loss": 0.1752, + "step": 9169 + }, + { + "epoch": 0.7264804911863736, + "grad_norm": 1.4090124291424384, + "learning_rate": 3.6732530663910415e-06, + "loss": 0.1986, + "step": 9170 + }, + { + "epoch": 0.7265597147950089, + "grad_norm": 1.1188457934832954, + "learning_rate": 3.6712660943802292e-06, + "loss": 0.1625, + "step": 9171 + }, + { + "epoch": 0.7266389384036442, + "grad_norm": 1.316698601066028, + "learning_rate": 3.6692795390860913e-06, + "loss": 0.2277, + "step": 9172 + }, + { + "epoch": 0.7267181620122797, + "grad_norm": 1.3798962236311325, + "learning_rate": 3.667293400639432e-06, + "loss": 0.2361, + "step": 9173 + }, + { + "epoch": 0.726797385620915, + "grad_norm": 1.1470900392120604, + "learning_rate": 3.665307679171034e-06, + "loss": 0.1955, + "step": 9174 + }, + { + "epoch": 0.7268766092295504, + "grad_norm": 1.1680899340665103, + "learning_rate": 3.6633223748116454e-06, + "loss": 0.2032, + "step": 9175 + }, + { + "epoch": 0.7269558328381858, + "grad_norm": 1.5435944484658588, + "learning_rate": 3.661337487691985e-06, + "loss": 0.267, + "step": 9176 + }, + { + "epoch": 0.7270350564468212, + "grad_norm": 1.3897853506991977, + "learning_rate": 3.659353017942754e-06, + "loss": 0.3199, + "step": 9177 + }, + { + "epoch": 0.7271142800554565, + "grad_norm": 1.368592942867567, + "learning_rate": 3.6573689656946177e-06, + "loss": 0.2274, + "step": 9178 + }, + { + "epoch": 0.7271935036640919, + "grad_norm": 1.1122670078375376, + "learning_rate": 3.655385331078217e-06, + "loss": 0.1743, + "step": 9179 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 1.4071660036217781, + "learning_rate": 3.6534021142241595e-06, + "loss": 0.2613, + "step": 9180 + }, + { + "epoch": 0.7273519508813626, + "grad_norm": 1.8890939266097444, + "learning_rate": 3.6514193152630382e-06, + "loss": 0.272, + "step": 9181 + }, + { + "epoch": 0.7274311744899981, + "grad_norm": 1.3160664503159987, + "learning_rate": 3.649436934325409e-06, + "loss": 0.2217, + "step": 9182 + }, + { + "epoch": 0.7275103980986334, + "grad_norm": 1.0103078103482206, + "learning_rate": 3.647454971541796e-06, + "loss": 0.2157, + "step": 9183 + }, + { + "epoch": 0.7275896217072688, + "grad_norm": 0.8574222887958896, + "learning_rate": 3.6454734270427107e-06, + "loss": 0.1131, + "step": 9184 + }, + { + "epoch": 0.7276688453159041, + "grad_norm": 1.5113481448845036, + "learning_rate": 3.6434923009586244e-06, + "loss": 0.2745, + "step": 9185 + }, + { + "epoch": 0.7277480689245395, + "grad_norm": 1.2424786346479255, + "learning_rate": 3.6415115934199795e-06, + "loss": 0.2065, + "step": 9186 + }, + { + "epoch": 0.7278272925331749, + "grad_norm": 1.5154489509773454, + "learning_rate": 3.6395313045572055e-06, + "loss": 0.2691, + "step": 9187 + }, + { + "epoch": 0.7279065161418102, + "grad_norm": 1.2871245413142585, + "learning_rate": 3.6375514345006913e-06, + "loss": 0.2071, + "step": 9188 + }, + { + "epoch": 0.7279857397504457, + "grad_norm": 1.2526553455170215, + "learning_rate": 3.635571983380797e-06, + "loss": 0.1724, + "step": 9189 + }, + { + "epoch": 0.728064963359081, + "grad_norm": 1.5383174550681595, + "learning_rate": 3.6335929513278667e-06, + "loss": 0.2606, + "step": 9190 + }, + { + "epoch": 0.7281441869677164, + "grad_norm": 1.124724780323608, + "learning_rate": 3.631614338472208e-06, + "loss": 0.2089, + "step": 9191 + }, + { + "epoch": 0.7282234105763518, + "grad_norm": 1.1423749369986838, + "learning_rate": 3.6296361449440985e-06, + "loss": 0.2272, + "step": 9192 + }, + { + "epoch": 0.7283026341849871, + "grad_norm": 1.3092976939433734, + "learning_rate": 3.6276583708738013e-06, + "loss": 0.2267, + "step": 9193 + }, + { + "epoch": 0.7283818577936225, + "grad_norm": 1.2717162872533698, + "learning_rate": 3.6256810163915368e-06, + "loss": 0.2111, + "step": 9194 + }, + { + "epoch": 0.7284610814022578, + "grad_norm": 1.294649018524043, + "learning_rate": 3.623704081627507e-06, + "loss": 0.2673, + "step": 9195 + }, + { + "epoch": 0.7285403050108933, + "grad_norm": 1.1352388942796117, + "learning_rate": 3.62172756671188e-06, + "loss": 0.1596, + "step": 9196 + }, + { + "epoch": 0.7286195286195286, + "grad_norm": 1.3163293480876854, + "learning_rate": 3.619751471774805e-06, + "loss": 0.2469, + "step": 9197 + }, + { + "epoch": 0.728698752228164, + "grad_norm": 1.1337940027698996, + "learning_rate": 3.6177757969463956e-06, + "loss": 0.2014, + "step": 9198 + }, + { + "epoch": 0.7287779758367994, + "grad_norm": 1.317877051692378, + "learning_rate": 3.615800542356738e-06, + "loss": 0.1979, + "step": 9199 + }, + { + "epoch": 0.7288571994454347, + "grad_norm": 1.2159449622660556, + "learning_rate": 3.6138257081358985e-06, + "loss": 0.2114, + "step": 9200 + }, + { + "epoch": 0.7289364230540701, + "grad_norm": 1.298740027294028, + "learning_rate": 3.6118512944139084e-06, + "loss": 0.2574, + "step": 9201 + }, + { + "epoch": 0.7290156466627055, + "grad_norm": 1.4551696096768028, + "learning_rate": 3.609877301320769e-06, + "loss": 0.3134, + "step": 9202 + }, + { + "epoch": 0.7290948702713409, + "grad_norm": 1.5053194288121399, + "learning_rate": 3.607903728986465e-06, + "loss": 0.2715, + "step": 9203 + }, + { + "epoch": 0.7291740938799762, + "grad_norm": 1.5620221337434352, + "learning_rate": 3.6059305775409435e-06, + "loss": 0.3126, + "step": 9204 + }, + { + "epoch": 0.7292533174886116, + "grad_norm": 1.4039005881959574, + "learning_rate": 3.6039578471141244e-06, + "loss": 0.2692, + "step": 9205 + }, + { + "epoch": 0.729332541097247, + "grad_norm": 1.4417294694543692, + "learning_rate": 3.6019855378359092e-06, + "loss": 0.263, + "step": 9206 + }, + { + "epoch": 0.7294117647058823, + "grad_norm": 0.9734123703556665, + "learning_rate": 3.6000136498361605e-06, + "loss": 0.1242, + "step": 9207 + }, + { + "epoch": 0.7294909883145178, + "grad_norm": 0.7478774499811458, + "learning_rate": 3.5980421832447188e-06, + "loss": 0.1154, + "step": 9208 + }, + { + "epoch": 0.7295702119231531, + "grad_norm": 1.3087079723772048, + "learning_rate": 3.5960711381913904e-06, + "loss": 0.2268, + "step": 9209 + }, + { + "epoch": 0.7296494355317885, + "grad_norm": 1.3090031073400228, + "learning_rate": 3.5941005148059684e-06, + "loss": 0.2945, + "step": 9210 + }, + { + "epoch": 0.7297286591404238, + "grad_norm": 1.2752879997082822, + "learning_rate": 3.5921303132182038e-06, + "loss": 0.2515, + "step": 9211 + }, + { + "epoch": 0.7298078827490592, + "grad_norm": 1.2470509100316363, + "learning_rate": 3.5901605335578214e-06, + "loss": 0.2622, + "step": 9212 + }, + { + "epoch": 0.7298871063576946, + "grad_norm": 1.3744644439824854, + "learning_rate": 3.5881911759545296e-06, + "loss": 0.2886, + "step": 9213 + }, + { + "epoch": 0.7299663299663299, + "grad_norm": 0.9697025603182141, + "learning_rate": 3.5862222405379975e-06, + "loss": 0.1375, + "step": 9214 + }, + { + "epoch": 0.7300455535749654, + "grad_norm": 1.4726527285422166, + "learning_rate": 3.584253727437866e-06, + "loss": 0.2277, + "step": 9215 + }, + { + "epoch": 0.7301247771836007, + "grad_norm": 1.2100584204271536, + "learning_rate": 3.5822856367837587e-06, + "loss": 0.2463, + "step": 9216 + }, + { + "epoch": 0.7302040007922361, + "grad_norm": 1.280314552776881, + "learning_rate": 3.5803179687052636e-06, + "loss": 0.2011, + "step": 9217 + }, + { + "epoch": 0.7302832244008715, + "grad_norm": 1.2978310272709583, + "learning_rate": 3.578350723331937e-06, + "loss": 0.1776, + "step": 9218 + }, + { + "epoch": 0.7303624480095068, + "grad_norm": 1.481709808646171, + "learning_rate": 3.5763839007933186e-06, + "loss": 0.3289, + "step": 9219 + }, + { + "epoch": 0.7304416716181422, + "grad_norm": 1.193720926516714, + "learning_rate": 3.574417501218913e-06, + "loss": 0.2148, + "step": 9220 + }, + { + "epoch": 0.7305208952267775, + "grad_norm": 1.6336905063296179, + "learning_rate": 3.572451524738193e-06, + "loss": 0.3305, + "step": 9221 + }, + { + "epoch": 0.730600118835413, + "grad_norm": 1.1928713445998316, + "learning_rate": 3.5704859714806162e-06, + "loss": 0.2324, + "step": 9222 + }, + { + "epoch": 0.7306793424440483, + "grad_norm": 1.367042494144639, + "learning_rate": 3.568520841575601e-06, + "loss": 0.2348, + "step": 9223 + }, + { + "epoch": 0.7307585660526837, + "grad_norm": 1.2958420252155944, + "learning_rate": 3.5665561351525423e-06, + "loss": 0.2418, + "step": 9224 + }, + { + "epoch": 0.7308377896613191, + "grad_norm": 1.351444965577694, + "learning_rate": 3.564591852340803e-06, + "loss": 0.2164, + "step": 9225 + }, + { + "epoch": 0.7309170132699544, + "grad_norm": 1.3836486065638958, + "learning_rate": 3.562627993269728e-06, + "loss": 0.1892, + "step": 9226 + }, + { + "epoch": 0.7309962368785898, + "grad_norm": 1.3952793992642734, + "learning_rate": 3.5606645580686262e-06, + "loss": 0.2695, + "step": 9227 + }, + { + "epoch": 0.7310754604872252, + "grad_norm": 1.2718676658557244, + "learning_rate": 3.558701546866775e-06, + "loss": 0.2371, + "step": 9228 + }, + { + "epoch": 0.7311546840958606, + "grad_norm": 1.239544096955322, + "learning_rate": 3.5567389597934367e-06, + "loss": 0.2413, + "step": 9229 + }, + { + "epoch": 0.7312339077044959, + "grad_norm": 1.3831662880171955, + "learning_rate": 3.5547767969778355e-06, + "loss": 0.1706, + "step": 9230 + }, + { + "epoch": 0.7313131313131314, + "grad_norm": 1.497345554527076, + "learning_rate": 3.5528150585491695e-06, + "loss": 0.3261, + "step": 9231 + }, + { + "epoch": 0.7313923549217667, + "grad_norm": 1.5860677089133086, + "learning_rate": 3.5508537446366097e-06, + "loss": 0.2808, + "step": 9232 + }, + { + "epoch": 0.731471578530402, + "grad_norm": 1.395644138994821, + "learning_rate": 3.548892855369299e-06, + "loss": 0.2719, + "step": 9233 + }, + { + "epoch": 0.7315508021390374, + "grad_norm": 1.3163582283306305, + "learning_rate": 3.5469323908763507e-06, + "loss": 0.1699, + "step": 9234 + }, + { + "epoch": 0.7316300257476728, + "grad_norm": 1.3470163291986994, + "learning_rate": 3.544972351286857e-06, + "loss": 0.272, + "step": 9235 + }, + { + "epoch": 0.7317092493563082, + "grad_norm": 1.2846938658523657, + "learning_rate": 3.543012736729875e-06, + "loss": 0.2143, + "step": 9236 + }, + { + "epoch": 0.7317884729649435, + "grad_norm": 1.4181215349696352, + "learning_rate": 3.541053547334431e-06, + "loss": 0.1985, + "step": 9237 + }, + { + "epoch": 0.731867696573579, + "grad_norm": 1.2393105836174212, + "learning_rate": 3.5390947832295366e-06, + "loss": 0.2, + "step": 9238 + }, + { + "epoch": 0.7319469201822143, + "grad_norm": 1.4206063640871278, + "learning_rate": 3.5371364445441624e-06, + "loss": 0.2596, + "step": 9239 + }, + { + "epoch": 0.7320261437908496, + "grad_norm": 1.12714059429672, + "learning_rate": 3.535178531407253e-06, + "loss": 0.2472, + "step": 9240 + }, + { + "epoch": 0.7321053673994851, + "grad_norm": 1.1754069399453526, + "learning_rate": 3.5332210439477334e-06, + "loss": 0.2222, + "step": 9241 + }, + { + "epoch": 0.7321845910081204, + "grad_norm": 1.1840809153633325, + "learning_rate": 3.5312639822944917e-06, + "loss": 0.1683, + "step": 9242 + }, + { + "epoch": 0.7322638146167558, + "grad_norm": 1.2364644998397787, + "learning_rate": 3.529307346576388e-06, + "loss": 0.2998, + "step": 9243 + }, + { + "epoch": 0.7323430382253912, + "grad_norm": 1.4977267706621402, + "learning_rate": 3.527351136922265e-06, + "loss": 0.2293, + "step": 9244 + }, + { + "epoch": 0.7324222618340266, + "grad_norm": 1.598424269754826, + "learning_rate": 3.525395353460924e-06, + "loss": 0.2092, + "step": 9245 + }, + { + "epoch": 0.7325014854426619, + "grad_norm": 1.0762020920966138, + "learning_rate": 3.5234399963211418e-06, + "loss": 0.172, + "step": 9246 + }, + { + "epoch": 0.7325807090512972, + "grad_norm": 1.1283011982714446, + "learning_rate": 3.521485065631677e-06, + "loss": 0.1973, + "step": 9247 + }, + { + "epoch": 0.7326599326599327, + "grad_norm": 1.3472524578459015, + "learning_rate": 3.5195305615212473e-06, + "loss": 0.22, + "step": 9248 + }, + { + "epoch": 0.732739156268568, + "grad_norm": 1.2031077235456313, + "learning_rate": 3.517576484118549e-06, + "loss": 0.1709, + "step": 9249 + }, + { + "epoch": 0.7328183798772034, + "grad_norm": 1.3521594777320847, + "learning_rate": 3.5156228335522434e-06, + "loss": 0.2566, + "step": 9250 + }, + { + "epoch": 0.7328976034858388, + "grad_norm": 1.4394382310170906, + "learning_rate": 3.513669609950977e-06, + "loss": 0.2657, + "step": 9251 + }, + { + "epoch": 0.7329768270944742, + "grad_norm": 1.0113890499781821, + "learning_rate": 3.5117168134433566e-06, + "loss": 0.2191, + "step": 9252 + }, + { + "epoch": 0.7330560507031095, + "grad_norm": 1.7668251346988608, + "learning_rate": 3.5097644441579602e-06, + "loss": 0.3022, + "step": 9253 + }, + { + "epoch": 0.7331352743117449, + "grad_norm": 1.3264919356428448, + "learning_rate": 3.507812502223351e-06, + "loss": 0.2632, + "step": 9254 + }, + { + "epoch": 0.7332144979203803, + "grad_norm": 1.3202488044573206, + "learning_rate": 3.5058609877680495e-06, + "loss": 0.2346, + "step": 9255 + }, + { + "epoch": 0.7332937215290156, + "grad_norm": 1.249267761519028, + "learning_rate": 3.5039099009205503e-06, + "loss": 0.177, + "step": 9256 + }, + { + "epoch": 0.7333729451376511, + "grad_norm": 1.0385498589178, + "learning_rate": 3.5019592418093306e-06, + "loss": 0.1958, + "step": 9257 + }, + { + "epoch": 0.7334521687462864, + "grad_norm": 1.7322886298834497, + "learning_rate": 3.5000090105628282e-06, + "loss": 0.2548, + "step": 9258 + }, + { + "epoch": 0.7335313923549218, + "grad_norm": 1.6212657464488363, + "learning_rate": 3.4980592073094533e-06, + "loss": 0.3126, + "step": 9259 + }, + { + "epoch": 0.7336106159635571, + "grad_norm": 1.232431030794991, + "learning_rate": 3.4961098321775978e-06, + "loss": 0.2031, + "step": 9260 + }, + { + "epoch": 0.7336898395721925, + "grad_norm": 1.210809283044841, + "learning_rate": 3.4941608852956143e-06, + "loss": 0.2083, + "step": 9261 + }, + { + "epoch": 0.7337690631808279, + "grad_norm": 1.4815542113270626, + "learning_rate": 3.4922123667918305e-06, + "loss": 0.2493, + "step": 9262 + }, + { + "epoch": 0.7338482867894632, + "grad_norm": 1.370812047528853, + "learning_rate": 3.4902642767945506e-06, + "loss": 0.2685, + "step": 9263 + }, + { + "epoch": 0.7339275103980987, + "grad_norm": 1.307021871133628, + "learning_rate": 3.488316615432047e-06, + "loss": 0.2045, + "step": 9264 + }, + { + "epoch": 0.734006734006734, + "grad_norm": 1.3912112571895232, + "learning_rate": 3.486369382832561e-06, + "loss": 0.2461, + "step": 9265 + }, + { + "epoch": 0.7340859576153694, + "grad_norm": 1.6418173241747325, + "learning_rate": 3.484422579124306e-06, + "loss": 0.3147, + "step": 9266 + }, + { + "epoch": 0.7341651812240048, + "grad_norm": 1.3062255568355514, + "learning_rate": 3.4824762044354763e-06, + "loss": 0.2357, + "step": 9267 + }, + { + "epoch": 0.7342444048326401, + "grad_norm": 1.5047478729694654, + "learning_rate": 3.480530258894229e-06, + "loss": 0.3066, + "step": 9268 + }, + { + "epoch": 0.7343236284412755, + "grad_norm": 1.2897483515247474, + "learning_rate": 3.478584742628691e-06, + "loss": 0.2355, + "step": 9269 + }, + { + "epoch": 0.7344028520499108, + "grad_norm": 1.4894731624445268, + "learning_rate": 3.4766396557669712e-06, + "loss": 0.2566, + "step": 9270 + }, + { + "epoch": 0.7344820756585463, + "grad_norm": 1.5329417592975518, + "learning_rate": 3.4746949984371425e-06, + "loss": 0.2657, + "step": 9271 + }, + { + "epoch": 0.7345612992671816, + "grad_norm": 1.451594454701546, + "learning_rate": 3.472750770767247e-06, + "loss": 0.2656, + "step": 9272 + }, + { + "epoch": 0.734640522875817, + "grad_norm": 1.3640996989805043, + "learning_rate": 3.470806972885309e-06, + "loss": 0.2252, + "step": 9273 + }, + { + "epoch": 0.7347197464844524, + "grad_norm": 1.2277469136655474, + "learning_rate": 3.468863604919316e-06, + "loss": 0.17, + "step": 9274 + }, + { + "epoch": 0.7347989700930877, + "grad_norm": 1.4960432655706746, + "learning_rate": 3.4669206669972254e-06, + "loss": 0.2332, + "step": 9275 + }, + { + "epoch": 0.7348781937017231, + "grad_norm": 1.2216736409084543, + "learning_rate": 3.4649781592469765e-06, + "loss": 0.1903, + "step": 9276 + }, + { + "epoch": 0.7349574173103585, + "grad_norm": 1.44950255840134, + "learning_rate": 3.4630360817964715e-06, + "loss": 0.3536, + "step": 9277 + }, + { + "epoch": 0.7350366409189939, + "grad_norm": 1.3600526631535825, + "learning_rate": 3.4610944347735864e-06, + "loss": 0.2556, + "step": 9278 + }, + { + "epoch": 0.7351158645276292, + "grad_norm": 1.379654388160424, + "learning_rate": 3.459153218306167e-06, + "loss": 0.2525, + "step": 9279 + }, + { + "epoch": 0.7351950881362646, + "grad_norm": 1.3443176104254027, + "learning_rate": 3.457212432522038e-06, + "loss": 0.2215, + "step": 9280 + }, + { + "epoch": 0.7352743117449, + "grad_norm": 1.491289938673264, + "learning_rate": 3.455272077548989e-06, + "loss": 0.3319, + "step": 9281 + }, + { + "epoch": 0.7353535353535353, + "grad_norm": 1.3342119726559776, + "learning_rate": 3.453332153514779e-06, + "loss": 0.2904, + "step": 9282 + }, + { + "epoch": 0.7354327589621708, + "grad_norm": 1.4302412012794026, + "learning_rate": 3.4513926605471504e-06, + "loss": 0.2005, + "step": 9283 + }, + { + "epoch": 0.7355119825708061, + "grad_norm": 1.1844039876753532, + "learning_rate": 3.449453598773804e-06, + "loss": 0.1767, + "step": 9284 + }, + { + "epoch": 0.7355912061794415, + "grad_norm": 1.4912406308192818, + "learning_rate": 3.4475149683224164e-06, + "loss": 0.3269, + "step": 9285 + }, + { + "epoch": 0.7356704297880768, + "grad_norm": 1.177134188332753, + "learning_rate": 3.445576769320642e-06, + "loss": 0.2069, + "step": 9286 + }, + { + "epoch": 0.7357496533967122, + "grad_norm": 1.3052468325488955, + "learning_rate": 3.4436390018960997e-06, + "loss": 0.2515, + "step": 9287 + }, + { + "epoch": 0.7358288770053476, + "grad_norm": 1.276417938819899, + "learning_rate": 3.4417016661763793e-06, + "loss": 0.2371, + "step": 9288 + }, + { + "epoch": 0.7359081006139829, + "grad_norm": 1.1018211843230064, + "learning_rate": 3.439764762289051e-06, + "loss": 0.1982, + "step": 9289 + }, + { + "epoch": 0.7359873242226184, + "grad_norm": 1.3027848068597259, + "learning_rate": 3.4378282903616457e-06, + "loss": 0.2057, + "step": 9290 + }, + { + "epoch": 0.7360665478312537, + "grad_norm": 1.4328870946841892, + "learning_rate": 3.4358922505216707e-06, + "loss": 0.306, + "step": 9291 + }, + { + "epoch": 0.7361457714398891, + "grad_norm": 0.9964209043201193, + "learning_rate": 3.4339566428966086e-06, + "loss": 0.1596, + "step": 9292 + }, + { + "epoch": 0.7362249950485245, + "grad_norm": 1.2152756409676986, + "learning_rate": 3.4320214676139087e-06, + "loss": 0.2524, + "step": 9293 + }, + { + "epoch": 0.7363042186571598, + "grad_norm": 1.6232576532946363, + "learning_rate": 3.4300867248009917e-06, + "loss": 0.278, + "step": 9294 + }, + { + "epoch": 0.7363834422657952, + "grad_norm": 1.5540840955000574, + "learning_rate": 3.4281524145852485e-06, + "loss": 0.2511, + "step": 9295 + }, + { + "epoch": 0.7364626658744305, + "grad_norm": 1.3783879673009558, + "learning_rate": 3.4262185370940504e-06, + "loss": 0.2009, + "step": 9296 + }, + { + "epoch": 0.736541889483066, + "grad_norm": 1.0583357887529627, + "learning_rate": 3.4242850924547297e-06, + "loss": 0.149, + "step": 9297 + }, + { + "epoch": 0.7366211130917013, + "grad_norm": 1.4592309555612877, + "learning_rate": 3.422352080794593e-06, + "loss": 0.2032, + "step": 9298 + }, + { + "epoch": 0.7367003367003367, + "grad_norm": 1.5684020107815297, + "learning_rate": 3.4204195022409247e-06, + "loss": 0.2315, + "step": 9299 + }, + { + "epoch": 0.7367795603089721, + "grad_norm": 1.617297646162543, + "learning_rate": 3.418487356920974e-06, + "loss": 0.2939, + "step": 9300 + }, + { + "epoch": 0.7368587839176074, + "grad_norm": 1.183777979118823, + "learning_rate": 3.4165556449619584e-06, + "loss": 0.185, + "step": 9301 + }, + { + "epoch": 0.7369380075262428, + "grad_norm": 1.7859773945344035, + "learning_rate": 3.4146243664910804e-06, + "loss": 0.2041, + "step": 9302 + }, + { + "epoch": 0.7370172311348782, + "grad_norm": 1.005660326886881, + "learning_rate": 3.4126935216355005e-06, + "loss": 0.1418, + "step": 9303 + }, + { + "epoch": 0.7370964547435136, + "grad_norm": 1.3574530398897882, + "learning_rate": 3.4107631105223528e-06, + "loss": 0.2562, + "step": 9304 + }, + { + "epoch": 0.7371756783521489, + "grad_norm": 1.4927142668338989, + "learning_rate": 3.4088331332787527e-06, + "loss": 0.2526, + "step": 9305 + }, + { + "epoch": 0.7372549019607844, + "grad_norm": 1.2867817885613204, + "learning_rate": 3.406903590031776e-06, + "loss": 0.2534, + "step": 9306 + }, + { + "epoch": 0.7373341255694197, + "grad_norm": 1.176489627015894, + "learning_rate": 3.4049744809084697e-06, + "loss": 0.1957, + "step": 9307 + }, + { + "epoch": 0.737413349178055, + "grad_norm": 1.1926070072609174, + "learning_rate": 3.4030458060358682e-06, + "loss": 0.2039, + "step": 9308 + }, + { + "epoch": 0.7374925727866904, + "grad_norm": 1.3437740891606778, + "learning_rate": 3.4011175655409546e-06, + "loss": 0.2742, + "step": 9309 + }, + { + "epoch": 0.7375717963953258, + "grad_norm": 1.118499232921844, + "learning_rate": 3.399189759550694e-06, + "loss": 0.194, + "step": 9310 + }, + { + "epoch": 0.7376510200039612, + "grad_norm": 1.299508216593726, + "learning_rate": 3.3972623881920296e-06, + "loss": 0.2468, + "step": 9311 + }, + { + "epoch": 0.7377302436125965, + "grad_norm": 1.190980108520649, + "learning_rate": 3.3953354515918667e-06, + "loss": 0.2117, + "step": 9312 + }, + { + "epoch": 0.737809467221232, + "grad_norm": 1.1517214436632648, + "learning_rate": 3.3934089498770816e-06, + "loss": 0.2051, + "step": 9313 + }, + { + "epoch": 0.7378886908298673, + "grad_norm": 1.4131190686879942, + "learning_rate": 3.3914828831745306e-06, + "loss": 0.2081, + "step": 9314 + }, + { + "epoch": 0.7379679144385026, + "grad_norm": 1.305668698482543, + "learning_rate": 3.3895572516110353e-06, + "loss": 0.2618, + "step": 9315 + }, + { + "epoch": 0.7380471380471381, + "grad_norm": 1.4289927646275424, + "learning_rate": 3.3876320553133834e-06, + "loss": 0.2351, + "step": 9316 + }, + { + "epoch": 0.7381263616557734, + "grad_norm": 1.2561146489296369, + "learning_rate": 3.385707294408347e-06, + "loss": 0.2313, + "step": 9317 + }, + { + "epoch": 0.7382055852644088, + "grad_norm": 1.2874340522406893, + "learning_rate": 3.38378296902266e-06, + "loss": 0.2394, + "step": 9318 + }, + { + "epoch": 0.7382848088730442, + "grad_norm": 1.105598495541412, + "learning_rate": 3.3818590792830285e-06, + "loss": 0.1684, + "step": 9319 + }, + { + "epoch": 0.7383640324816796, + "grad_norm": 1.6062150899695402, + "learning_rate": 3.3799356253161288e-06, + "loss": 0.2915, + "step": 9320 + }, + { + "epoch": 0.7384432560903149, + "grad_norm": 1.3679716411851794, + "learning_rate": 3.3780126072486188e-06, + "loss": 0.22, + "step": 9321 + }, + { + "epoch": 0.7385224796989502, + "grad_norm": 1.5656838834204838, + "learning_rate": 3.376090025207115e-06, + "loss": 0.2978, + "step": 9322 + }, + { + "epoch": 0.7386017033075857, + "grad_norm": 1.358561615519632, + "learning_rate": 3.3741678793182077e-06, + "loss": 0.2135, + "step": 9323 + }, + { + "epoch": 0.738680926916221, + "grad_norm": 1.116543505132853, + "learning_rate": 3.372246169708466e-06, + "loss": 0.1902, + "step": 9324 + }, + { + "epoch": 0.7387601505248564, + "grad_norm": 1.19632013296997, + "learning_rate": 3.3703248965044253e-06, + "loss": 0.2075, + "step": 9325 + }, + { + "epoch": 0.7388393741334918, + "grad_norm": 1.675068129797508, + "learning_rate": 3.368404059832586e-06, + "loss": 0.3717, + "step": 9326 + }, + { + "epoch": 0.7389185977421272, + "grad_norm": 1.174493597031542, + "learning_rate": 3.366483659819434e-06, + "loss": 0.2151, + "step": 9327 + }, + { + "epoch": 0.7389978213507625, + "grad_norm": 1.3595132861478358, + "learning_rate": 3.364563696591414e-06, + "loss": 0.2324, + "step": 9328 + }, + { + "epoch": 0.7390770449593979, + "grad_norm": 1.0135582023985867, + "learning_rate": 3.3626441702749436e-06, + "loss": 0.1709, + "step": 9329 + }, + { + "epoch": 0.7391562685680333, + "grad_norm": 1.2881326987708404, + "learning_rate": 3.360725080996421e-06, + "loss": 0.1937, + "step": 9330 + }, + { + "epoch": 0.7392354921766686, + "grad_norm": 1.8125928481184876, + "learning_rate": 3.3588064288822055e-06, + "loss": 0.3408, + "step": 9331 + }, + { + "epoch": 0.739314715785304, + "grad_norm": 1.608115395275425, + "learning_rate": 3.356888214058629e-06, + "loss": 0.2227, + "step": 9332 + }, + { + "epoch": 0.7393939393939394, + "grad_norm": 1.7541517618321727, + "learning_rate": 3.354970436652001e-06, + "loss": 0.292, + "step": 9333 + }, + { + "epoch": 0.7394731630025748, + "grad_norm": 1.1438638487660662, + "learning_rate": 3.3530530967885964e-06, + "loss": 0.2032, + "step": 9334 + }, + { + "epoch": 0.7395523866112101, + "grad_norm": 1.0269616738925067, + "learning_rate": 3.351136194594662e-06, + "loss": 0.1253, + "step": 9335 + }, + { + "epoch": 0.7396316102198455, + "grad_norm": 1.3222051669189687, + "learning_rate": 3.3492197301964145e-06, + "loss": 0.1968, + "step": 9336 + }, + { + "epoch": 0.7397108338284809, + "grad_norm": 1.6246901847849982, + "learning_rate": 3.3473037037200484e-06, + "loss": 0.3324, + "step": 9337 + }, + { + "epoch": 0.7397900574371162, + "grad_norm": 1.4138539785306259, + "learning_rate": 3.345388115291723e-06, + "loss": 0.2502, + "step": 9338 + }, + { + "epoch": 0.7398692810457517, + "grad_norm": 1.3342604312615531, + "learning_rate": 3.3434729650375675e-06, + "loss": 0.2127, + "step": 9339 + }, + { + "epoch": 0.739948504654387, + "grad_norm": 1.3255346940347943, + "learning_rate": 3.341558253083692e-06, + "loss": 0.2182, + "step": 9340 + }, + { + "epoch": 0.7400277282630224, + "grad_norm": 1.5704784999120203, + "learning_rate": 3.3396439795561662e-06, + "loss": 0.3077, + "step": 9341 + }, + { + "epoch": 0.7401069518716578, + "grad_norm": 1.350155520054778, + "learning_rate": 3.3377301445810327e-06, + "loss": 0.1995, + "step": 9342 + }, + { + "epoch": 0.7401861754802931, + "grad_norm": 1.4020594977913219, + "learning_rate": 3.3358167482843173e-06, + "loss": 0.2596, + "step": 9343 + }, + { + "epoch": 0.7402653990889285, + "grad_norm": 1.2113602218994153, + "learning_rate": 3.3339037907920024e-06, + "loss": 0.1901, + "step": 9344 + }, + { + "epoch": 0.7403446226975638, + "grad_norm": 1.312382290869686, + "learning_rate": 3.331991272230044e-06, + "loss": 0.1848, + "step": 9345 + }, + { + "epoch": 0.7404238463061993, + "grad_norm": 1.4020087520398796, + "learning_rate": 3.330079192724379e-06, + "loss": 0.2452, + "step": 9346 + }, + { + "epoch": 0.7405030699148346, + "grad_norm": 1.503886557144299, + "learning_rate": 3.328167552400906e-06, + "loss": 0.3149, + "step": 9347 + }, + { + "epoch": 0.74058229352347, + "grad_norm": 1.390393847020418, + "learning_rate": 3.326256351385494e-06, + "loss": 0.1894, + "step": 9348 + }, + { + "epoch": 0.7406615171321054, + "grad_norm": 1.204952612826259, + "learning_rate": 3.324345589803991e-06, + "loss": 0.1679, + "step": 9349 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 1.102868956440528, + "learning_rate": 3.3224352677822115e-06, + "loss": 0.1637, + "step": 9350 + }, + { + "epoch": 0.7408199643493761, + "grad_norm": 1.3170442454502893, + "learning_rate": 3.3205253854459386e-06, + "loss": 0.1963, + "step": 9351 + }, + { + "epoch": 0.7408991879580115, + "grad_norm": 1.7118820201009344, + "learning_rate": 3.3186159429209263e-06, + "loss": 0.2487, + "step": 9352 + }, + { + "epoch": 0.7409784115666469, + "grad_norm": 1.3228384432786102, + "learning_rate": 3.316706940332908e-06, + "loss": 0.2282, + "step": 9353 + }, + { + "epoch": 0.7410576351752822, + "grad_norm": 1.2812824330023773, + "learning_rate": 3.314798377807581e-06, + "loss": 0.2276, + "step": 9354 + }, + { + "epoch": 0.7411368587839177, + "grad_norm": 1.4483446977021537, + "learning_rate": 3.312890255470609e-06, + "loss": 0.2815, + "step": 9355 + }, + { + "epoch": 0.741216082392553, + "grad_norm": 1.1402000416932498, + "learning_rate": 3.3109825734476407e-06, + "loss": 0.2162, + "step": 9356 + }, + { + "epoch": 0.7412953060011883, + "grad_norm": 1.1035150403036862, + "learning_rate": 3.3090753318642855e-06, + "loss": 0.1856, + "step": 9357 + }, + { + "epoch": 0.7413745296098238, + "grad_norm": 1.270229413518336, + "learning_rate": 3.307168530846121e-06, + "loss": 0.1862, + "step": 9358 + }, + { + "epoch": 0.7414537532184591, + "grad_norm": 1.3788715034941004, + "learning_rate": 3.3052621705187083e-06, + "loss": 0.2669, + "step": 9359 + }, + { + "epoch": 0.7415329768270945, + "grad_norm": 1.5985590459322399, + "learning_rate": 3.303356251007569e-06, + "loss": 0.2632, + "step": 9360 + }, + { + "epoch": 0.7416122004357298, + "grad_norm": 0.9359331089641134, + "learning_rate": 3.301450772438195e-06, + "loss": 0.183, + "step": 9361 + }, + { + "epoch": 0.7416914240443652, + "grad_norm": 1.1797604146938332, + "learning_rate": 3.2995457349360595e-06, + "loss": 0.167, + "step": 9362 + }, + { + "epoch": 0.7417706476530006, + "grad_norm": 1.441246918230819, + "learning_rate": 3.297641138626597e-06, + "loss": 0.3442, + "step": 9363 + }, + { + "epoch": 0.7418498712616359, + "grad_norm": 1.489922558472212, + "learning_rate": 3.295736983635215e-06, + "loss": 0.2688, + "step": 9364 + }, + { + "epoch": 0.7419290948702714, + "grad_norm": 1.2666159934738908, + "learning_rate": 3.293833270087291e-06, + "loss": 0.2283, + "step": 9365 + }, + { + "epoch": 0.7420083184789067, + "grad_norm": 1.073465256701786, + "learning_rate": 3.291929998108182e-06, + "loss": 0.1883, + "step": 9366 + }, + { + "epoch": 0.7420875420875421, + "grad_norm": 1.364829554548889, + "learning_rate": 3.2900271678232045e-06, + "loss": 0.2387, + "step": 9367 + }, + { + "epoch": 0.7421667656961775, + "grad_norm": 1.2525372690383816, + "learning_rate": 3.2881247793576488e-06, + "loss": 0.2123, + "step": 9368 + }, + { + "epoch": 0.7422459893048128, + "grad_norm": 1.42498933805347, + "learning_rate": 3.286222832836784e-06, + "loss": 0.2469, + "step": 9369 + }, + { + "epoch": 0.7423252129134482, + "grad_norm": 1.0395948803097224, + "learning_rate": 3.284321328385842e-06, + "loss": 0.1803, + "step": 9370 + }, + { + "epoch": 0.7424044365220835, + "grad_norm": 1.1343727906686847, + "learning_rate": 3.282420266130022e-06, + "loss": 0.156, + "step": 9371 + }, + { + "epoch": 0.742483660130719, + "grad_norm": 1.2896552906941683, + "learning_rate": 3.280519646194509e-06, + "loss": 0.2883, + "step": 9372 + }, + { + "epoch": 0.7425628837393543, + "grad_norm": 1.144277478317561, + "learning_rate": 3.278619468704445e-06, + "loss": 0.1901, + "step": 9373 + }, + { + "epoch": 0.7426421073479897, + "grad_norm": 1.306115371250631, + "learning_rate": 3.276719733784943e-06, + "loss": 0.2369, + "step": 9374 + }, + { + "epoch": 0.7427213309566251, + "grad_norm": 1.1351004925493535, + "learning_rate": 3.2748204415611016e-06, + "loss": 0.183, + "step": 9375 + }, + { + "epoch": 0.7428005545652604, + "grad_norm": 1.5978591651667373, + "learning_rate": 3.2729215921579738e-06, + "loss": 0.2698, + "step": 9376 + }, + { + "epoch": 0.7428797781738958, + "grad_norm": 1.2230814751155743, + "learning_rate": 3.271023185700587e-06, + "loss": 0.1909, + "step": 9377 + }, + { + "epoch": 0.7429590017825312, + "grad_norm": 1.5315091948619288, + "learning_rate": 3.269125222313949e-06, + "loss": 0.2832, + "step": 9378 + }, + { + "epoch": 0.7430382253911666, + "grad_norm": 1.0749651270596776, + "learning_rate": 3.2672277021230283e-06, + "loss": 0.1364, + "step": 9379 + }, + { + "epoch": 0.7431174489998019, + "grad_norm": 1.340169914459609, + "learning_rate": 3.2653306252527673e-06, + "loss": 0.2201, + "step": 9380 + }, + { + "epoch": 0.7431966726084374, + "grad_norm": 1.5406688127686632, + "learning_rate": 3.2634339918280765e-06, + "loss": 0.2878, + "step": 9381 + }, + { + "epoch": 0.7432758962170727, + "grad_norm": 1.145921811277957, + "learning_rate": 3.2615378019738455e-06, + "loss": 0.1986, + "step": 9382 + }, + { + "epoch": 0.743355119825708, + "grad_norm": 1.3886567263108123, + "learning_rate": 3.2596420558149277e-06, + "loss": 0.2859, + "step": 9383 + }, + { + "epoch": 0.7434343434343434, + "grad_norm": 1.288110451264625, + "learning_rate": 3.257746753476144e-06, + "loss": 0.2582, + "step": 9384 + }, + { + "epoch": 0.7435135670429788, + "grad_norm": 1.282485742503463, + "learning_rate": 3.255851895082299e-06, + "loss": 0.1978, + "step": 9385 + }, + { + "epoch": 0.7435927906516142, + "grad_norm": 1.2656152737878041, + "learning_rate": 3.2539574807581555e-06, + "loss": 0.2331, + "step": 9386 + }, + { + "epoch": 0.7436720142602495, + "grad_norm": 1.5588947267541353, + "learning_rate": 3.2520635106284516e-06, + "loss": 0.2379, + "step": 9387 + }, + { + "epoch": 0.743751237868885, + "grad_norm": 1.498764868012792, + "learning_rate": 3.250169984817897e-06, + "loss": 0.3503, + "step": 9388 + }, + { + "epoch": 0.7438304614775203, + "grad_norm": 1.6338774467513675, + "learning_rate": 3.248276903451171e-06, + "loss": 0.246, + "step": 9389 + }, + { + "epoch": 0.7439096850861556, + "grad_norm": 1.447355605954297, + "learning_rate": 3.24638426665292e-06, + "loss": 0.3435, + "step": 9390 + }, + { + "epoch": 0.7439889086947911, + "grad_norm": 1.4814802268143872, + "learning_rate": 3.2444920745477727e-06, + "loss": 0.2474, + "step": 9391 + }, + { + "epoch": 0.7440681323034264, + "grad_norm": 1.3975331282726227, + "learning_rate": 3.2426003272603158e-06, + "loss": 0.2944, + "step": 9392 + }, + { + "epoch": 0.7441473559120618, + "grad_norm": 1.2730895826492326, + "learning_rate": 3.2407090249151105e-06, + "loss": 0.2151, + "step": 9393 + }, + { + "epoch": 0.7442265795206972, + "grad_norm": 1.1512579437931159, + "learning_rate": 3.238818167636695e-06, + "loss": 0.1818, + "step": 9394 + }, + { + "epoch": 0.7443058031293326, + "grad_norm": 1.2852383478758886, + "learning_rate": 3.2369277555495705e-06, + "loss": 0.2205, + "step": 9395 + }, + { + "epoch": 0.7443850267379679, + "grad_norm": 1.4782483356658025, + "learning_rate": 3.235037788778208e-06, + "loss": 0.2166, + "step": 9396 + }, + { + "epoch": 0.7444642503466032, + "grad_norm": 1.3791635627572578, + "learning_rate": 3.2331482674470605e-06, + "loss": 0.3152, + "step": 9397 + }, + { + "epoch": 0.7445434739552387, + "grad_norm": 1.319750949406635, + "learning_rate": 3.2312591916805382e-06, + "loss": 0.2256, + "step": 9398 + }, + { + "epoch": 0.744622697563874, + "grad_norm": 1.0207439209579192, + "learning_rate": 3.2293705616030267e-06, + "loss": 0.1507, + "step": 9399 + }, + { + "epoch": 0.7447019211725094, + "grad_norm": 1.0894260346706084, + "learning_rate": 3.2274823773388885e-06, + "loss": 0.1402, + "step": 9400 + }, + { + "epoch": 0.7447811447811448, + "grad_norm": 1.2563275498512658, + "learning_rate": 3.2255946390124482e-06, + "loss": 0.1691, + "step": 9401 + }, + { + "epoch": 0.7448603683897802, + "grad_norm": 1.5389010121626974, + "learning_rate": 3.223707346748002e-06, + "loss": 0.196, + "step": 9402 + }, + { + "epoch": 0.7449395919984155, + "grad_norm": 1.1763971645552154, + "learning_rate": 3.221820500669823e-06, + "loss": 0.1593, + "step": 9403 + }, + { + "epoch": 0.7450188156070509, + "grad_norm": 1.3794139382948734, + "learning_rate": 3.2199341009021514e-06, + "loss": 0.2419, + "step": 9404 + }, + { + "epoch": 0.7450980392156863, + "grad_norm": 1.602692670876231, + "learning_rate": 3.218048147569195e-06, + "loss": 0.3202, + "step": 9405 + }, + { + "epoch": 0.7451772628243216, + "grad_norm": 2.1733670177972346, + "learning_rate": 3.216162640795133e-06, + "loss": 0.2975, + "step": 9406 + }, + { + "epoch": 0.745256486432957, + "grad_norm": 1.5008987525985351, + "learning_rate": 3.2142775807041214e-06, + "loss": 0.2438, + "step": 9407 + }, + { + "epoch": 0.7453357100415924, + "grad_norm": 1.401590559714894, + "learning_rate": 3.2123929674202816e-06, + "loss": 0.277, + "step": 9408 + }, + { + "epoch": 0.7454149336502278, + "grad_norm": 1.4131344961909658, + "learning_rate": 3.2105088010677e-06, + "loss": 0.2571, + "step": 9409 + }, + { + "epoch": 0.7454941572588631, + "grad_norm": 1.3357463628827901, + "learning_rate": 3.2086250817704488e-06, + "loss": 0.2026, + "step": 9410 + }, + { + "epoch": 0.7455733808674985, + "grad_norm": 1.3362518109459844, + "learning_rate": 3.2067418096525593e-06, + "loss": 0.2325, + "step": 9411 + }, + { + "epoch": 0.7456526044761339, + "grad_norm": 1.4926871147088352, + "learning_rate": 3.2048589848380297e-06, + "loss": 0.194, + "step": 9412 + }, + { + "epoch": 0.7457318280847692, + "grad_norm": 1.090169403182317, + "learning_rate": 3.202976607450844e-06, + "loss": 0.1293, + "step": 9413 + }, + { + "epoch": 0.7458110516934047, + "grad_norm": 1.390001396079619, + "learning_rate": 3.201094677614943e-06, + "loss": 0.2167, + "step": 9414 + }, + { + "epoch": 0.74589027530204, + "grad_norm": 1.5013227373329407, + "learning_rate": 3.1992131954542404e-06, + "loss": 0.2715, + "step": 9415 + }, + { + "epoch": 0.7459694989106754, + "grad_norm": 1.3790550209074108, + "learning_rate": 3.1973321610926277e-06, + "loss": 0.2343, + "step": 9416 + }, + { + "epoch": 0.7460487225193108, + "grad_norm": 1.3541490417400641, + "learning_rate": 3.1954515746539616e-06, + "loss": 0.263, + "step": 9417 + }, + { + "epoch": 0.7461279461279461, + "grad_norm": 1.2788272538823096, + "learning_rate": 3.193571436262064e-06, + "loss": 0.2382, + "step": 9418 + }, + { + "epoch": 0.7462071697365815, + "grad_norm": 1.138278157720186, + "learning_rate": 3.191691746040739e-06, + "loss": 0.1841, + "step": 9419 + }, + { + "epoch": 0.7462863933452168, + "grad_norm": 1.1083300899303032, + "learning_rate": 3.189812504113754e-06, + "loss": 0.1574, + "step": 9420 + }, + { + "epoch": 0.7463656169538523, + "grad_norm": 1.4827937501877615, + "learning_rate": 3.187933710604847e-06, + "loss": 0.2568, + "step": 9421 + }, + { + "epoch": 0.7464448405624876, + "grad_norm": 1.2168448782762713, + "learning_rate": 3.186055365637725e-06, + "loss": 0.198, + "step": 9422 + }, + { + "epoch": 0.746524064171123, + "grad_norm": 1.8300571771067111, + "learning_rate": 3.184177469336073e-06, + "loss": 0.3388, + "step": 9423 + }, + { + "epoch": 0.7466032877797584, + "grad_norm": 1.3376084543935285, + "learning_rate": 3.1823000218235388e-06, + "loss": 0.2035, + "step": 9424 + }, + { + "epoch": 0.7466825113883937, + "grad_norm": 1.3971110358923733, + "learning_rate": 3.180423023223741e-06, + "loss": 0.2405, + "step": 9425 + }, + { + "epoch": 0.7467617349970291, + "grad_norm": 2.097276449333718, + "learning_rate": 3.1785464736602754e-06, + "loss": 0.3482, + "step": 9426 + }, + { + "epoch": 0.7468409586056645, + "grad_norm": 1.294310265785697, + "learning_rate": 3.1766703732567027e-06, + "loss": 0.2069, + "step": 9427 + }, + { + "epoch": 0.7469201822142999, + "grad_norm": 1.5895978455063275, + "learning_rate": 3.1747947221365517e-06, + "loss": 0.2403, + "step": 9428 + }, + { + "epoch": 0.7469994058229352, + "grad_norm": 1.1535859219206004, + "learning_rate": 3.17291952042333e-06, + "loss": 0.1927, + "step": 9429 + }, + { + "epoch": 0.7470786294315707, + "grad_norm": 1.5031014669288114, + "learning_rate": 3.171044768240508e-06, + "loss": 0.2773, + "step": 9430 + }, + { + "epoch": 0.747157853040206, + "grad_norm": 1.4734598814275917, + "learning_rate": 3.169170465711525e-06, + "loss": 0.2835, + "step": 9431 + }, + { + "epoch": 0.7472370766488413, + "grad_norm": 1.2373302455978916, + "learning_rate": 3.167296612959803e-06, + "loss": 0.1604, + "step": 9432 + }, + { + "epoch": 0.7473163002574768, + "grad_norm": 1.6178309639539645, + "learning_rate": 3.1654232101087225e-06, + "loss": 0.3613, + "step": 9433 + }, + { + "epoch": 0.7473955238661121, + "grad_norm": 1.262968555120363, + "learning_rate": 3.1635502572816333e-06, + "loss": 0.203, + "step": 9434 + }, + { + "epoch": 0.7474747474747475, + "grad_norm": 1.163022347376603, + "learning_rate": 3.1616777546018696e-06, + "loss": 0.2306, + "step": 9435 + }, + { + "epoch": 0.7475539710833828, + "grad_norm": 1.6540756082833261, + "learning_rate": 3.1598057021927207e-06, + "loss": 0.2017, + "step": 9436 + }, + { + "epoch": 0.7476331946920183, + "grad_norm": 1.2626823362028161, + "learning_rate": 3.1579341001774546e-06, + "loss": 0.2295, + "step": 9437 + }, + { + "epoch": 0.7477124183006536, + "grad_norm": 1.0592146146929295, + "learning_rate": 3.1560629486793014e-06, + "loss": 0.1668, + "step": 9438 + }, + { + "epoch": 0.7477916419092889, + "grad_norm": 1.3127302032741552, + "learning_rate": 3.154192247821476e-06, + "loss": 0.2523, + "step": 9439 + }, + { + "epoch": 0.7478708655179244, + "grad_norm": 1.1647802678212953, + "learning_rate": 3.1523219977271515e-06, + "loss": 0.1996, + "step": 9440 + }, + { + "epoch": 0.7479500891265597, + "grad_norm": 1.5647371157342005, + "learning_rate": 3.1504521985194715e-06, + "loss": 0.2719, + "step": 9441 + }, + { + "epoch": 0.7480293127351951, + "grad_norm": 1.5630162121234015, + "learning_rate": 3.1485828503215588e-06, + "loss": 0.2543, + "step": 9442 + }, + { + "epoch": 0.7481085363438305, + "grad_norm": 1.2100950588025379, + "learning_rate": 3.1467139532564985e-06, + "loss": 0.162, + "step": 9443 + }, + { + "epoch": 0.7481877599524658, + "grad_norm": 1.2043503266422373, + "learning_rate": 3.144845507447345e-06, + "loss": 0.1887, + "step": 9444 + }, + { + "epoch": 0.7482669835611012, + "grad_norm": 1.2921367172680136, + "learning_rate": 3.1429775130171337e-06, + "loss": 0.2382, + "step": 9445 + }, + { + "epoch": 0.7483462071697365, + "grad_norm": 1.2495429039947195, + "learning_rate": 3.141109970088859e-06, + "loss": 0.2479, + "step": 9446 + }, + { + "epoch": 0.748425430778372, + "grad_norm": 1.1069075926656768, + "learning_rate": 3.1392428787854865e-06, + "loss": 0.1447, + "step": 9447 + }, + { + "epoch": 0.7485046543870073, + "grad_norm": 1.4397127315683689, + "learning_rate": 3.1373762392299632e-06, + "loss": 0.2683, + "step": 9448 + }, + { + "epoch": 0.7485838779956427, + "grad_norm": 1.2169207604471608, + "learning_rate": 3.135510051545192e-06, + "loss": 0.2281, + "step": 9449 + }, + { + "epoch": 0.7486631016042781, + "grad_norm": 1.0603902525045215, + "learning_rate": 3.133644315854055e-06, + "loss": 0.2329, + "step": 9450 + }, + { + "epoch": 0.7487423252129134, + "grad_norm": 1.1278550844491935, + "learning_rate": 3.131779032279397e-06, + "loss": 0.2111, + "step": 9451 + }, + { + "epoch": 0.7488215488215488, + "grad_norm": 1.2612176210777584, + "learning_rate": 3.1299142009440463e-06, + "loss": 0.2302, + "step": 9452 + }, + { + "epoch": 0.7489007724301842, + "grad_norm": 1.3104144120060726, + "learning_rate": 3.1280498219707876e-06, + "loss": 0.1999, + "step": 9453 + }, + { + "epoch": 0.7489799960388196, + "grad_norm": 1.593919469996017, + "learning_rate": 3.1261858954823798e-06, + "loss": 0.2601, + "step": 9454 + }, + { + "epoch": 0.7490592196474549, + "grad_norm": 1.4810734301495718, + "learning_rate": 3.12432242160156e-06, + "loss": 0.265, + "step": 9455 + }, + { + "epoch": 0.7491384432560904, + "grad_norm": 1.232098949405676, + "learning_rate": 3.1224594004510246e-06, + "loss": 0.2258, + "step": 9456 + }, + { + "epoch": 0.7492176668647257, + "grad_norm": 1.3008262681884801, + "learning_rate": 3.1205968321534406e-06, + "loss": 0.2256, + "step": 9457 + }, + { + "epoch": 0.749296890473361, + "grad_norm": 1.3652165436394026, + "learning_rate": 3.1187347168314586e-06, + "loss": 0.3011, + "step": 9458 + }, + { + "epoch": 0.7493761140819964, + "grad_norm": 1.2019777872383308, + "learning_rate": 3.1168730546076844e-06, + "loss": 0.1736, + "step": 9459 + }, + { + "epoch": 0.7494553376906318, + "grad_norm": 1.4344420786214576, + "learning_rate": 3.1150118456046963e-06, + "loss": 0.2771, + "step": 9460 + }, + { + "epoch": 0.7495345612992672, + "grad_norm": 1.1784019137698762, + "learning_rate": 3.1131510899450533e-06, + "loss": 0.218, + "step": 9461 + }, + { + "epoch": 0.7496137849079025, + "grad_norm": 1.4764273319575023, + "learning_rate": 3.1112907877512732e-06, + "loss": 0.2423, + "step": 9462 + }, + { + "epoch": 0.749693008516538, + "grad_norm": 1.744742682660706, + "learning_rate": 3.1094309391458455e-06, + "loss": 0.3954, + "step": 9463 + }, + { + "epoch": 0.7497722321251733, + "grad_norm": 1.7072083694017732, + "learning_rate": 3.107571544251241e-06, + "loss": 0.2741, + "step": 9464 + }, + { + "epoch": 0.7498514557338086, + "grad_norm": 1.3629758318455325, + "learning_rate": 3.1057126031898843e-06, + "loss": 0.2522, + "step": 9465 + }, + { + "epoch": 0.7499306793424441, + "grad_norm": 1.093599098222902, + "learning_rate": 3.1038541160841752e-06, + "loss": 0.1698, + "step": 9466 + }, + { + "epoch": 0.7500099029510794, + "grad_norm": 1.351543909752522, + "learning_rate": 3.1019960830564945e-06, + "loss": 0.2193, + "step": 9467 + }, + { + "epoch": 0.7500891265597148, + "grad_norm": 2.0966597716556294, + "learning_rate": 3.1001385042291797e-06, + "loss": 0.3362, + "step": 9468 + }, + { + "epoch": 0.7501683501683502, + "grad_norm": 1.5211743504519524, + "learning_rate": 3.0982813797245413e-06, + "loss": 0.3306, + "step": 9469 + }, + { + "epoch": 0.7502475737769856, + "grad_norm": 1.2358795506796807, + "learning_rate": 3.096424709664868e-06, + "loss": 0.2208, + "step": 9470 + }, + { + "epoch": 0.7503267973856209, + "grad_norm": 1.1294991235195104, + "learning_rate": 3.094568494172411e-06, + "loss": 0.1982, + "step": 9471 + }, + { + "epoch": 0.7504060209942562, + "grad_norm": 1.313782117735638, + "learning_rate": 3.0927127333693872e-06, + "loss": 0.211, + "step": 9472 + }, + { + "epoch": 0.7504852446028917, + "grad_norm": 1.5260213198657948, + "learning_rate": 3.090857427377998e-06, + "loss": 0.2891, + "step": 9473 + }, + { + "epoch": 0.750564468211527, + "grad_norm": 1.727509647485842, + "learning_rate": 3.0890025763204025e-06, + "loss": 0.2981, + "step": 9474 + }, + { + "epoch": 0.7506436918201624, + "grad_norm": 1.1328880165124846, + "learning_rate": 3.087148180318734e-06, + "loss": 0.1931, + "step": 9475 + }, + { + "epoch": 0.7507229154287978, + "grad_norm": 1.4866288309273816, + "learning_rate": 3.0852942394950915e-06, + "loss": 0.2789, + "step": 9476 + }, + { + "epoch": 0.7508021390374332, + "grad_norm": 1.3697140300964896, + "learning_rate": 3.083440753971556e-06, + "loss": 0.2254, + "step": 9477 + }, + { + "epoch": 0.7508813626460685, + "grad_norm": 1.1896947340579718, + "learning_rate": 3.0815877238701653e-06, + "loss": 0.1875, + "step": 9478 + }, + { + "epoch": 0.7509605862547039, + "grad_norm": 1.5220249568042512, + "learning_rate": 3.079735149312931e-06, + "loss": 0.3272, + "step": 9479 + }, + { + "epoch": 0.7510398098633393, + "grad_norm": 1.4274812682359288, + "learning_rate": 3.077883030421843e-06, + "loss": 0.1905, + "step": 9480 + }, + { + "epoch": 0.7511190334719746, + "grad_norm": 1.148347303567613, + "learning_rate": 3.0760313673188493e-06, + "loss": 0.1689, + "step": 9481 + }, + { + "epoch": 0.75119825708061, + "grad_norm": 1.3045861258190539, + "learning_rate": 3.0741801601258714e-06, + "loss": 0.2333, + "step": 9482 + }, + { + "epoch": 0.7512774806892454, + "grad_norm": 1.541061156672222, + "learning_rate": 3.072329408964808e-06, + "loss": 0.2338, + "step": 9483 + }, + { + "epoch": 0.7513567042978808, + "grad_norm": 1.0135960605517473, + "learning_rate": 3.0704791139575195e-06, + "loss": 0.1608, + "step": 9484 + }, + { + "epoch": 0.7514359279065161, + "grad_norm": 1.5085062572010506, + "learning_rate": 3.0686292752258352e-06, + "loss": 0.2657, + "step": 9485 + }, + { + "epoch": 0.7515151515151515, + "grad_norm": 1.2458356638595636, + "learning_rate": 3.066779892891564e-06, + "loss": 0.207, + "step": 9486 + }, + { + "epoch": 0.7515943751237869, + "grad_norm": 1.2736112655279421, + "learning_rate": 3.064930967076477e-06, + "loss": 0.2092, + "step": 9487 + }, + { + "epoch": 0.7516735987324222, + "grad_norm": 1.4314328367833653, + "learning_rate": 3.063082497902313e-06, + "loss": 0.2227, + "step": 9488 + }, + { + "epoch": 0.7517528223410577, + "grad_norm": 1.394462967768676, + "learning_rate": 3.0612344854907917e-06, + "loss": 0.2461, + "step": 9489 + }, + { + "epoch": 0.751832045949693, + "grad_norm": 1.1942375880349436, + "learning_rate": 3.0593869299635925e-06, + "loss": 0.1888, + "step": 9490 + }, + { + "epoch": 0.7519112695583284, + "grad_norm": 1.368240226170967, + "learning_rate": 3.0575398314423677e-06, + "loss": 0.2445, + "step": 9491 + }, + { + "epoch": 0.7519904931669638, + "grad_norm": 0.9101826210930218, + "learning_rate": 3.0556931900487365e-06, + "loss": 0.1348, + "step": 9492 + }, + { + "epoch": 0.7520697167755991, + "grad_norm": 1.290594676041806, + "learning_rate": 3.053847005904298e-06, + "loss": 0.2429, + "step": 9493 + }, + { + "epoch": 0.7521489403842345, + "grad_norm": 1.5415974361876146, + "learning_rate": 3.052001279130612e-06, + "loss": 0.238, + "step": 9494 + }, + { + "epoch": 0.7522281639928698, + "grad_norm": 1.5041012964745173, + "learning_rate": 3.0501560098492056e-06, + "loss": 0.2446, + "step": 9495 + }, + { + "epoch": 0.7523073876015053, + "grad_norm": 1.6898963618734115, + "learning_rate": 3.0483111981815906e-06, + "loss": 0.2575, + "step": 9496 + }, + { + "epoch": 0.7523866112101406, + "grad_norm": 1.1767442627187736, + "learning_rate": 3.046466844249232e-06, + "loss": 0.2172, + "step": 9497 + }, + { + "epoch": 0.752465834818776, + "grad_norm": 1.4569689780534123, + "learning_rate": 3.0446229481735713e-06, + "loss": 0.2465, + "step": 9498 + }, + { + "epoch": 0.7525450584274114, + "grad_norm": 1.468710675687599, + "learning_rate": 3.042779510076025e-06, + "loss": 0.2394, + "step": 9499 + }, + { + "epoch": 0.7526242820360467, + "grad_norm": 1.2814993580961627, + "learning_rate": 3.0409365300779725e-06, + "loss": 0.2472, + "step": 9500 + }, + { + "epoch": 0.7527035056446821, + "grad_norm": 1.3358978428510522, + "learning_rate": 3.039094008300761e-06, + "loss": 0.1886, + "step": 9501 + }, + { + "epoch": 0.7527827292533175, + "grad_norm": 1.5352179574927975, + "learning_rate": 3.0372519448657188e-06, + "loss": 0.2542, + "step": 9502 + }, + { + "epoch": 0.7528619528619529, + "grad_norm": 1.1781326724959185, + "learning_rate": 3.0354103398941327e-06, + "loss": 0.1727, + "step": 9503 + }, + { + "epoch": 0.7529411764705882, + "grad_norm": 1.5323387135404547, + "learning_rate": 3.0335691935072618e-06, + "loss": 0.2667, + "step": 9504 + }, + { + "epoch": 0.7530204000792237, + "grad_norm": 1.42261100595425, + "learning_rate": 3.0317285058263426e-06, + "loss": 0.2447, + "step": 9505 + }, + { + "epoch": 0.753099623687859, + "grad_norm": 1.415446079918829, + "learning_rate": 3.029888276972571e-06, + "loss": 0.2289, + "step": 9506 + }, + { + "epoch": 0.7531788472964943, + "grad_norm": 1.4968788512640403, + "learning_rate": 3.0280485070671197e-06, + "loss": 0.3105, + "step": 9507 + }, + { + "epoch": 0.7532580709051298, + "grad_norm": 1.0715515709813266, + "learning_rate": 3.0262091962311234e-06, + "loss": 0.1649, + "step": 9508 + }, + { + "epoch": 0.7533372945137651, + "grad_norm": 1.3075741828816314, + "learning_rate": 3.0243703445856985e-06, + "loss": 0.2199, + "step": 9509 + }, + { + "epoch": 0.7534165181224005, + "grad_norm": 0.9690785440856734, + "learning_rate": 3.0225319522519226e-06, + "loss": 0.1289, + "step": 9510 + }, + { + "epoch": 0.7534957417310358, + "grad_norm": 1.8770743582388911, + "learning_rate": 3.0206940193508404e-06, + "loss": 0.2559, + "step": 9511 + }, + { + "epoch": 0.7535749653396713, + "grad_norm": 1.2434742312321545, + "learning_rate": 3.018856546003479e-06, + "loss": 0.208, + "step": 9512 + }, + { + "epoch": 0.7536541889483066, + "grad_norm": 1.262959253719853, + "learning_rate": 3.0170195323308216e-06, + "loss": 0.2348, + "step": 9513 + }, + { + "epoch": 0.7537334125569419, + "grad_norm": 1.3354250949069246, + "learning_rate": 3.0151829784538257e-06, + "loss": 0.2857, + "step": 9514 + }, + { + "epoch": 0.7538126361655774, + "grad_norm": 1.3675365592528514, + "learning_rate": 3.0133468844934245e-06, + "loss": 0.2698, + "step": 9515 + }, + { + "epoch": 0.7538918597742127, + "grad_norm": 1.4365720898127374, + "learning_rate": 3.0115112505705134e-06, + "loss": 0.2075, + "step": 9516 + }, + { + "epoch": 0.7539710833828481, + "grad_norm": 1.1988970087854138, + "learning_rate": 3.0096760768059576e-06, + "loss": 0.1956, + "step": 9517 + }, + { + "epoch": 0.7540503069914835, + "grad_norm": 1.1522947267556833, + "learning_rate": 3.0078413633205995e-06, + "loss": 0.1861, + "step": 9518 + }, + { + "epoch": 0.7541295306001188, + "grad_norm": 1.2209150591935631, + "learning_rate": 3.0060071102352438e-06, + "loss": 0.2628, + "step": 9519 + }, + { + "epoch": 0.7542087542087542, + "grad_norm": 1.4103845726564295, + "learning_rate": 3.0041733176706668e-06, + "loss": 0.2706, + "step": 9520 + }, + { + "epoch": 0.7542879778173895, + "grad_norm": 1.2875364793294792, + "learning_rate": 3.002339985747611e-06, + "loss": 0.2163, + "step": 9521 + }, + { + "epoch": 0.754367201426025, + "grad_norm": 1.3317610253988597, + "learning_rate": 3.0005071145868004e-06, + "loss": 0.2273, + "step": 9522 + }, + { + "epoch": 0.7544464250346603, + "grad_norm": 1.1607337882390345, + "learning_rate": 2.998674704308917e-06, + "loss": 0.2016, + "step": 9523 + }, + { + "epoch": 0.7545256486432957, + "grad_norm": 1.1084687185978421, + "learning_rate": 2.9968427550346136e-06, + "loss": 0.1855, + "step": 9524 + }, + { + "epoch": 0.7546048722519311, + "grad_norm": 1.2487578857852424, + "learning_rate": 2.9950112668845198e-06, + "loss": 0.1952, + "step": 9525 + }, + { + "epoch": 0.7546840958605664, + "grad_norm": 1.38847565888578, + "learning_rate": 2.9931802399792285e-06, + "loss": 0.2416, + "step": 9526 + }, + { + "epoch": 0.7547633194692018, + "grad_norm": 1.2632913089183573, + "learning_rate": 2.9913496744393e-06, + "loss": 0.2295, + "step": 9527 + }, + { + "epoch": 0.7548425430778372, + "grad_norm": 1.5195212094501305, + "learning_rate": 2.9895195703852763e-06, + "loss": 0.2458, + "step": 9528 + }, + { + "epoch": 0.7549217666864726, + "grad_norm": 1.9625672878995102, + "learning_rate": 2.987689927937656e-06, + "loss": 0.2459, + "step": 9529 + }, + { + "epoch": 0.7550009902951079, + "grad_norm": 1.2635610926153735, + "learning_rate": 2.98586074721691e-06, + "loss": 0.2229, + "step": 9530 + }, + { + "epoch": 0.7550802139037434, + "grad_norm": 1.4812100305828517, + "learning_rate": 2.9840320283434865e-06, + "loss": 0.28, + "step": 9531 + }, + { + "epoch": 0.7551594375123787, + "grad_norm": 1.4769865636154187, + "learning_rate": 2.982203771437796e-06, + "loss": 0.3241, + "step": 9532 + }, + { + "epoch": 0.755238661121014, + "grad_norm": 1.3150086989581382, + "learning_rate": 2.9803759766202157e-06, + "loss": 0.2836, + "step": 9533 + }, + { + "epoch": 0.7553178847296494, + "grad_norm": 1.3087754527590916, + "learning_rate": 2.9785486440111044e-06, + "loss": 0.236, + "step": 9534 + }, + { + "epoch": 0.7553971083382848, + "grad_norm": 1.0365413676042066, + "learning_rate": 2.9767217737307805e-06, + "loss": 0.1812, + "step": 9535 + }, + { + "epoch": 0.7554763319469202, + "grad_norm": 1.3340487885286738, + "learning_rate": 2.974895365899534e-06, + "loss": 0.2268, + "step": 9536 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 1.1422599602684496, + "learning_rate": 2.973069420637621e-06, + "loss": 0.2095, + "step": 9537 + }, + { + "epoch": 0.755634779164191, + "grad_norm": 1.3553187726065785, + "learning_rate": 2.971243938065279e-06, + "loss": 0.1969, + "step": 9538 + }, + { + "epoch": 0.7557140027728263, + "grad_norm": 1.1594535084901914, + "learning_rate": 2.9694189183027034e-06, + "loss": 0.1943, + "step": 9539 + }, + { + "epoch": 0.7557932263814616, + "grad_norm": 1.3311149925183907, + "learning_rate": 2.9675943614700588e-06, + "loss": 0.2043, + "step": 9540 + }, + { + "epoch": 0.7558724499900971, + "grad_norm": 1.3556045657692193, + "learning_rate": 2.965770267687492e-06, + "loss": 0.2091, + "step": 9541 + }, + { + "epoch": 0.7559516735987324, + "grad_norm": 1.7533325389689827, + "learning_rate": 2.963946637075107e-06, + "loss": 0.2925, + "step": 9542 + }, + { + "epoch": 0.7560308972073678, + "grad_norm": 1.3636354750017925, + "learning_rate": 2.9621234697529787e-06, + "loss": 0.246, + "step": 9543 + }, + { + "epoch": 0.7561101208160032, + "grad_norm": 1.2756807963810644, + "learning_rate": 2.9603007658411575e-06, + "loss": 0.2206, + "step": 9544 + }, + { + "epoch": 0.7561893444246386, + "grad_norm": 1.479260983103371, + "learning_rate": 2.958478525459657e-06, + "loss": 0.242, + "step": 9545 + }, + { + "epoch": 0.7562685680332739, + "grad_norm": 1.399242252878075, + "learning_rate": 2.9566567487284613e-06, + "loss": 0.3363, + "step": 9546 + }, + { + "epoch": 0.7563477916419092, + "grad_norm": 1.1762853626261915, + "learning_rate": 2.9548354357675325e-06, + "loss": 0.1669, + "step": 9547 + }, + { + "epoch": 0.7564270152505447, + "grad_norm": 1.2031901359787553, + "learning_rate": 2.9530145866967897e-06, + "loss": 0.1391, + "step": 9548 + }, + { + "epoch": 0.75650623885918, + "grad_norm": 1.1256424405975474, + "learning_rate": 2.951194201636125e-06, + "loss": 0.168, + "step": 9549 + }, + { + "epoch": 0.7565854624678154, + "grad_norm": 1.2346522278018233, + "learning_rate": 2.9493742807054094e-06, + "loss": 0.1862, + "step": 9550 + }, + { + "epoch": 0.7566646860764508, + "grad_norm": 1.9002467841148203, + "learning_rate": 2.947554824024472e-06, + "loss": 0.2879, + "step": 9551 + }, + { + "epoch": 0.7567439096850862, + "grad_norm": 1.3541772414966324, + "learning_rate": 2.9457358317131125e-06, + "loss": 0.2286, + "step": 9552 + }, + { + "epoch": 0.7568231332937215, + "grad_norm": 1.2795563295436516, + "learning_rate": 2.943917303891107e-06, + "loss": 0.2211, + "step": 9553 + }, + { + "epoch": 0.7569023569023569, + "grad_norm": 1.2959381356085935, + "learning_rate": 2.942099240678197e-06, + "loss": 0.265, + "step": 9554 + }, + { + "epoch": 0.7569815805109923, + "grad_norm": 1.1469560239842571, + "learning_rate": 2.940281642194087e-06, + "loss": 0.1718, + "step": 9555 + }, + { + "epoch": 0.7570608041196276, + "grad_norm": 1.1959361190498585, + "learning_rate": 2.938464508558466e-06, + "loss": 0.193, + "step": 9556 + }, + { + "epoch": 0.757140027728263, + "grad_norm": 1.4007615458854932, + "learning_rate": 2.936647839890979e-06, + "loss": 0.2508, + "step": 9557 + }, + { + "epoch": 0.7572192513368984, + "grad_norm": 1.2136373586815747, + "learning_rate": 2.9348316363112417e-06, + "loss": 0.181, + "step": 9558 + }, + { + "epoch": 0.7572984749455338, + "grad_norm": 1.1975436375730801, + "learning_rate": 2.933015897938849e-06, + "loss": 0.2536, + "step": 9559 + }, + { + "epoch": 0.7573776985541691, + "grad_norm": 1.5071986401978135, + "learning_rate": 2.9312006248933543e-06, + "loss": 0.2582, + "step": 9560 + }, + { + "epoch": 0.7574569221628045, + "grad_norm": 1.0592827270598004, + "learning_rate": 2.9293858172942867e-06, + "loss": 0.1377, + "step": 9561 + }, + { + "epoch": 0.7575361457714399, + "grad_norm": 1.462186093008237, + "learning_rate": 2.9275714752611383e-06, + "loss": 0.2574, + "step": 9562 + }, + { + "epoch": 0.7576153693800752, + "grad_norm": 1.4344983083853058, + "learning_rate": 2.9257575989133803e-06, + "loss": 0.2613, + "step": 9563 + }, + { + "epoch": 0.7576945929887107, + "grad_norm": 1.4442717033631893, + "learning_rate": 2.9239441883704455e-06, + "loss": 0.2487, + "step": 9564 + }, + { + "epoch": 0.757773816597346, + "grad_norm": 1.901278749560395, + "learning_rate": 2.9221312437517357e-06, + "loss": 0.3364, + "step": 9565 + }, + { + "epoch": 0.7578530402059814, + "grad_norm": 1.7299008919290797, + "learning_rate": 2.9203187651766297e-06, + "loss": 0.2696, + "step": 9566 + }, + { + "epoch": 0.7579322638146168, + "grad_norm": 1.2760114705482966, + "learning_rate": 2.918506752764467e-06, + "loss": 0.1949, + "step": 9567 + }, + { + "epoch": 0.7580114874232521, + "grad_norm": 1.289478336015059, + "learning_rate": 2.916695206634558e-06, + "loss": 0.2143, + "step": 9568 + }, + { + "epoch": 0.7580907110318875, + "grad_norm": 1.3210774509193999, + "learning_rate": 2.91488412690619e-06, + "loss": 0.198, + "step": 9569 + }, + { + "epoch": 0.7581699346405228, + "grad_norm": 1.9759176749004026, + "learning_rate": 2.913073513698611e-06, + "loss": 0.259, + "step": 9570 + }, + { + "epoch": 0.7582491582491583, + "grad_norm": 1.48195582188223, + "learning_rate": 2.9112633671310387e-06, + "loss": 0.1944, + "step": 9571 + }, + { + "epoch": 0.7583283818577936, + "grad_norm": 1.3166515744083336, + "learning_rate": 2.9094536873226663e-06, + "loss": 0.1775, + "step": 9572 + }, + { + "epoch": 0.758407605466429, + "grad_norm": 1.4684754035297503, + "learning_rate": 2.9076444743926524e-06, + "loss": 0.293, + "step": 9573 + }, + { + "epoch": 0.7584868290750644, + "grad_norm": 1.414567671978473, + "learning_rate": 2.9058357284601204e-06, + "loss": 0.18, + "step": 9574 + }, + { + "epoch": 0.7585660526836997, + "grad_norm": 1.3132058618934817, + "learning_rate": 2.9040274496441732e-06, + "loss": 0.2823, + "step": 9575 + }, + { + "epoch": 0.7586452762923351, + "grad_norm": 1.6600077484501494, + "learning_rate": 2.902219638063876e-06, + "loss": 0.3342, + "step": 9576 + }, + { + "epoch": 0.7587244999009705, + "grad_norm": 1.2748401202404402, + "learning_rate": 2.9004122938382617e-06, + "loss": 0.2177, + "step": 9577 + }, + { + "epoch": 0.7588037235096059, + "grad_norm": 1.39759407545113, + "learning_rate": 2.8986054170863344e-06, + "loss": 0.2277, + "step": 9578 + }, + { + "epoch": 0.7588829471182412, + "grad_norm": 1.4573152447670206, + "learning_rate": 2.8967990079270736e-06, + "loss": 0.2476, + "step": 9579 + }, + { + "epoch": 0.7589621707268767, + "grad_norm": 1.9204576128964057, + "learning_rate": 2.89499306647942e-06, + "loss": 0.2756, + "step": 9580 + }, + { + "epoch": 0.759041394335512, + "grad_norm": 1.5141146172467528, + "learning_rate": 2.8931875928622833e-06, + "loss": 0.2031, + "step": 9581 + }, + { + "epoch": 0.7591206179441473, + "grad_norm": 1.3757728820671016, + "learning_rate": 2.89138258719455e-06, + "loss": 0.2606, + "step": 9582 + }, + { + "epoch": 0.7591998415527828, + "grad_norm": 1.6196468068252399, + "learning_rate": 2.8895780495950687e-06, + "loss": 0.2549, + "step": 9583 + }, + { + "epoch": 0.7592790651614181, + "grad_norm": 1.2864436229302496, + "learning_rate": 2.8877739801826577e-06, + "loss": 0.2147, + "step": 9584 + }, + { + "epoch": 0.7593582887700535, + "grad_norm": 1.162692149326294, + "learning_rate": 2.8859703790761095e-06, + "loss": 0.2213, + "step": 9585 + }, + { + "epoch": 0.7594375123786888, + "grad_norm": 1.6123628220486017, + "learning_rate": 2.8841672463941827e-06, + "loss": 0.2494, + "step": 9586 + }, + { + "epoch": 0.7595167359873243, + "grad_norm": 1.4773579274025002, + "learning_rate": 2.8823645822556e-06, + "loss": 0.2303, + "step": 9587 + }, + { + "epoch": 0.7595959595959596, + "grad_norm": 1.419175965475744, + "learning_rate": 2.8805623867790655e-06, + "loss": 0.2726, + "step": 9588 + }, + { + "epoch": 0.7596751832045949, + "grad_norm": 1.5534825544499657, + "learning_rate": 2.8787606600832408e-06, + "loss": 0.2893, + "step": 9589 + }, + { + "epoch": 0.7597544068132304, + "grad_norm": 1.1943753060486557, + "learning_rate": 2.876959402286759e-06, + "loss": 0.1931, + "step": 9590 + }, + { + "epoch": 0.7598336304218657, + "grad_norm": 1.6457476959368162, + "learning_rate": 2.8751586135082275e-06, + "loss": 0.3365, + "step": 9591 + }, + { + "epoch": 0.7599128540305011, + "grad_norm": 1.2569935626941384, + "learning_rate": 2.873358293866221e-06, + "loss": 0.193, + "step": 9592 + }, + { + "epoch": 0.7599920776391365, + "grad_norm": 1.4402726699614747, + "learning_rate": 2.8715584434792786e-06, + "loss": 0.2193, + "step": 9593 + }, + { + "epoch": 0.7600713012477719, + "grad_norm": 1.3640078068656039, + "learning_rate": 2.86975906246591e-06, + "loss": 0.2182, + "step": 9594 + }, + { + "epoch": 0.7601505248564072, + "grad_norm": 1.2762447527374055, + "learning_rate": 2.867960150944602e-06, + "loss": 0.2142, + "step": 9595 + }, + { + "epoch": 0.7602297484650425, + "grad_norm": 1.2333784276620576, + "learning_rate": 2.8661617090338e-06, + "loss": 0.151, + "step": 9596 + }, + { + "epoch": 0.760308972073678, + "grad_norm": 1.546120027415051, + "learning_rate": 2.864363736851922e-06, + "loss": 0.3125, + "step": 9597 + }, + { + "epoch": 0.7603881956823133, + "grad_norm": 1.2463588421189138, + "learning_rate": 2.86256623451736e-06, + "loss": 0.196, + "step": 9598 + }, + { + "epoch": 0.7604674192909487, + "grad_norm": 1.3399879817442026, + "learning_rate": 2.860769202148468e-06, + "loss": 0.1937, + "step": 9599 + }, + { + "epoch": 0.7605466428995841, + "grad_norm": 1.5030957843147574, + "learning_rate": 2.8589726398635688e-06, + "loss": 0.204, + "step": 9600 + }, + { + "epoch": 0.7606258665082194, + "grad_norm": 1.2832427503815433, + "learning_rate": 2.8571765477809645e-06, + "loss": 0.2276, + "step": 9601 + }, + { + "epoch": 0.7607050901168548, + "grad_norm": 1.3912188717597578, + "learning_rate": 2.8553809260189145e-06, + "loss": 0.1929, + "step": 9602 + }, + { + "epoch": 0.7607843137254902, + "grad_norm": 1.0342192893577118, + "learning_rate": 2.8535857746956507e-06, + "loss": 0.1493, + "step": 9603 + }, + { + "epoch": 0.7608635373341256, + "grad_norm": 1.1498466708481796, + "learning_rate": 2.8517910939293804e-06, + "loss": 0.1857, + "step": 9604 + }, + { + "epoch": 0.7609427609427609, + "grad_norm": 1.1688361274280603, + "learning_rate": 2.849996883838271e-06, + "loss": 0.1651, + "step": 9605 + }, + { + "epoch": 0.7610219845513964, + "grad_norm": 1.4190484967403216, + "learning_rate": 2.8482031445404634e-06, + "loss": 0.2331, + "step": 9606 + }, + { + "epoch": 0.7611012081600317, + "grad_norm": 1.2670280482520957, + "learning_rate": 2.8464098761540637e-06, + "loss": 0.185, + "step": 9607 + }, + { + "epoch": 0.761180431768667, + "grad_norm": 1.6250569952397633, + "learning_rate": 2.844617078797155e-06, + "loss": 0.2446, + "step": 9608 + }, + { + "epoch": 0.7612596553773024, + "grad_norm": 1.1229052076592008, + "learning_rate": 2.842824752587783e-06, + "loss": 0.1705, + "step": 9609 + }, + { + "epoch": 0.7613388789859378, + "grad_norm": 1.3501124681231995, + "learning_rate": 2.8410328976439595e-06, + "loss": 0.2274, + "step": 9610 + }, + { + "epoch": 0.7614181025945732, + "grad_norm": 1.8396143205707307, + "learning_rate": 2.839241514083676e-06, + "loss": 0.3653, + "step": 9611 + }, + { + "epoch": 0.7614973262032085, + "grad_norm": 1.3383341171895384, + "learning_rate": 2.837450602024884e-06, + "loss": 0.2463, + "step": 9612 + }, + { + "epoch": 0.761576549811844, + "grad_norm": 1.1014026103065113, + "learning_rate": 2.8356601615855027e-06, + "loss": 0.1782, + "step": 9613 + }, + { + "epoch": 0.7616557734204793, + "grad_norm": 1.064498943487028, + "learning_rate": 2.83387019288343e-06, + "loss": 0.1894, + "step": 9614 + }, + { + "epoch": 0.7617349970291146, + "grad_norm": 1.437961500822072, + "learning_rate": 2.8320806960365234e-06, + "loss": 0.2239, + "step": 9615 + }, + { + "epoch": 0.7618142206377501, + "grad_norm": 1.494959339111669, + "learning_rate": 2.8302916711626106e-06, + "loss": 0.1239, + "step": 9616 + }, + { + "epoch": 0.7618934442463854, + "grad_norm": 1.4718711394640587, + "learning_rate": 2.8285031183794955e-06, + "loss": 0.3247, + "step": 9617 + }, + { + "epoch": 0.7619726678550208, + "grad_norm": 1.682513820831713, + "learning_rate": 2.8267150378049437e-06, + "loss": 0.3004, + "step": 9618 + }, + { + "epoch": 0.7620518914636562, + "grad_norm": 0.9797877302659065, + "learning_rate": 2.8249274295566863e-06, + "loss": 0.1714, + "step": 9619 + }, + { + "epoch": 0.7621311150722916, + "grad_norm": 1.5454328721732453, + "learning_rate": 2.823140293752441e-06, + "loss": 0.2438, + "step": 9620 + }, + { + "epoch": 0.7622103386809269, + "grad_norm": 1.3219426016898606, + "learning_rate": 2.821353630509871e-06, + "loss": 0.2401, + "step": 9621 + }, + { + "epoch": 0.7622895622895622, + "grad_norm": 1.1017171040008353, + "learning_rate": 2.819567439946621e-06, + "loss": 0.1546, + "step": 9622 + }, + { + "epoch": 0.7623687858981977, + "grad_norm": 1.1866900616599196, + "learning_rate": 2.8177817221803074e-06, + "loss": 0.1575, + "step": 9623 + }, + { + "epoch": 0.762448009506833, + "grad_norm": 1.2252375571065437, + "learning_rate": 2.8159964773285074e-06, + "loss": 0.1654, + "step": 9624 + }, + { + "epoch": 0.7625272331154684, + "grad_norm": 1.3566432891303415, + "learning_rate": 2.8142117055087704e-06, + "loss": 0.1498, + "step": 9625 + }, + { + "epoch": 0.7626064567241038, + "grad_norm": 1.8138063197517948, + "learning_rate": 2.8124274068386203e-06, + "loss": 0.3144, + "step": 9626 + }, + { + "epoch": 0.7626856803327392, + "grad_norm": 1.388547678117533, + "learning_rate": 2.8106435814355404e-06, + "loss": 0.2591, + "step": 9627 + }, + { + "epoch": 0.7627649039413745, + "grad_norm": 1.4476412809386547, + "learning_rate": 2.808860229416984e-06, + "loss": 0.2584, + "step": 9628 + }, + { + "epoch": 0.7628441275500099, + "grad_norm": 1.63881227206624, + "learning_rate": 2.8070773509003846e-06, + "loss": 0.2471, + "step": 9629 + }, + { + "epoch": 0.7629233511586453, + "grad_norm": 1.43305716308842, + "learning_rate": 2.80529494600313e-06, + "loss": 0.2709, + "step": 9630 + }, + { + "epoch": 0.7630025747672806, + "grad_norm": 1.338375552829051, + "learning_rate": 2.8035130148425847e-06, + "loss": 0.191, + "step": 9631 + }, + { + "epoch": 0.763081798375916, + "grad_norm": 1.221563060580026, + "learning_rate": 2.801731557536078e-06, + "loss": 0.1933, + "step": 9632 + }, + { + "epoch": 0.7631610219845514, + "grad_norm": 1.2712555631294444, + "learning_rate": 2.799950574200915e-06, + "loss": 0.2368, + "step": 9633 + }, + { + "epoch": 0.7632402455931868, + "grad_norm": 1.1162015888261139, + "learning_rate": 2.7981700649543618e-06, + "loss": 0.187, + "step": 9634 + }, + { + "epoch": 0.7633194692018221, + "grad_norm": 1.4390583013939366, + "learning_rate": 2.796390029913655e-06, + "loss": 0.2369, + "step": 9635 + }, + { + "epoch": 0.7633986928104575, + "grad_norm": 1.4801049402661324, + "learning_rate": 2.794610469196004e-06, + "loss": 0.2393, + "step": 9636 + }, + { + "epoch": 0.7634779164190929, + "grad_norm": 1.3794599199353628, + "learning_rate": 2.792831382918585e-06, + "loss": 0.3018, + "step": 9637 + }, + { + "epoch": 0.7635571400277282, + "grad_norm": 1.2072235882098208, + "learning_rate": 2.791052771198538e-06, + "loss": 0.1639, + "step": 9638 + }, + { + "epoch": 0.7636363636363637, + "grad_norm": 1.3002816627252245, + "learning_rate": 2.7892746341529807e-06, + "loss": 0.2505, + "step": 9639 + }, + { + "epoch": 0.763715587244999, + "grad_norm": 1.2160307854137866, + "learning_rate": 2.7874969718989943e-06, + "loss": 0.1985, + "step": 9640 + }, + { + "epoch": 0.7637948108536344, + "grad_norm": 1.2765914341471314, + "learning_rate": 2.785719784553624e-06, + "loss": 0.2456, + "step": 9641 + }, + { + "epoch": 0.7638740344622698, + "grad_norm": 1.246250225584539, + "learning_rate": 2.7839430722338956e-06, + "loss": 0.2048, + "step": 9642 + }, + { + "epoch": 0.7639532580709051, + "grad_norm": 1.1130326574190037, + "learning_rate": 2.7821668350567956e-06, + "loss": 0.2293, + "step": 9643 + }, + { + "epoch": 0.7640324816795405, + "grad_norm": 1.6670628317880867, + "learning_rate": 2.7803910731392757e-06, + "loss": 0.2843, + "step": 9644 + }, + { + "epoch": 0.7641117052881758, + "grad_norm": 1.1663749664511274, + "learning_rate": 2.778615786598269e-06, + "loss": 0.2257, + "step": 9645 + }, + { + "epoch": 0.7641909288968113, + "grad_norm": 1.1326643697143965, + "learning_rate": 2.776840975550664e-06, + "loss": 0.203, + "step": 9646 + }, + { + "epoch": 0.7642701525054466, + "grad_norm": 1.275479010381982, + "learning_rate": 2.7750666401133263e-06, + "loss": 0.1942, + "step": 9647 + }, + { + "epoch": 0.764349376114082, + "grad_norm": 0.9954876350818346, + "learning_rate": 2.773292780403083e-06, + "loss": 0.1533, + "step": 9648 + }, + { + "epoch": 0.7644285997227174, + "grad_norm": 1.2266258157105536, + "learning_rate": 2.7715193965367403e-06, + "loss": 0.188, + "step": 9649 + }, + { + "epoch": 0.7645078233313527, + "grad_norm": 1.3944449727972121, + "learning_rate": 2.769746488631064e-06, + "loss": 0.2403, + "step": 9650 + }, + { + "epoch": 0.7645870469399881, + "grad_norm": 1.2968171985292305, + "learning_rate": 2.767974056802789e-06, + "loss": 0.1945, + "step": 9651 + }, + { + "epoch": 0.7646662705486235, + "grad_norm": 1.3527802382386294, + "learning_rate": 2.766202101168628e-06, + "loss": 0.2197, + "step": 9652 + }, + { + "epoch": 0.7647454941572589, + "grad_norm": 1.1243499762329816, + "learning_rate": 2.76443062184525e-06, + "loss": 0.207, + "step": 9653 + }, + { + "epoch": 0.7648247177658942, + "grad_norm": 1.5515031320433774, + "learning_rate": 2.7626596189492983e-06, + "loss": 0.2665, + "step": 9654 + }, + { + "epoch": 0.7649039413745297, + "grad_norm": 1.5172629951981524, + "learning_rate": 2.76088909259739e-06, + "loss": 0.1985, + "step": 9655 + }, + { + "epoch": 0.764983164983165, + "grad_norm": 1.3690252606106479, + "learning_rate": 2.7591190429061023e-06, + "loss": 0.1933, + "step": 9656 + }, + { + "epoch": 0.7650623885918003, + "grad_norm": 1.4631754202698102, + "learning_rate": 2.757349469991981e-06, + "loss": 0.2499, + "step": 9657 + }, + { + "epoch": 0.7651416122004358, + "grad_norm": 0.9908502330179804, + "learning_rate": 2.7555803739715512e-06, + "loss": 0.1773, + "step": 9658 + }, + { + "epoch": 0.7652208358090711, + "grad_norm": 1.2440165795221518, + "learning_rate": 2.7538117549612963e-06, + "loss": 0.1835, + "step": 9659 + }, + { + "epoch": 0.7653000594177065, + "grad_norm": 1.1858777893085968, + "learning_rate": 2.752043613077667e-06, + "loss": 0.1656, + "step": 9660 + }, + { + "epoch": 0.7653792830263418, + "grad_norm": 1.0996440543094386, + "learning_rate": 2.7502759484370946e-06, + "loss": 0.1784, + "step": 9661 + }, + { + "epoch": 0.7654585066349773, + "grad_norm": 1.4794582881552152, + "learning_rate": 2.748508761155967e-06, + "loss": 0.2633, + "step": 9662 + }, + { + "epoch": 0.7655377302436126, + "grad_norm": 1.5349235958232526, + "learning_rate": 2.746742051350646e-06, + "loss": 0.2146, + "step": 9663 + }, + { + "epoch": 0.7656169538522479, + "grad_norm": 1.6214920852118246, + "learning_rate": 2.7449758191374574e-06, + "loss": 0.3423, + "step": 9664 + }, + { + "epoch": 0.7656961774608834, + "grad_norm": 1.3985187169370845, + "learning_rate": 2.7432100646327043e-06, + "loss": 0.2276, + "step": 9665 + }, + { + "epoch": 0.7657754010695187, + "grad_norm": 1.2965400438387649, + "learning_rate": 2.7414447879526517e-06, + "loss": 0.2271, + "step": 9666 + }, + { + "epoch": 0.7658546246781541, + "grad_norm": 1.5466914478942493, + "learning_rate": 2.739679989213532e-06, + "loss": 0.2806, + "step": 9667 + }, + { + "epoch": 0.7659338482867895, + "grad_norm": 1.2912448927774847, + "learning_rate": 2.7379156685315523e-06, + "loss": 0.1687, + "step": 9668 + }, + { + "epoch": 0.7660130718954249, + "grad_norm": 1.2906373929689472, + "learning_rate": 2.7361518260228827e-06, + "loss": 0.1924, + "step": 9669 + }, + { + "epoch": 0.7660922955040602, + "grad_norm": 1.1449090500809695, + "learning_rate": 2.734388461803661e-06, + "loss": 0.1995, + "step": 9670 + }, + { + "epoch": 0.7661715191126955, + "grad_norm": 1.0505989608056299, + "learning_rate": 2.7326255759900024e-06, + "loss": 0.1781, + "step": 9671 + }, + { + "epoch": 0.766250742721331, + "grad_norm": 1.309540519979868, + "learning_rate": 2.7308631686979816e-06, + "loss": 0.1884, + "step": 9672 + }, + { + "epoch": 0.7663299663299663, + "grad_norm": 1.2054370913353984, + "learning_rate": 2.7291012400436414e-06, + "loss": 0.1789, + "step": 9673 + }, + { + "epoch": 0.7664091899386017, + "grad_norm": 1.412604659709111, + "learning_rate": 2.7273397901430023e-06, + "loss": 0.1587, + "step": 9674 + }, + { + "epoch": 0.7664884135472371, + "grad_norm": 1.2535616167202037, + "learning_rate": 2.7255788191120435e-06, + "loss": 0.2327, + "step": 9675 + }, + { + "epoch": 0.7665676371558724, + "grad_norm": 1.2929861235799582, + "learning_rate": 2.723818327066717e-06, + "loss": 0.203, + "step": 9676 + }, + { + "epoch": 0.7666468607645078, + "grad_norm": 1.5603009488862183, + "learning_rate": 2.722058314122941e-06, + "loss": 0.2062, + "step": 9677 + }, + { + "epoch": 0.7667260843731432, + "grad_norm": 1.2764935684103669, + "learning_rate": 2.7202987803966073e-06, + "loss": 0.1792, + "step": 9678 + }, + { + "epoch": 0.7668053079817786, + "grad_norm": 1.2313598379160944, + "learning_rate": 2.718539726003573e-06, + "loss": 0.2443, + "step": 9679 + }, + { + "epoch": 0.7668845315904139, + "grad_norm": 1.3190603265335255, + "learning_rate": 2.7167811510596577e-06, + "loss": 0.1981, + "step": 9680 + }, + { + "epoch": 0.7669637551990494, + "grad_norm": 1.5167742906135497, + "learning_rate": 2.715023055680661e-06, + "loss": 0.2341, + "step": 9681 + }, + { + "epoch": 0.7670429788076847, + "grad_norm": 1.5095545221269455, + "learning_rate": 2.7132654399823444e-06, + "loss": 0.2363, + "step": 9682 + }, + { + "epoch": 0.76712220241632, + "grad_norm": 1.4820826102641982, + "learning_rate": 2.7115083040804337e-06, + "loss": 0.2477, + "step": 9683 + }, + { + "epoch": 0.7672014260249554, + "grad_norm": 1.4788755937620777, + "learning_rate": 2.709751648090634e-06, + "loss": 0.21, + "step": 9684 + }, + { + "epoch": 0.7672806496335908, + "grad_norm": 1.2081503108109057, + "learning_rate": 2.7079954721286108e-06, + "loss": 0.2003, + "step": 9685 + }, + { + "epoch": 0.7673598732422262, + "grad_norm": 1.248696332684323, + "learning_rate": 2.7062397763099945e-06, + "loss": 0.2363, + "step": 9686 + }, + { + "epoch": 0.7674390968508615, + "grad_norm": 1.273745915104082, + "learning_rate": 2.7044845607503967e-06, + "loss": 0.2007, + "step": 9687 + }, + { + "epoch": 0.767518320459497, + "grad_norm": 1.489220722122109, + "learning_rate": 2.7027298255653878e-06, + "loss": 0.289, + "step": 9688 + }, + { + "epoch": 0.7675975440681323, + "grad_norm": 1.4521833134623467, + "learning_rate": 2.700975570870503e-06, + "loss": 0.2193, + "step": 9689 + }, + { + "epoch": 0.7676767676767676, + "grad_norm": 1.389571089131228, + "learning_rate": 2.6992217967812606e-06, + "loss": 0.2432, + "step": 9690 + }, + { + "epoch": 0.7677559912854031, + "grad_norm": 1.6863830982484698, + "learning_rate": 2.697468503413134e-06, + "loss": 0.2734, + "step": 9691 + }, + { + "epoch": 0.7678352148940384, + "grad_norm": 1.6764967277894864, + "learning_rate": 2.6957156908815684e-06, + "loss": 0.2464, + "step": 9692 + }, + { + "epoch": 0.7679144385026738, + "grad_norm": 1.393252266810544, + "learning_rate": 2.6939633593019754e-06, + "loss": 0.2243, + "step": 9693 + }, + { + "epoch": 0.7679936621113091, + "grad_norm": 1.3671780697355251, + "learning_rate": 2.692211508789744e-06, + "loss": 0.2277, + "step": 9694 + }, + { + "epoch": 0.7680728857199446, + "grad_norm": 1.6234745240168307, + "learning_rate": 2.6904601394602216e-06, + "loss": 0.279, + "step": 9695 + }, + { + "epoch": 0.7681521093285799, + "grad_norm": 1.709171466067922, + "learning_rate": 2.688709251428725e-06, + "loss": 0.2656, + "step": 9696 + }, + { + "epoch": 0.7682313329372152, + "grad_norm": 1.3736529211297932, + "learning_rate": 2.6869588448105475e-06, + "loss": 0.2279, + "step": 9697 + }, + { + "epoch": 0.7683105565458507, + "grad_norm": 1.1789271419360619, + "learning_rate": 2.685208919720942e-06, + "loss": 0.2164, + "step": 9698 + }, + { + "epoch": 0.768389780154486, + "grad_norm": 1.2415615531908208, + "learning_rate": 2.683459476275133e-06, + "loss": 0.2604, + "step": 9699 + }, + { + "epoch": 0.7684690037631214, + "grad_norm": 1.3779018612909175, + "learning_rate": 2.6817105145883117e-06, + "loss": 0.2754, + "step": 9700 + }, + { + "epoch": 0.7685482273717568, + "grad_norm": 1.457510329751007, + "learning_rate": 2.6799620347756407e-06, + "loss": 0.2095, + "step": 9701 + }, + { + "epoch": 0.7686274509803922, + "grad_norm": 1.605748800608434, + "learning_rate": 2.6782140369522435e-06, + "loss": 0.3182, + "step": 9702 + }, + { + "epoch": 0.7687066745890275, + "grad_norm": 1.3277532140691806, + "learning_rate": 2.676466521233225e-06, + "loss": 0.2313, + "step": 9703 + }, + { + "epoch": 0.7687858981976629, + "grad_norm": 1.4428146105231685, + "learning_rate": 2.674719487733649e-06, + "loss": 0.2122, + "step": 9704 + }, + { + "epoch": 0.7688651218062983, + "grad_norm": 1.1147503023422805, + "learning_rate": 2.672972936568543e-06, + "loss": 0.2337, + "step": 9705 + }, + { + "epoch": 0.7689443454149336, + "grad_norm": 1.5862144631022426, + "learning_rate": 2.6712268678529187e-06, + "loss": 0.2747, + "step": 9706 + }, + { + "epoch": 0.769023569023569, + "grad_norm": 1.4749711758766628, + "learning_rate": 2.669481281701739e-06, + "loss": 0.2023, + "step": 9707 + }, + { + "epoch": 0.7691027926322044, + "grad_norm": 1.0974533667605268, + "learning_rate": 2.6677361782299437e-06, + "loss": 0.1809, + "step": 9708 + }, + { + "epoch": 0.7691820162408398, + "grad_norm": 1.4069468903267026, + "learning_rate": 2.665991557552442e-06, + "loss": 0.2398, + "step": 9709 + }, + { + "epoch": 0.7692612398494751, + "grad_norm": 1.470254654563354, + "learning_rate": 2.6642474197841086e-06, + "loss": 0.2879, + "step": 9710 + }, + { + "epoch": 0.7693404634581105, + "grad_norm": 1.4812002953277617, + "learning_rate": 2.6625037650397812e-06, + "loss": 0.1972, + "step": 9711 + }, + { + "epoch": 0.7694196870667459, + "grad_norm": 1.4990453061249278, + "learning_rate": 2.6607605934342785e-06, + "loss": 0.2276, + "step": 9712 + }, + { + "epoch": 0.7694989106753812, + "grad_norm": 1.6736194542702678, + "learning_rate": 2.659017905082376e-06, + "loss": 0.2862, + "step": 9713 + }, + { + "epoch": 0.7695781342840167, + "grad_norm": 1.1684544272392003, + "learning_rate": 2.657275700098819e-06, + "loss": 0.2286, + "step": 9714 + }, + { + "epoch": 0.769657357892652, + "grad_norm": 1.2044203004374867, + "learning_rate": 2.65553397859833e-06, + "loss": 0.2267, + "step": 9715 + }, + { + "epoch": 0.7697365815012874, + "grad_norm": 1.5780793831662523, + "learning_rate": 2.6537927406955888e-06, + "loss": 0.1856, + "step": 9716 + }, + { + "epoch": 0.7698158051099228, + "grad_norm": 1.1369371413828187, + "learning_rate": 2.6520519865052476e-06, + "loss": 0.1573, + "step": 9717 + }, + { + "epoch": 0.7698950287185581, + "grad_norm": 1.5582316550777615, + "learning_rate": 2.6503117161419246e-06, + "loss": 0.2785, + "step": 9718 + }, + { + "epoch": 0.7699742523271935, + "grad_norm": 1.0793850245542824, + "learning_rate": 2.6485719297202127e-06, + "loss": 0.1452, + "step": 9719 + }, + { + "epoch": 0.7700534759358288, + "grad_norm": 1.2917755849869386, + "learning_rate": 2.646832627354667e-06, + "loss": 0.257, + "step": 9720 + }, + { + "epoch": 0.7701326995444643, + "grad_norm": 0.9475948104829591, + "learning_rate": 2.645093809159809e-06, + "loss": 0.194, + "step": 9721 + }, + { + "epoch": 0.7702119231530996, + "grad_norm": 1.4620853099101796, + "learning_rate": 2.643355475250137e-06, + "loss": 0.3362, + "step": 9722 + }, + { + "epoch": 0.770291146761735, + "grad_norm": 1.1663102274819164, + "learning_rate": 2.6416176257401083e-06, + "loss": 0.2278, + "step": 9723 + }, + { + "epoch": 0.7703703703703704, + "grad_norm": 1.3870115097687417, + "learning_rate": 2.639880260744151e-06, + "loss": 0.2143, + "step": 9724 + }, + { + "epoch": 0.7704495939790057, + "grad_norm": 1.6102421385417403, + "learning_rate": 2.6381433803766654e-06, + "loss": 0.2995, + "step": 9725 + }, + { + "epoch": 0.7705288175876411, + "grad_norm": 1.2639052746443182, + "learning_rate": 2.6364069847520155e-06, + "loss": 0.2011, + "step": 9726 + }, + { + "epoch": 0.7706080411962765, + "grad_norm": 1.0391428470685706, + "learning_rate": 2.6346710739845317e-06, + "loss": 0.1407, + "step": 9727 + }, + { + "epoch": 0.7706872648049119, + "grad_norm": 1.2638084198101853, + "learning_rate": 2.6329356481885215e-06, + "loss": 0.1783, + "step": 9728 + }, + { + "epoch": 0.7707664884135472, + "grad_norm": 1.2691448031715797, + "learning_rate": 2.6312007074782497e-06, + "loss": 0.1933, + "step": 9729 + }, + { + "epoch": 0.7708457120221827, + "grad_norm": 1.4917692481855218, + "learning_rate": 2.6294662519679525e-06, + "loss": 0.2648, + "step": 9730 + }, + { + "epoch": 0.770924935630818, + "grad_norm": 1.1915371161373722, + "learning_rate": 2.627732281771841e-06, + "loss": 0.1371, + "step": 9731 + }, + { + "epoch": 0.7710041592394533, + "grad_norm": 1.0843401463639948, + "learning_rate": 2.6259987970040858e-06, + "loss": 0.158, + "step": 9732 + }, + { + "epoch": 0.7710833828480887, + "grad_norm": 1.3029022926510931, + "learning_rate": 2.6242657977788277e-06, + "loss": 0.1934, + "step": 9733 + }, + { + "epoch": 0.7711626064567241, + "grad_norm": 1.3933167070185897, + "learning_rate": 2.6225332842101746e-06, + "loss": 0.2154, + "step": 9734 + }, + { + "epoch": 0.7712418300653595, + "grad_norm": 1.5026971001230482, + "learning_rate": 2.6208012564122097e-06, + "loss": 0.1992, + "step": 9735 + }, + { + "epoch": 0.7713210536739948, + "grad_norm": 1.4474525946262127, + "learning_rate": 2.6190697144989753e-06, + "loss": 0.2217, + "step": 9736 + }, + { + "epoch": 0.7714002772826303, + "grad_norm": 1.2414046060407191, + "learning_rate": 2.617338658584483e-06, + "loss": 0.2242, + "step": 9737 + }, + { + "epoch": 0.7714795008912656, + "grad_norm": 1.264662520040703, + "learning_rate": 2.6156080887827183e-06, + "loss": 0.1982, + "step": 9738 + }, + { + "epoch": 0.7715587244999009, + "grad_norm": 1.471965184920465, + "learning_rate": 2.613878005207631e-06, + "loss": 0.2816, + "step": 9739 + }, + { + "epoch": 0.7716379481085364, + "grad_norm": 0.9852968884174986, + "learning_rate": 2.612148407973134e-06, + "loss": 0.1191, + "step": 9740 + }, + { + "epoch": 0.7717171717171717, + "grad_norm": 1.4947392045049763, + "learning_rate": 2.6104192971931197e-06, + "loss": 0.2491, + "step": 9741 + }, + { + "epoch": 0.7717963953258071, + "grad_norm": 1.4711988183903855, + "learning_rate": 2.6086906729814378e-06, + "loss": 0.2991, + "step": 9742 + }, + { + "epoch": 0.7718756189344425, + "grad_norm": 1.4129032041874277, + "learning_rate": 2.606962535451907e-06, + "loss": 0.1856, + "step": 9743 + }, + { + "epoch": 0.7719548425430779, + "grad_norm": 1.0585597784984708, + "learning_rate": 2.605234884718324e-06, + "loss": 0.1695, + "step": 9744 + }, + { + "epoch": 0.7720340661517132, + "grad_norm": 1.3450712662523423, + "learning_rate": 2.6035077208944416e-06, + "loss": 0.2678, + "step": 9745 + }, + { + "epoch": 0.7721132897603485, + "grad_norm": 1.068305692416248, + "learning_rate": 2.601781044093984e-06, + "loss": 0.1412, + "step": 9746 + }, + { + "epoch": 0.772192513368984, + "grad_norm": 1.2969221167205398, + "learning_rate": 2.600054854430649e-06, + "loss": 0.2098, + "step": 9747 + }, + { + "epoch": 0.7722717369776193, + "grad_norm": 1.570730164246765, + "learning_rate": 2.5983291520180965e-06, + "loss": 0.2825, + "step": 9748 + }, + { + "epoch": 0.7723509605862547, + "grad_norm": 1.4732032095279834, + "learning_rate": 2.5966039369699537e-06, + "loss": 0.2761, + "step": 9749 + }, + { + "epoch": 0.7724301841948901, + "grad_norm": 1.5011264372345434, + "learning_rate": 2.5948792093998167e-06, + "loss": 0.2623, + "step": 9750 + }, + { + "epoch": 0.7725094078035255, + "grad_norm": 1.3924967768554901, + "learning_rate": 2.5931549694212545e-06, + "loss": 0.2309, + "step": 9751 + }, + { + "epoch": 0.7725886314121608, + "grad_norm": 2.087296032616522, + "learning_rate": 2.5914312171477983e-06, + "loss": 0.2276, + "step": 9752 + }, + { + "epoch": 0.7726678550207962, + "grad_norm": 1.179237864240833, + "learning_rate": 2.589707952692947e-06, + "loss": 0.1556, + "step": 9753 + }, + { + "epoch": 0.7727470786294316, + "grad_norm": 1.1975446071149518, + "learning_rate": 2.5879851761701724e-06, + "loss": 0.187, + "step": 9754 + }, + { + "epoch": 0.7728263022380669, + "grad_norm": 1.0076508648276241, + "learning_rate": 2.586262887692911e-06, + "loss": 0.1276, + "step": 9755 + }, + { + "epoch": 0.7729055258467024, + "grad_norm": 1.1773819719638143, + "learning_rate": 2.5845410873745614e-06, + "loss": 0.2286, + "step": 9756 + }, + { + "epoch": 0.7729847494553377, + "grad_norm": 1.8822293925967903, + "learning_rate": 2.5828197753285043e-06, + "loss": 0.3079, + "step": 9757 + }, + { + "epoch": 0.773063973063973, + "grad_norm": 1.2767485432948658, + "learning_rate": 2.581098951668075e-06, + "loss": 0.2438, + "step": 9758 + }, + { + "epoch": 0.7731431966726084, + "grad_norm": 1.4030859500932207, + "learning_rate": 2.5793786165065805e-06, + "loss": 0.2347, + "step": 9759 + }, + { + "epoch": 0.7732224202812438, + "grad_norm": 1.3994373534062892, + "learning_rate": 2.5776587699573007e-06, + "loss": 0.2039, + "step": 9760 + }, + { + "epoch": 0.7733016438898792, + "grad_norm": 1.4011803800677545, + "learning_rate": 2.5759394121334767e-06, + "loss": 0.2197, + "step": 9761 + }, + { + "epoch": 0.7733808674985145, + "grad_norm": 1.0681782158546003, + "learning_rate": 2.57422054314832e-06, + "loss": 0.1544, + "step": 9762 + }, + { + "epoch": 0.77346009110715, + "grad_norm": 1.3640290680430198, + "learning_rate": 2.572502163115007e-06, + "loss": 0.1993, + "step": 9763 + }, + { + "epoch": 0.7735393147157853, + "grad_norm": 1.8237249707028338, + "learning_rate": 2.5707842721466914e-06, + "loss": 0.3321, + "step": 9764 + }, + { + "epoch": 0.7736185383244206, + "grad_norm": 1.4114956114890511, + "learning_rate": 2.5690668703564835e-06, + "loss": 0.2341, + "step": 9765 + }, + { + "epoch": 0.7736977619330561, + "grad_norm": 2.2349017314993844, + "learning_rate": 2.5673499578574644e-06, + "loss": 0.2746, + "step": 9766 + }, + { + "epoch": 0.7737769855416914, + "grad_norm": 1.6321926073560515, + "learning_rate": 2.565633534762689e-06, + "loss": 0.2406, + "step": 9767 + }, + { + "epoch": 0.7738562091503268, + "grad_norm": 1.588654883202697, + "learning_rate": 2.5639176011851753e-06, + "loss": 0.3065, + "step": 9768 + }, + { + "epoch": 0.7739354327589621, + "grad_norm": 1.4625348158453002, + "learning_rate": 2.562202157237903e-06, + "loss": 0.2306, + "step": 9769 + }, + { + "epoch": 0.7740146563675976, + "grad_norm": 1.4127855907656617, + "learning_rate": 2.5604872030338336e-06, + "loss": 0.2176, + "step": 9770 + }, + { + "epoch": 0.7740938799762329, + "grad_norm": 1.1559966829911, + "learning_rate": 2.5587727386858853e-06, + "loss": 0.2204, + "step": 9771 + }, + { + "epoch": 0.7741731035848682, + "grad_norm": 1.3936104441926196, + "learning_rate": 2.5570587643069435e-06, + "loss": 0.2191, + "step": 9772 + }, + { + "epoch": 0.7742523271935037, + "grad_norm": 1.4867912888290955, + "learning_rate": 2.555345280009872e-06, + "loss": 0.2951, + "step": 9773 + }, + { + "epoch": 0.774331550802139, + "grad_norm": 1.2391187519545641, + "learning_rate": 2.5536322859074934e-06, + "loss": 0.2176, + "step": 9774 + }, + { + "epoch": 0.7744107744107744, + "grad_norm": 1.3700158717381583, + "learning_rate": 2.551919782112596e-06, + "loss": 0.2926, + "step": 9775 + }, + { + "epoch": 0.7744899980194098, + "grad_norm": 1.4758559459077216, + "learning_rate": 2.550207768737949e-06, + "loss": 0.3297, + "step": 9776 + }, + { + "epoch": 0.7745692216280452, + "grad_norm": 1.4231930240261537, + "learning_rate": 2.54849624589627e-06, + "loss": 0.2502, + "step": 9777 + }, + { + "epoch": 0.7746484452366805, + "grad_norm": 1.277991131774114, + "learning_rate": 2.546785213700258e-06, + "loss": 0.1998, + "step": 9778 + }, + { + "epoch": 0.7747276688453159, + "grad_norm": 1.0933793358100243, + "learning_rate": 2.5450746722625785e-06, + "loss": 0.1578, + "step": 9779 + }, + { + "epoch": 0.7748068924539513, + "grad_norm": 1.2826164703801584, + "learning_rate": 2.5433646216958617e-06, + "loss": 0.2039, + "step": 9780 + }, + { + "epoch": 0.7748861160625866, + "grad_norm": 1.2445057308386933, + "learning_rate": 2.5416550621127024e-06, + "loss": 0.1823, + "step": 9781 + }, + { + "epoch": 0.774965339671222, + "grad_norm": 1.128413867916967, + "learning_rate": 2.539945993625673e-06, + "loss": 0.128, + "step": 9782 + }, + { + "epoch": 0.7750445632798574, + "grad_norm": 1.6025571787779858, + "learning_rate": 2.5382374163473046e-06, + "loss": 0.2727, + "step": 9783 + }, + { + "epoch": 0.7751237868884928, + "grad_norm": 1.1236097844657231, + "learning_rate": 2.536529330390095e-06, + "loss": 0.1779, + "step": 9784 + }, + { + "epoch": 0.7752030104971281, + "grad_norm": 1.4867734272149344, + "learning_rate": 2.5348217358665207e-06, + "loss": 0.2373, + "step": 9785 + }, + { + "epoch": 0.7752822341057635, + "grad_norm": 1.5319555874516364, + "learning_rate": 2.5331146328890145e-06, + "loss": 0.2603, + "step": 9786 + }, + { + "epoch": 0.7753614577143989, + "grad_norm": 1.4968446226011585, + "learning_rate": 2.5314080215699822e-06, + "loss": 0.2673, + "step": 9787 + }, + { + "epoch": 0.7754406813230342, + "grad_norm": 1.5590768794546803, + "learning_rate": 2.5297019020217904e-06, + "loss": 0.2624, + "step": 9788 + }, + { + "epoch": 0.7755199049316697, + "grad_norm": 1.288873511730507, + "learning_rate": 2.5279962743567877e-06, + "loss": 0.2038, + "step": 9789 + }, + { + "epoch": 0.775599128540305, + "grad_norm": 1.2411919203457322, + "learning_rate": 2.526291138687278e-06, + "loss": 0.1737, + "step": 9790 + }, + { + "epoch": 0.7756783521489404, + "grad_norm": 1.8322792847583997, + "learning_rate": 2.5245864951255317e-06, + "loss": 0.3471, + "step": 9791 + }, + { + "epoch": 0.7757575757575758, + "grad_norm": 1.5299762404464519, + "learning_rate": 2.522882343783799e-06, + "loss": 0.2267, + "step": 9792 + }, + { + "epoch": 0.7758367993662111, + "grad_norm": 1.274294858630166, + "learning_rate": 2.521178684774286e-06, + "loss": 0.2005, + "step": 9793 + }, + { + "epoch": 0.7759160229748465, + "grad_norm": 1.621905189167536, + "learning_rate": 2.519475518209167e-06, + "loss": 0.3123, + "step": 9794 + }, + { + "epoch": 0.7759952465834818, + "grad_norm": 1.3538410344102365, + "learning_rate": 2.5177728442005956e-06, + "loss": 0.1818, + "step": 9795 + }, + { + "epoch": 0.7760744701921173, + "grad_norm": 1.5069096239824853, + "learning_rate": 2.516070662860679e-06, + "loss": 0.3361, + "step": 9796 + }, + { + "epoch": 0.7761536938007526, + "grad_norm": 1.3526837243609664, + "learning_rate": 2.5143689743014966e-06, + "loss": 0.224, + "step": 9797 + }, + { + "epoch": 0.776232917409388, + "grad_norm": 1.3140397788197928, + "learning_rate": 2.5126677786351005e-06, + "loss": 0.2085, + "step": 9798 + }, + { + "epoch": 0.7763121410180234, + "grad_norm": 1.291769236508765, + "learning_rate": 2.5109670759735063e-06, + "loss": 0.1774, + "step": 9799 + }, + { + "epoch": 0.7763913646266587, + "grad_norm": 1.1257316434451494, + "learning_rate": 2.509266866428691e-06, + "loss": 0.187, + "step": 9800 + }, + { + "epoch": 0.7764705882352941, + "grad_norm": 1.1852026694356723, + "learning_rate": 2.507567150112613e-06, + "loss": 0.2007, + "step": 9801 + }, + { + "epoch": 0.7765498118439295, + "grad_norm": 1.4961105200209923, + "learning_rate": 2.5058679271371865e-06, + "loss": 0.2072, + "step": 9802 + }, + { + "epoch": 0.7766290354525649, + "grad_norm": 1.3459071584133198, + "learning_rate": 2.504169197614298e-06, + "loss": 0.2177, + "step": 9803 + }, + { + "epoch": 0.7767082590612002, + "grad_norm": 0.91136821091373, + "learning_rate": 2.5024709616557964e-06, + "loss": 0.1261, + "step": 9804 + }, + { + "epoch": 0.7767874826698357, + "grad_norm": 1.3580249630504875, + "learning_rate": 2.500773219373509e-06, + "loss": 0.2736, + "step": 9805 + }, + { + "epoch": 0.776866706278471, + "grad_norm": 1.1835191512655656, + "learning_rate": 2.499075970879222e-06, + "loss": 0.1916, + "step": 9806 + }, + { + "epoch": 0.7769459298871063, + "grad_norm": 1.4384152599467444, + "learning_rate": 2.4973792162846878e-06, + "loss": 0.2339, + "step": 9807 + }, + { + "epoch": 0.7770251534957417, + "grad_norm": 1.704774782212089, + "learning_rate": 2.4956829557016336e-06, + "loss": 0.3013, + "step": 9808 + }, + { + "epoch": 0.7771043771043771, + "grad_norm": 1.1597230656190647, + "learning_rate": 2.493987189241749e-06, + "loss": 0.1611, + "step": 9809 + }, + { + "epoch": 0.7771836007130125, + "grad_norm": 1.3320295329712077, + "learning_rate": 2.4922919170166883e-06, + "loss": 0.2666, + "step": 9810 + }, + { + "epoch": 0.7772628243216478, + "grad_norm": 1.2956372994974583, + "learning_rate": 2.4905971391380823e-06, + "loss": 0.2342, + "step": 9811 + }, + { + "epoch": 0.7773420479302833, + "grad_norm": 1.4098237361886397, + "learning_rate": 2.488902855717522e-06, + "loss": 0.2352, + "step": 9812 + }, + { + "epoch": 0.7774212715389186, + "grad_norm": 1.4430902858066355, + "learning_rate": 2.487209066866565e-06, + "loss": 0.2438, + "step": 9813 + }, + { + "epoch": 0.7775004951475539, + "grad_norm": 1.325367469354069, + "learning_rate": 2.485515772696745e-06, + "loss": 0.2683, + "step": 9814 + }, + { + "epoch": 0.7775797187561894, + "grad_norm": 1.6190655999350856, + "learning_rate": 2.483822973319553e-06, + "loss": 0.2791, + "step": 9815 + }, + { + "epoch": 0.7776589423648247, + "grad_norm": 1.686175871770783, + "learning_rate": 2.482130668846451e-06, + "loss": 0.2646, + "step": 9816 + }, + { + "epoch": 0.7777381659734601, + "grad_norm": 1.2999105474559334, + "learning_rate": 2.480438859388873e-06, + "loss": 0.1858, + "step": 9817 + }, + { + "epoch": 0.7778173895820955, + "grad_norm": 1.3646265368255728, + "learning_rate": 2.4787475450582133e-06, + "loss": 0.1911, + "step": 9818 + }, + { + "epoch": 0.7778966131907309, + "grad_norm": 1.194040823596541, + "learning_rate": 2.4770567259658386e-06, + "loss": 0.1936, + "step": 9819 + }, + { + "epoch": 0.7779758367993662, + "grad_norm": 1.2695115005678987, + "learning_rate": 2.4753664022230783e-06, + "loss": 0.2075, + "step": 9820 + }, + { + "epoch": 0.7780550604080015, + "grad_norm": 1.163372999305103, + "learning_rate": 2.473676573941236e-06, + "loss": 0.1783, + "step": 9821 + }, + { + "epoch": 0.778134284016637, + "grad_norm": 1.1635340373882301, + "learning_rate": 2.471987241231577e-06, + "loss": 0.2323, + "step": 9822 + }, + { + "epoch": 0.7782135076252723, + "grad_norm": 1.3777705748488132, + "learning_rate": 2.4702984042053335e-06, + "loss": 0.2314, + "step": 9823 + }, + { + "epoch": 0.7782927312339077, + "grad_norm": 1.38497452629102, + "learning_rate": 2.468610062973712e-06, + "loss": 0.1805, + "step": 9824 + }, + { + "epoch": 0.7783719548425431, + "grad_norm": 1.3448713104542693, + "learning_rate": 2.466922217647879e-06, + "loss": 0.1973, + "step": 9825 + }, + { + "epoch": 0.7784511784511785, + "grad_norm": 1.5234148565987975, + "learning_rate": 2.465234868338968e-06, + "loss": 0.2097, + "step": 9826 + }, + { + "epoch": 0.7785304020598138, + "grad_norm": 1.2804172945685526, + "learning_rate": 2.4635480151580902e-06, + "loss": 0.252, + "step": 9827 + }, + { + "epoch": 0.7786096256684492, + "grad_norm": 1.1899370919299495, + "learning_rate": 2.461861658216311e-06, + "loss": 0.2178, + "step": 9828 + }, + { + "epoch": 0.7786888492770846, + "grad_norm": 1.3986738066004556, + "learning_rate": 2.4601757976246685e-06, + "loss": 0.242, + "step": 9829 + }, + { + "epoch": 0.7787680728857199, + "grad_norm": 1.2427130278414202, + "learning_rate": 2.4584904334941728e-06, + "loss": 0.1395, + "step": 9830 + }, + { + "epoch": 0.7788472964943554, + "grad_norm": 1.2550569843571655, + "learning_rate": 2.456805565935795e-06, + "loss": 0.2126, + "step": 9831 + }, + { + "epoch": 0.7789265201029907, + "grad_norm": 1.3278799203293916, + "learning_rate": 2.4551211950604713e-06, + "loss": 0.2362, + "step": 9832 + }, + { + "epoch": 0.7790057437116261, + "grad_norm": 1.4787427880788118, + "learning_rate": 2.4534373209791162e-06, + "loss": 0.2312, + "step": 9833 + }, + { + "epoch": 0.7790849673202614, + "grad_norm": 1.3342973874062503, + "learning_rate": 2.451753943802603e-06, + "loss": 0.2501, + "step": 9834 + }, + { + "epoch": 0.7791641909288968, + "grad_norm": 1.3620104584058979, + "learning_rate": 2.4500710636417725e-06, + "loss": 0.2311, + "step": 9835 + }, + { + "epoch": 0.7792434145375322, + "grad_norm": 1.1549766958184322, + "learning_rate": 2.4483886806074308e-06, + "loss": 0.1735, + "step": 9836 + }, + { + "epoch": 0.7793226381461675, + "grad_norm": 1.4914316260343865, + "learning_rate": 2.4467067948103616e-06, + "loss": 0.3064, + "step": 9837 + }, + { + "epoch": 0.779401861754803, + "grad_norm": 1.4867861831483329, + "learning_rate": 2.4450254063613056e-06, + "loss": 0.2603, + "step": 9838 + }, + { + "epoch": 0.7794810853634383, + "grad_norm": 1.1722369248622868, + "learning_rate": 2.4433445153709722e-06, + "loss": 0.1299, + "step": 9839 + }, + { + "epoch": 0.7795603089720736, + "grad_norm": 1.6269310481454606, + "learning_rate": 2.441664121950045e-06, + "loss": 0.2723, + "step": 9840 + }, + { + "epoch": 0.7796395325807091, + "grad_norm": 1.6435256799912368, + "learning_rate": 2.439984226209167e-06, + "loss": 0.2002, + "step": 9841 + }, + { + "epoch": 0.7797187561893444, + "grad_norm": 1.2210533318748373, + "learning_rate": 2.438304828258947e-06, + "loss": 0.2124, + "step": 9842 + }, + { + "epoch": 0.7797979797979798, + "grad_norm": 1.5488817003804343, + "learning_rate": 2.4366259282099737e-06, + "loss": 0.2708, + "step": 9843 + }, + { + "epoch": 0.7798772034066151, + "grad_norm": 1.1249739446523817, + "learning_rate": 2.4349475261727905e-06, + "loss": 0.2071, + "step": 9844 + }, + { + "epoch": 0.7799564270152506, + "grad_norm": 1.495207088742677, + "learning_rate": 2.4332696222579078e-06, + "loss": 0.233, + "step": 9845 + }, + { + "epoch": 0.7800356506238859, + "grad_norm": 1.3890539034197336, + "learning_rate": 2.4315922165758154e-06, + "loss": 0.2772, + "step": 9846 + }, + { + "epoch": 0.7801148742325212, + "grad_norm": 1.1698237027329774, + "learning_rate": 2.4299153092369598e-06, + "loss": 0.1939, + "step": 9847 + }, + { + "epoch": 0.7801940978411567, + "grad_norm": 1.3966118506002616, + "learning_rate": 2.428238900351755e-06, + "loss": 0.2102, + "step": 9848 + }, + { + "epoch": 0.780273321449792, + "grad_norm": 1.1226373455404322, + "learning_rate": 2.426562990030582e-06, + "loss": 0.174, + "step": 9849 + }, + { + "epoch": 0.7803525450584274, + "grad_norm": 1.2347883158850974, + "learning_rate": 2.424887578383799e-06, + "loss": 0.2374, + "step": 9850 + }, + { + "epoch": 0.7804317686670628, + "grad_norm": 1.5164522983368813, + "learning_rate": 2.4232126655217202e-06, + "loss": 0.2444, + "step": 9851 + }, + { + "epoch": 0.7805109922756982, + "grad_norm": 1.081402277973487, + "learning_rate": 2.421538251554627e-06, + "loss": 0.1829, + "step": 9852 + }, + { + "epoch": 0.7805902158843335, + "grad_norm": 1.1776046044388244, + "learning_rate": 2.4198643365927767e-06, + "loss": 0.197, + "step": 9853 + }, + { + "epoch": 0.7806694394929689, + "grad_norm": 1.1795129618040738, + "learning_rate": 2.4181909207463873e-06, + "loss": 0.1705, + "step": 9854 + }, + { + "epoch": 0.7807486631016043, + "grad_norm": 1.1989535403746896, + "learning_rate": 2.4165180041256444e-06, + "loss": 0.2565, + "step": 9855 + }, + { + "epoch": 0.7808278867102396, + "grad_norm": 1.1321351278901886, + "learning_rate": 2.4148455868407015e-06, + "loss": 0.1942, + "step": 9856 + }, + { + "epoch": 0.780907110318875, + "grad_norm": 1.6103943661973856, + "learning_rate": 2.413173669001676e-06, + "loss": 0.2792, + "step": 9857 + }, + { + "epoch": 0.7809863339275104, + "grad_norm": 1.3498004566441262, + "learning_rate": 2.4115022507186626e-06, + "loss": 0.165, + "step": 9858 + }, + { + "epoch": 0.7810655575361458, + "grad_norm": 1.4002256424804078, + "learning_rate": 2.409831332101712e-06, + "loss": 0.1966, + "step": 9859 + }, + { + "epoch": 0.7811447811447811, + "grad_norm": 1.4073168574616537, + "learning_rate": 2.4081609132608464e-06, + "loss": 0.2263, + "step": 9860 + }, + { + "epoch": 0.7812240047534165, + "grad_norm": 1.30523152072414, + "learning_rate": 2.406490994306052e-06, + "loss": 0.2838, + "step": 9861 + }, + { + "epoch": 0.7813032283620519, + "grad_norm": 1.1465749574652757, + "learning_rate": 2.4048215753472914e-06, + "loss": 0.1992, + "step": 9862 + }, + { + "epoch": 0.7813824519706872, + "grad_norm": 1.6836733865332354, + "learning_rate": 2.403152656494485e-06, + "loss": 0.2525, + "step": 9863 + }, + { + "epoch": 0.7814616755793227, + "grad_norm": 1.3334983347970941, + "learning_rate": 2.401484237857519e-06, + "loss": 0.2664, + "step": 9864 + }, + { + "epoch": 0.781540899187958, + "grad_norm": 1.2727137205827692, + "learning_rate": 2.3998163195462565e-06, + "loss": 0.1925, + "step": 9865 + }, + { + "epoch": 0.7816201227965934, + "grad_norm": 1.2162621333401924, + "learning_rate": 2.398148901670521e-06, + "loss": 0.2598, + "step": 9866 + }, + { + "epoch": 0.7816993464052288, + "grad_norm": 1.1008673990444313, + "learning_rate": 2.396481984340098e-06, + "loss": 0.1268, + "step": 9867 + }, + { + "epoch": 0.7817785700138641, + "grad_norm": 1.2822854896648175, + "learning_rate": 2.3948155676647546e-06, + "loss": 0.1421, + "step": 9868 + }, + { + "epoch": 0.7818577936224995, + "grad_norm": 1.8116326552945974, + "learning_rate": 2.393149651754212e-06, + "loss": 0.2408, + "step": 9869 + }, + { + "epoch": 0.7819370172311348, + "grad_norm": 1.1781609148244916, + "learning_rate": 2.391484236718159e-06, + "loss": 0.1898, + "step": 9870 + }, + { + "epoch": 0.7820162408397703, + "grad_norm": 1.8513002772033784, + "learning_rate": 2.389819322666264e-06, + "loss": 0.202, + "step": 9871 + }, + { + "epoch": 0.7820954644484056, + "grad_norm": 1.3133458015195805, + "learning_rate": 2.3881549097081467e-06, + "loss": 0.1942, + "step": 9872 + }, + { + "epoch": 0.782174688057041, + "grad_norm": 1.7831442212345616, + "learning_rate": 2.3864909979534044e-06, + "loss": 0.2722, + "step": 9873 + }, + { + "epoch": 0.7822539116656764, + "grad_norm": 1.2140586418904127, + "learning_rate": 2.3848275875115925e-06, + "loss": 0.2197, + "step": 9874 + }, + { + "epoch": 0.7823331352743117, + "grad_norm": 1.424360517002461, + "learning_rate": 2.3831646784922446e-06, + "loss": 0.2347, + "step": 9875 + }, + { + "epoch": 0.7824123588829471, + "grad_norm": 1.8904571960955172, + "learning_rate": 2.381502271004853e-06, + "loss": 0.3165, + "step": 9876 + }, + { + "epoch": 0.7824915824915825, + "grad_norm": 1.392190133827388, + "learning_rate": 2.3798403651588765e-06, + "loss": 0.2685, + "step": 9877 + }, + { + "epoch": 0.7825708061002179, + "grad_norm": 1.5229441541479927, + "learning_rate": 2.3781789610637483e-06, + "loss": 0.336, + "step": 9878 + }, + { + "epoch": 0.7826500297088532, + "grad_norm": 1.344870866671153, + "learning_rate": 2.376518058828863e-06, + "loss": 0.2437, + "step": 9879 + }, + { + "epoch": 0.7827292533174887, + "grad_norm": 1.1635431213517604, + "learning_rate": 2.3748576585635774e-06, + "loss": 0.1958, + "step": 9880 + }, + { + "epoch": 0.782808476926124, + "grad_norm": 1.2723448606048013, + "learning_rate": 2.373197760377228e-06, + "loss": 0.2052, + "step": 9881 + }, + { + "epoch": 0.7828877005347593, + "grad_norm": 1.636000304317728, + "learning_rate": 2.371538364379109e-06, + "loss": 0.2451, + "step": 9882 + }, + { + "epoch": 0.7829669241433947, + "grad_norm": 1.2617145233470228, + "learning_rate": 2.36987947067848e-06, + "loss": 0.2691, + "step": 9883 + }, + { + "epoch": 0.7830461477520301, + "grad_norm": 1.3400291145152263, + "learning_rate": 2.368221079384577e-06, + "loss": 0.2098, + "step": 9884 + }, + { + "epoch": 0.7831253713606655, + "grad_norm": 1.3312196768751225, + "learning_rate": 2.3665631906065933e-06, + "loss": 0.2294, + "step": 9885 + }, + { + "epoch": 0.7832045949693008, + "grad_norm": 1.1271118797356454, + "learning_rate": 2.364905804453692e-06, + "loss": 0.1843, + "step": 9886 + }, + { + "epoch": 0.7832838185779363, + "grad_norm": 1.2462825292571693, + "learning_rate": 2.3632489210350074e-06, + "loss": 0.1737, + "step": 9887 + }, + { + "epoch": 0.7833630421865716, + "grad_norm": 1.036274618253312, + "learning_rate": 2.361592540459636e-06, + "loss": 0.1428, + "step": 9888 + }, + { + "epoch": 0.7834422657952069, + "grad_norm": 1.367322973295281, + "learning_rate": 2.3599366628366427e-06, + "loss": 0.2396, + "step": 9889 + }, + { + "epoch": 0.7835214894038424, + "grad_norm": 1.500516288589599, + "learning_rate": 2.358281288275055e-06, + "loss": 0.28, + "step": 9890 + }, + { + "epoch": 0.7836007130124777, + "grad_norm": 1.2892530881249833, + "learning_rate": 2.356626416883878e-06, + "loss": 0.2377, + "step": 9891 + }, + { + "epoch": 0.7836799366211131, + "grad_norm": 1.7851417420327353, + "learning_rate": 2.354972048772074e-06, + "loss": 0.2475, + "step": 9892 + }, + { + "epoch": 0.7837591602297485, + "grad_norm": 1.037699305032629, + "learning_rate": 2.353318184048573e-06, + "loss": 0.1255, + "step": 9893 + }, + { + "epoch": 0.7838383838383839, + "grad_norm": 1.087055042943525, + "learning_rate": 2.351664822822277e-06, + "loss": 0.1387, + "step": 9894 + }, + { + "epoch": 0.7839176074470192, + "grad_norm": 1.2408409281557684, + "learning_rate": 2.3500119652020526e-06, + "loss": 0.217, + "step": 9895 + }, + { + "epoch": 0.7839968310556545, + "grad_norm": 1.3334025655193391, + "learning_rate": 2.348359611296728e-06, + "loss": 0.2555, + "step": 9896 + }, + { + "epoch": 0.78407605466429, + "grad_norm": 1.4739817711549104, + "learning_rate": 2.346707761215108e-06, + "loss": 0.2154, + "step": 9897 + }, + { + "epoch": 0.7841552782729253, + "grad_norm": 1.2291750974138853, + "learning_rate": 2.345056415065956e-06, + "loss": 0.2162, + "step": 9898 + }, + { + "epoch": 0.7842345018815607, + "grad_norm": 1.4078494400038302, + "learning_rate": 2.343405572958004e-06, + "loss": 0.2383, + "step": 9899 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 1.4286986823989243, + "learning_rate": 2.341755234999956e-06, + "loss": 0.3231, + "step": 9900 + }, + { + "epoch": 0.7843929490988315, + "grad_norm": 1.4633301709602267, + "learning_rate": 2.3401054013004776e-06, + "loss": 0.2451, + "step": 9901 + }, + { + "epoch": 0.7844721727074668, + "grad_norm": 1.2974481192501213, + "learning_rate": 2.338456071968198e-06, + "loss": 0.2193, + "step": 9902 + }, + { + "epoch": 0.7845513963161022, + "grad_norm": 1.3732503791224975, + "learning_rate": 2.336807247111723e-06, + "loss": 0.2161, + "step": 9903 + }, + { + "epoch": 0.7846306199247376, + "grad_norm": 1.2483504707354818, + "learning_rate": 2.3351589268396193e-06, + "loss": 0.194, + "step": 9904 + }, + { + "epoch": 0.7847098435333729, + "grad_norm": 1.4166070919215041, + "learning_rate": 2.3335111112604194e-06, + "loss": 0.2603, + "step": 9905 + }, + { + "epoch": 0.7847890671420084, + "grad_norm": 1.3310924005256144, + "learning_rate": 2.33186380048262e-06, + "loss": 0.2891, + "step": 9906 + }, + { + "epoch": 0.7848682907506437, + "grad_norm": 1.3732942770407457, + "learning_rate": 2.330216994614696e-06, + "loss": 0.2719, + "step": 9907 + }, + { + "epoch": 0.7849475143592791, + "grad_norm": 1.4653131419759486, + "learning_rate": 2.3285706937650786e-06, + "loss": 0.2239, + "step": 9908 + }, + { + "epoch": 0.7850267379679144, + "grad_norm": 1.4753549367756522, + "learning_rate": 2.3269248980421653e-06, + "loss": 0.256, + "step": 9909 + }, + { + "epoch": 0.7851059615765498, + "grad_norm": 1.1013197545678601, + "learning_rate": 2.3252796075543295e-06, + "loss": 0.1526, + "step": 9910 + }, + { + "epoch": 0.7851851851851852, + "grad_norm": 1.0476398735857544, + "learning_rate": 2.3236348224099038e-06, + "loss": 0.1783, + "step": 9911 + }, + { + "epoch": 0.7852644087938205, + "grad_norm": 1.6738783342305954, + "learning_rate": 2.3219905427171864e-06, + "loss": 0.2963, + "step": 9912 + }, + { + "epoch": 0.785343632402456, + "grad_norm": 1.5196516826877013, + "learning_rate": 2.320346768584449e-06, + "loss": 0.2409, + "step": 9913 + }, + { + "epoch": 0.7854228560110913, + "grad_norm": 1.2553838154213728, + "learning_rate": 2.3187035001199254e-06, + "loss": 0.1451, + "step": 9914 + }, + { + "epoch": 0.7855020796197266, + "grad_norm": 1.3606211542973146, + "learning_rate": 2.317060737431813e-06, + "loss": 0.2268, + "step": 9915 + }, + { + "epoch": 0.7855813032283621, + "grad_norm": 1.1879794831866943, + "learning_rate": 2.3154184806282863e-06, + "loss": 0.1611, + "step": 9916 + }, + { + "epoch": 0.7856605268369974, + "grad_norm": 1.3529114657561345, + "learning_rate": 2.3137767298174774e-06, + "loss": 0.1757, + "step": 9917 + }, + { + "epoch": 0.7857397504456328, + "grad_norm": 1.283361078893396, + "learning_rate": 2.312135485107486e-06, + "loss": 0.1776, + "step": 9918 + }, + { + "epoch": 0.7858189740542681, + "grad_norm": 1.1860801780667327, + "learning_rate": 2.3104947466063785e-06, + "loss": 0.1919, + "step": 9919 + }, + { + "epoch": 0.7858981976629036, + "grad_norm": 1.0042676441401837, + "learning_rate": 2.3088545144221964e-06, + "loss": 0.1079, + "step": 9920 + }, + { + "epoch": 0.7859774212715389, + "grad_norm": 1.6201670304917153, + "learning_rate": 2.307214788662936e-06, + "loss": 0.323, + "step": 9921 + }, + { + "epoch": 0.7860566448801742, + "grad_norm": 1.4393124213212198, + "learning_rate": 2.3055755694365644e-06, + "loss": 0.2208, + "step": 9922 + }, + { + "epoch": 0.7861358684888097, + "grad_norm": 1.1975356183472057, + "learning_rate": 2.303936856851021e-06, + "loss": 0.1589, + "step": 9923 + }, + { + "epoch": 0.786215092097445, + "grad_norm": 1.0148862121154767, + "learning_rate": 2.302298651014204e-06, + "loss": 0.1356, + "step": 9924 + }, + { + "epoch": 0.7862943157060804, + "grad_norm": 1.1291302129390803, + "learning_rate": 2.3006609520339796e-06, + "loss": 0.1434, + "step": 9925 + }, + { + "epoch": 0.7863735393147158, + "grad_norm": 1.6482819136838143, + "learning_rate": 2.2990237600181864e-06, + "loss": 0.3151, + "step": 9926 + }, + { + "epoch": 0.7864527629233512, + "grad_norm": 1.4021068748727905, + "learning_rate": 2.2973870750746253e-06, + "loss": 0.2076, + "step": 9927 + }, + { + "epoch": 0.7865319865319865, + "grad_norm": 1.3497590054526278, + "learning_rate": 2.2957508973110586e-06, + "loss": 0.219, + "step": 9928 + }, + { + "epoch": 0.7866112101406219, + "grad_norm": 1.4529947070303282, + "learning_rate": 2.2941152268352284e-06, + "loss": 0.2479, + "step": 9929 + }, + { + "epoch": 0.7866904337492573, + "grad_norm": 1.6627996199675967, + "learning_rate": 2.292480063754833e-06, + "loss": 0.2571, + "step": 9930 + }, + { + "epoch": 0.7867696573578926, + "grad_norm": 1.5263400909391849, + "learning_rate": 2.2908454081775344e-06, + "loss": 0.2494, + "step": 9931 + }, + { + "epoch": 0.786848880966528, + "grad_norm": 1.0879206355764373, + "learning_rate": 2.2892112602109783e-06, + "loss": 0.1376, + "step": 9932 + }, + { + "epoch": 0.7869281045751634, + "grad_norm": 1.2449612663426468, + "learning_rate": 2.2875776199627564e-06, + "loss": 0.2087, + "step": 9933 + }, + { + "epoch": 0.7870073281837988, + "grad_norm": 1.38946532860024, + "learning_rate": 2.2859444875404347e-06, + "loss": 0.2079, + "step": 9934 + }, + { + "epoch": 0.7870865517924341, + "grad_norm": 1.526377619670447, + "learning_rate": 2.2843118630515536e-06, + "loss": 0.2657, + "step": 9935 + }, + { + "epoch": 0.7871657754010695, + "grad_norm": 1.2577583156740642, + "learning_rate": 2.282679746603611e-06, + "loss": 0.1613, + "step": 9936 + }, + { + "epoch": 0.7872449990097049, + "grad_norm": 1.4840892467713886, + "learning_rate": 2.281048138304072e-06, + "loss": 0.278, + "step": 9937 + }, + { + "epoch": 0.7873242226183402, + "grad_norm": 1.8028862577357145, + "learning_rate": 2.279417038260373e-06, + "loss": 0.3526, + "step": 9938 + }, + { + "epoch": 0.7874034462269757, + "grad_norm": 1.237516044815252, + "learning_rate": 2.2777864465799137e-06, + "loss": 0.2163, + "step": 9939 + }, + { + "epoch": 0.787482669835611, + "grad_norm": 1.394261685373207, + "learning_rate": 2.276156363370058e-06, + "loss": 0.2079, + "step": 9940 + }, + { + "epoch": 0.7875618934442464, + "grad_norm": 1.1622432649522068, + "learning_rate": 2.274526788738143e-06, + "loss": 0.2118, + "step": 9941 + }, + { + "epoch": 0.7876411170528818, + "grad_norm": 1.7095905653464842, + "learning_rate": 2.272897722791466e-06, + "loss": 0.3082, + "step": 9942 + }, + { + "epoch": 0.7877203406615171, + "grad_norm": 1.5326759806882049, + "learning_rate": 2.271269165637294e-06, + "loss": 0.2862, + "step": 9943 + }, + { + "epoch": 0.7877995642701525, + "grad_norm": 1.2973899393295614, + "learning_rate": 2.2696411173828557e-06, + "loss": 0.2435, + "step": 9944 + }, + { + "epoch": 0.7878787878787878, + "grad_norm": 1.6309969512560585, + "learning_rate": 2.268013578135357e-06, + "loss": 0.2768, + "step": 9945 + }, + { + "epoch": 0.7879580114874233, + "grad_norm": 1.2742891241394383, + "learning_rate": 2.266386548001961e-06, + "loss": 0.2556, + "step": 9946 + }, + { + "epoch": 0.7880372350960586, + "grad_norm": 1.3937887766082877, + "learning_rate": 2.264760027089795e-06, + "loss": 0.3047, + "step": 9947 + }, + { + "epoch": 0.788116458704694, + "grad_norm": 1.0740663943769213, + "learning_rate": 2.2631340155059656e-06, + "loss": 0.1616, + "step": 9948 + }, + { + "epoch": 0.7881956823133294, + "grad_norm": 1.5917645559927993, + "learning_rate": 2.261508513357532e-06, + "loss": 0.2537, + "step": 9949 + }, + { + "epoch": 0.7882749059219647, + "grad_norm": 1.2451814826213414, + "learning_rate": 2.2598835207515267e-06, + "loss": 0.1486, + "step": 9950 + }, + { + "epoch": 0.7883541295306001, + "grad_norm": 1.5512190307109022, + "learning_rate": 2.2582590377949497e-06, + "loss": 0.2783, + "step": 9951 + }, + { + "epoch": 0.7884333531392355, + "grad_norm": 1.412805822510152, + "learning_rate": 2.2566350645947656e-06, + "loss": 0.2471, + "step": 9952 + }, + { + "epoch": 0.7885125767478709, + "grad_norm": 1.1487181400057618, + "learning_rate": 2.2550116012579004e-06, + "loss": 0.139, + "step": 9953 + }, + { + "epoch": 0.7885918003565062, + "grad_norm": 1.3641899940447755, + "learning_rate": 2.253388647891258e-06, + "loss": 0.2443, + "step": 9954 + }, + { + "epoch": 0.7886710239651417, + "grad_norm": 1.0853164644691753, + "learning_rate": 2.2517662046016975e-06, + "loss": 0.1411, + "step": 9955 + }, + { + "epoch": 0.788750247573777, + "grad_norm": 1.5195153135443165, + "learning_rate": 2.250144271496049e-06, + "loss": 0.2386, + "step": 9956 + }, + { + "epoch": 0.7888294711824123, + "grad_norm": 1.1884205594310875, + "learning_rate": 2.2485228486811128e-06, + "loss": 0.2158, + "step": 9957 + }, + { + "epoch": 0.7889086947910477, + "grad_norm": 1.4803177855233298, + "learning_rate": 2.2469019362636478e-06, + "loss": 0.1788, + "step": 9958 + }, + { + "epoch": 0.7889879183996831, + "grad_norm": 1.2687977783308744, + "learning_rate": 2.2452815343503862e-06, + "loss": 0.2236, + "step": 9959 + }, + { + "epoch": 0.7890671420083185, + "grad_norm": 1.5656619584628373, + "learning_rate": 2.2436616430480197e-06, + "loss": 0.3275, + "step": 9960 + }, + { + "epoch": 0.7891463656169538, + "grad_norm": 1.3806051030987687, + "learning_rate": 2.2420422624632153e-06, + "loss": 0.2763, + "step": 9961 + }, + { + "epoch": 0.7892255892255893, + "grad_norm": 1.376220612768902, + "learning_rate": 2.2404233927025985e-06, + "loss": 0.2085, + "step": 9962 + }, + { + "epoch": 0.7893048128342246, + "grad_norm": 1.1990561626777876, + "learning_rate": 2.238805033872762e-06, + "loss": 0.1961, + "step": 9963 + }, + { + "epoch": 0.7893840364428599, + "grad_norm": 1.3329692751043065, + "learning_rate": 2.237187186080273e-06, + "loss": 0.1826, + "step": 9964 + }, + { + "epoch": 0.7894632600514954, + "grad_norm": 1.481753581596124, + "learning_rate": 2.235569849431655e-06, + "loss": 0.2444, + "step": 9965 + }, + { + "epoch": 0.7895424836601307, + "grad_norm": 1.2432384672455212, + "learning_rate": 2.2339530240333993e-06, + "loss": 0.2369, + "step": 9966 + }, + { + "epoch": 0.7896217072687661, + "grad_norm": 1.1202647849761296, + "learning_rate": 2.2323367099919724e-06, + "loss": 0.23, + "step": 9967 + }, + { + "epoch": 0.7897009308774015, + "grad_norm": 1.2914505321209109, + "learning_rate": 2.230720907413797e-06, + "loss": 0.2492, + "step": 9968 + }, + { + "epoch": 0.7897801544860369, + "grad_norm": 1.181051442283646, + "learning_rate": 2.2291056164052638e-06, + "loss": 0.177, + "step": 9969 + }, + { + "epoch": 0.7898593780946722, + "grad_norm": 1.2216963109119947, + "learning_rate": 2.2274908370727376e-06, + "loss": 0.1986, + "step": 9970 + }, + { + "epoch": 0.7899386017033075, + "grad_norm": 1.3921930510509921, + "learning_rate": 2.2258765695225416e-06, + "loss": 0.1693, + "step": 9971 + }, + { + "epoch": 0.790017825311943, + "grad_norm": 1.0550051674722272, + "learning_rate": 2.224262813860962e-06, + "loss": 0.1581, + "step": 9972 + }, + { + "epoch": 0.7900970489205783, + "grad_norm": 1.610546763894353, + "learning_rate": 2.2226495701942663e-06, + "loss": 0.263, + "step": 9973 + }, + { + "epoch": 0.7901762725292137, + "grad_norm": 1.4657956609637355, + "learning_rate": 2.2210368386286742e-06, + "loss": 0.2867, + "step": 9974 + }, + { + "epoch": 0.7902554961378491, + "grad_norm": 1.1946257522627222, + "learning_rate": 2.219424619270375e-06, + "loss": 0.1617, + "step": 9975 + }, + { + "epoch": 0.7903347197464845, + "grad_norm": 1.3262265321883955, + "learning_rate": 2.2178129122255255e-06, + "loss": 0.196, + "step": 9976 + }, + { + "epoch": 0.7904139433551198, + "grad_norm": 1.1606617437482047, + "learning_rate": 2.2162017176002514e-06, + "loss": 0.1893, + "step": 9977 + }, + { + "epoch": 0.7904931669637552, + "grad_norm": 1.287499575600566, + "learning_rate": 2.2145910355006415e-06, + "loss": 0.2389, + "step": 9978 + }, + { + "epoch": 0.7905723905723906, + "grad_norm": 1.6596604641298756, + "learning_rate": 2.212980866032749e-06, + "loss": 0.233, + "step": 9979 + }, + { + "epoch": 0.7906516141810259, + "grad_norm": 1.4249190205777467, + "learning_rate": 2.2113712093025997e-06, + "loss": 0.2629, + "step": 9980 + }, + { + "epoch": 0.7907308377896614, + "grad_norm": 1.5299727945194308, + "learning_rate": 2.20976206541618e-06, + "loss": 0.247, + "step": 9981 + }, + { + "epoch": 0.7908100613982967, + "grad_norm": 1.0478590279732038, + "learning_rate": 2.208153434479442e-06, + "loss": 0.1952, + "step": 9982 + }, + { + "epoch": 0.7908892850069321, + "grad_norm": 1.128319044059468, + "learning_rate": 2.20654531659831e-06, + "loss": 0.2117, + "step": 9983 + }, + { + "epoch": 0.7909685086155674, + "grad_norm": 1.6686463227701107, + "learning_rate": 2.2049377118786696e-06, + "loss": 0.2512, + "step": 9984 + }, + { + "epoch": 0.7910477322242028, + "grad_norm": 1.3467671370617873, + "learning_rate": 2.2033306204263704e-06, + "loss": 0.1928, + "step": 9985 + }, + { + "epoch": 0.7911269558328382, + "grad_norm": 1.2930284830740735, + "learning_rate": 2.2017240423472384e-06, + "loss": 0.1806, + "step": 9986 + }, + { + "epoch": 0.7912061794414735, + "grad_norm": 1.323305642487313, + "learning_rate": 2.200117977747055e-06, + "loss": 0.2334, + "step": 9987 + }, + { + "epoch": 0.791285403050109, + "grad_norm": 1.4208029351884266, + "learning_rate": 2.198512426731568e-06, + "loss": 0.1911, + "step": 9988 + }, + { + "epoch": 0.7913646266587443, + "grad_norm": 1.3667820698657067, + "learning_rate": 2.196907389406504e-06, + "loss": 0.2243, + "step": 9989 + }, + { + "epoch": 0.7914438502673797, + "grad_norm": 1.791093536456264, + "learning_rate": 2.195302865877541e-06, + "loss": 0.3501, + "step": 9990 + }, + { + "epoch": 0.7915230738760151, + "grad_norm": 1.4951505870953965, + "learning_rate": 2.193698856250331e-06, + "loss": 0.2447, + "step": 9991 + }, + { + "epoch": 0.7916022974846504, + "grad_norm": 1.3089823436456214, + "learning_rate": 2.1920953606304875e-06, + "loss": 0.2843, + "step": 9992 + }, + { + "epoch": 0.7916815210932858, + "grad_norm": 1.2958487797745992, + "learning_rate": 2.1904923791235965e-06, + "loss": 0.2211, + "step": 9993 + }, + { + "epoch": 0.7917607447019211, + "grad_norm": 1.2825546731431516, + "learning_rate": 2.188889911835207e-06, + "loss": 0.1861, + "step": 9994 + }, + { + "epoch": 0.7918399683105566, + "grad_norm": 1.4713950370423254, + "learning_rate": 2.1872879588708286e-06, + "loss": 0.2604, + "step": 9995 + }, + { + "epoch": 0.7919191919191919, + "grad_norm": 1.3323944419507083, + "learning_rate": 2.185686520335948e-06, + "loss": 0.2348, + "step": 9996 + }, + { + "epoch": 0.7919984155278272, + "grad_norm": 1.3074796064722987, + "learning_rate": 2.184085596336011e-06, + "loss": 0.2235, + "step": 9997 + }, + { + "epoch": 0.7920776391364627, + "grad_norm": 1.1598380733286042, + "learning_rate": 2.1824851869764262e-06, + "loss": 0.1717, + "step": 9998 + }, + { + "epoch": 0.792156862745098, + "grad_norm": 1.5321560189673116, + "learning_rate": 2.1808852923625802e-06, + "loss": 0.286, + "step": 9999 + }, + { + "epoch": 0.7922360863537334, + "grad_norm": 1.8248009756127332, + "learning_rate": 2.1792859125998134e-06, + "loss": 0.3009, + "step": 10000 + }, + { + "epoch": 0.7923153099623688, + "grad_norm": 1.2349953833747491, + "learning_rate": 2.1776870477934353e-06, + "loss": 0.1715, + "step": 10001 + }, + { + "epoch": 0.7923945335710042, + "grad_norm": 1.1666802052134546, + "learning_rate": 2.1760886980487307e-06, + "loss": 0.2028, + "step": 10002 + }, + { + "epoch": 0.7924737571796395, + "grad_norm": 1.5195253589555158, + "learning_rate": 2.174490863470938e-06, + "loss": 0.323, + "step": 10003 + }, + { + "epoch": 0.7925529807882749, + "grad_norm": 1.259825688601649, + "learning_rate": 2.1728935441652687e-06, + "loss": 0.1365, + "step": 10004 + }, + { + "epoch": 0.7926322043969103, + "grad_norm": 1.2666422881448816, + "learning_rate": 2.1712967402368947e-06, + "loss": 0.1547, + "step": 10005 + }, + { + "epoch": 0.7927114280055456, + "grad_norm": 1.799324352157349, + "learning_rate": 2.169700451790964e-06, + "loss": 0.2196, + "step": 10006 + }, + { + "epoch": 0.792790651614181, + "grad_norm": 1.4724774522458546, + "learning_rate": 2.168104678932581e-06, + "loss": 0.2557, + "step": 10007 + }, + { + "epoch": 0.7928698752228164, + "grad_norm": 1.5231941789975854, + "learning_rate": 2.166509421766818e-06, + "loss": 0.2054, + "step": 10008 + }, + { + "epoch": 0.7929490988314518, + "grad_norm": 1.107829132588173, + "learning_rate": 2.1649146803987197e-06, + "loss": 0.1706, + "step": 10009 + }, + { + "epoch": 0.7930283224400871, + "grad_norm": 1.3376833380354436, + "learning_rate": 2.1633204549332897e-06, + "loss": 0.2047, + "step": 10010 + }, + { + "epoch": 0.7931075460487225, + "grad_norm": 0.9672791640395079, + "learning_rate": 2.1617267454754996e-06, + "loss": 0.1423, + "step": 10011 + }, + { + "epoch": 0.7931867696573579, + "grad_norm": 1.2769356374566705, + "learning_rate": 2.160133552130289e-06, + "loss": 0.2471, + "step": 10012 + }, + { + "epoch": 0.7932659932659932, + "grad_norm": 1.426737951051182, + "learning_rate": 2.1585408750025584e-06, + "loss": 0.2077, + "step": 10013 + }, + { + "epoch": 0.7933452168746287, + "grad_norm": 1.2895947556304939, + "learning_rate": 2.1569487141971824e-06, + "loss": 0.235, + "step": 10014 + }, + { + "epoch": 0.793424440483264, + "grad_norm": 1.4399102103735264, + "learning_rate": 2.155357069818995e-06, + "loss": 0.2659, + "step": 10015 + }, + { + "epoch": 0.7935036640918994, + "grad_norm": 1.2342433407853597, + "learning_rate": 2.1537659419727987e-06, + "loss": 0.2617, + "step": 10016 + }, + { + "epoch": 0.7935828877005348, + "grad_norm": 1.526028949621356, + "learning_rate": 2.152175330763359e-06, + "loss": 0.1728, + "step": 10017 + }, + { + "epoch": 0.7936621113091701, + "grad_norm": 1.2718600277377743, + "learning_rate": 2.150585236295415e-06, + "loss": 0.1894, + "step": 10018 + }, + { + "epoch": 0.7937413349178055, + "grad_norm": 1.2385976620892274, + "learning_rate": 2.148995658673665e-06, + "loss": 0.1586, + "step": 10019 + }, + { + "epoch": 0.7938205585264408, + "grad_norm": 1.3831165724156946, + "learning_rate": 2.14740659800277e-06, + "loss": 0.2596, + "step": 10020 + }, + { + "epoch": 0.7938997821350763, + "grad_norm": 1.2254736289746195, + "learning_rate": 2.1458180543873697e-06, + "loss": 0.1684, + "step": 10021 + }, + { + "epoch": 0.7939790057437116, + "grad_norm": 1.3576555103115144, + "learning_rate": 2.1442300279320593e-06, + "loss": 0.2172, + "step": 10022 + }, + { + "epoch": 0.794058229352347, + "grad_norm": 1.438419980302335, + "learning_rate": 2.142642518741399e-06, + "loss": 0.3328, + "step": 10023 + }, + { + "epoch": 0.7941374529609824, + "grad_norm": 1.156647007357365, + "learning_rate": 2.141055526919924e-06, + "loss": 0.1638, + "step": 10024 + }, + { + "epoch": 0.7942166765696177, + "grad_norm": 1.1524903426571225, + "learning_rate": 2.1394690525721275e-06, + "loss": 0.2042, + "step": 10025 + }, + { + "epoch": 0.7942959001782531, + "grad_norm": 1.5182758667856178, + "learning_rate": 2.137883095802469e-06, + "loss": 0.2956, + "step": 10026 + }, + { + "epoch": 0.7943751237868885, + "grad_norm": 1.3379707359334319, + "learning_rate": 2.1362976567153813e-06, + "loss": 0.2433, + "step": 10027 + }, + { + "epoch": 0.7944543473955239, + "grad_norm": 1.4079850434038292, + "learning_rate": 2.134712735415255e-06, + "loss": 0.258, + "step": 10028 + }, + { + "epoch": 0.7945335710041592, + "grad_norm": 1.2298891520714446, + "learning_rate": 2.13312833200645e-06, + "loss": 0.1831, + "step": 10029 + }, + { + "epoch": 0.7946127946127947, + "grad_norm": 1.2505744234829472, + "learning_rate": 2.131544446593289e-06, + "loss": 0.1919, + "step": 10030 + }, + { + "epoch": 0.79469201822143, + "grad_norm": 1.271701641006501, + "learning_rate": 2.1299610792800675e-06, + "loss": 0.1558, + "step": 10031 + }, + { + "epoch": 0.7947712418300653, + "grad_norm": 1.3392105134756338, + "learning_rate": 2.1283782301710408e-06, + "loss": 0.1874, + "step": 10032 + }, + { + "epoch": 0.7948504654387007, + "grad_norm": 1.2659260053617787, + "learning_rate": 2.1267958993704297e-06, + "loss": 0.2426, + "step": 10033 + }, + { + "epoch": 0.7949296890473361, + "grad_norm": 1.2716818583067733, + "learning_rate": 2.1252140869824266e-06, + "loss": 0.2159, + "step": 10034 + }, + { + "epoch": 0.7950089126559715, + "grad_norm": 1.74424913273211, + "learning_rate": 2.1236327931111868e-06, + "loss": 0.2462, + "step": 10035 + }, + { + "epoch": 0.7950881362646068, + "grad_norm": 1.1670589718188686, + "learning_rate": 2.122052017860825e-06, + "loss": 0.21, + "step": 10036 + }, + { + "epoch": 0.7951673598732423, + "grad_norm": 1.3652414913893929, + "learning_rate": 2.120471761335434e-06, + "loss": 0.1874, + "step": 10037 + }, + { + "epoch": 0.7952465834818776, + "grad_norm": 1.3673507089888488, + "learning_rate": 2.118892023639064e-06, + "loss": 0.1542, + "step": 10038 + }, + { + "epoch": 0.7953258070905129, + "grad_norm": 1.33695022773868, + "learning_rate": 2.1173128048757307e-06, + "loss": 0.2508, + "step": 10039 + }, + { + "epoch": 0.7954050306991484, + "grad_norm": 1.3589920098432118, + "learning_rate": 2.115734105149422e-06, + "loss": 0.288, + "step": 10040 + }, + { + "epoch": 0.7954842543077837, + "grad_norm": 1.1912113453417041, + "learning_rate": 2.1141559245640865e-06, + "loss": 0.1589, + "step": 10041 + }, + { + "epoch": 0.7955634779164191, + "grad_norm": 1.4471018026891507, + "learning_rate": 2.1125782632236357e-06, + "loss": 0.1956, + "step": 10042 + }, + { + "epoch": 0.7956427015250545, + "grad_norm": 1.2340719063866965, + "learning_rate": 2.111001121231957e-06, + "loss": 0.1916, + "step": 10043 + }, + { + "epoch": 0.7957219251336899, + "grad_norm": 1.6502802025963668, + "learning_rate": 2.1094244986928956e-06, + "loss": 0.2641, + "step": 10044 + }, + { + "epoch": 0.7958011487423252, + "grad_norm": 1.4286059065437176, + "learning_rate": 2.1078483957102637e-06, + "loss": 0.2142, + "step": 10045 + }, + { + "epoch": 0.7958803723509605, + "grad_norm": 1.1541005697579656, + "learning_rate": 2.1062728123878383e-06, + "loss": 0.1928, + "step": 10046 + }, + { + "epoch": 0.795959595959596, + "grad_norm": 0.9779231866751608, + "learning_rate": 2.1046977488293675e-06, + "loss": 0.164, + "step": 10047 + }, + { + "epoch": 0.7960388195682313, + "grad_norm": 1.7990159984894762, + "learning_rate": 2.1031232051385606e-06, + "loss": 0.2656, + "step": 10048 + }, + { + "epoch": 0.7961180431768667, + "grad_norm": 1.4442474253449442, + "learning_rate": 2.1015491814190913e-06, + "loss": 0.2755, + "step": 10049 + }, + { + "epoch": 0.7961972667855021, + "grad_norm": 1.1360343486965774, + "learning_rate": 2.099975677774606e-06, + "loss": 0.1755, + "step": 10050 + }, + { + "epoch": 0.7962764903941375, + "grad_norm": 1.1675717082990071, + "learning_rate": 2.0984026943087087e-06, + "loss": 0.1938, + "step": 10051 + }, + { + "epoch": 0.7963557140027728, + "grad_norm": 1.5112116981848314, + "learning_rate": 2.096830231124972e-06, + "loss": 0.2701, + "step": 10052 + }, + { + "epoch": 0.7964349376114082, + "grad_norm": 1.6807096099557952, + "learning_rate": 2.0952582883269403e-06, + "loss": 0.3468, + "step": 10053 + }, + { + "epoch": 0.7965141612200436, + "grad_norm": 1.6110911115903424, + "learning_rate": 2.093686866018114e-06, + "loss": 0.2474, + "step": 10054 + }, + { + "epoch": 0.7965933848286789, + "grad_norm": 0.984221143593065, + "learning_rate": 2.0921159643019627e-06, + "loss": 0.1595, + "step": 10055 + }, + { + "epoch": 0.7966726084373144, + "grad_norm": 1.1097585500989249, + "learning_rate": 2.0905455832819277e-06, + "loss": 0.1871, + "step": 10056 + }, + { + "epoch": 0.7967518320459497, + "grad_norm": 1.2385862124277391, + "learning_rate": 2.088975723061408e-06, + "loss": 0.2024, + "step": 10057 + }, + { + "epoch": 0.7968310556545851, + "grad_norm": 1.0930097652320747, + "learning_rate": 2.0874063837437687e-06, + "loss": 0.2358, + "step": 10058 + }, + { + "epoch": 0.7969102792632204, + "grad_norm": 1.455189501290251, + "learning_rate": 2.085837565432349e-06, + "loss": 0.2509, + "step": 10059 + }, + { + "epoch": 0.7969895028718558, + "grad_norm": 1.4618715079436955, + "learning_rate": 2.0842692682304442e-06, + "loss": 0.2388, + "step": 10060 + }, + { + "epoch": 0.7970687264804912, + "grad_norm": 1.3330412367482674, + "learning_rate": 2.0827014922413213e-06, + "loss": 0.2716, + "step": 10061 + }, + { + "epoch": 0.7971479500891265, + "grad_norm": 1.697535382314123, + "learning_rate": 2.0811342375682065e-06, + "loss": 0.333, + "step": 10062 + }, + { + "epoch": 0.797227173697762, + "grad_norm": 1.1133608534907486, + "learning_rate": 2.0795675043143016e-06, + "loss": 0.1791, + "step": 10063 + }, + { + "epoch": 0.7973063973063973, + "grad_norm": 1.091187182538296, + "learning_rate": 2.0780012925827653e-06, + "loss": 0.1503, + "step": 10064 + }, + { + "epoch": 0.7973856209150327, + "grad_norm": 1.3266508000551338, + "learning_rate": 2.0764356024767228e-06, + "loss": 0.296, + "step": 10065 + }, + { + "epoch": 0.7974648445236681, + "grad_norm": 1.3878071703715749, + "learning_rate": 2.0748704340992743e-06, + "loss": 0.2458, + "step": 10066 + }, + { + "epoch": 0.7975440681323034, + "grad_norm": 1.4026447611034674, + "learning_rate": 2.0733057875534734e-06, + "loss": 0.2269, + "step": 10067 + }, + { + "epoch": 0.7976232917409388, + "grad_norm": 1.115526369033375, + "learning_rate": 2.0717416629423425e-06, + "loss": 0.169, + "step": 10068 + }, + { + "epoch": 0.7977025153495741, + "grad_norm": 1.220850918869706, + "learning_rate": 2.0701780603688783e-06, + "loss": 0.1405, + "step": 10069 + }, + { + "epoch": 0.7977817389582096, + "grad_norm": 1.2753124954344028, + "learning_rate": 2.068614979936032e-06, + "loss": 0.206, + "step": 10070 + }, + { + "epoch": 0.7978609625668449, + "grad_norm": 1.3964368718481102, + "learning_rate": 2.0670524217467237e-06, + "loss": 0.2255, + "step": 10071 + }, + { + "epoch": 0.7979401861754803, + "grad_norm": 1.2966942491927698, + "learning_rate": 2.0654903859038457e-06, + "loss": 0.1999, + "step": 10072 + }, + { + "epoch": 0.7980194097841157, + "grad_norm": 1.3729307646340145, + "learning_rate": 2.0639288725102467e-06, + "loss": 0.2211, + "step": 10073 + }, + { + "epoch": 0.798098633392751, + "grad_norm": 1.114727632434341, + "learning_rate": 2.0623678816687433e-06, + "loss": 0.1379, + "step": 10074 + }, + { + "epoch": 0.7981778570013864, + "grad_norm": 1.1836953650481632, + "learning_rate": 2.0608074134821243e-06, + "loss": 0.1822, + "step": 10075 + }, + { + "epoch": 0.7982570806100218, + "grad_norm": 1.353414572508888, + "learning_rate": 2.0592474680531347e-06, + "loss": 0.2078, + "step": 10076 + }, + { + "epoch": 0.7983363042186572, + "grad_norm": 1.3768498118736798, + "learning_rate": 2.0576880454844926e-06, + "loss": 0.2266, + "step": 10077 + }, + { + "epoch": 0.7984155278272925, + "grad_norm": 1.2433226735011762, + "learning_rate": 2.0561291458788736e-06, + "loss": 0.1993, + "step": 10078 + }, + { + "epoch": 0.7984947514359279, + "grad_norm": 1.0821210913896915, + "learning_rate": 2.0545707693389296e-06, + "loss": 0.147, + "step": 10079 + }, + { + "epoch": 0.7985739750445633, + "grad_norm": 1.4345023903943792, + "learning_rate": 2.0530129159672685e-06, + "loss": 0.2451, + "step": 10080 + }, + { + "epoch": 0.7986531986531986, + "grad_norm": 0.9771276319871717, + "learning_rate": 2.0514555858664663e-06, + "loss": 0.1182, + "step": 10081 + }, + { + "epoch": 0.798732422261834, + "grad_norm": 1.4876256690502427, + "learning_rate": 2.0498987791390713e-06, + "loss": 0.2366, + "step": 10082 + }, + { + "epoch": 0.7988116458704694, + "grad_norm": 1.1456559547730043, + "learning_rate": 2.0483424958875876e-06, + "loss": 0.1358, + "step": 10083 + }, + { + "epoch": 0.7988908694791048, + "grad_norm": 1.1835780928546362, + "learning_rate": 2.0467867362144867e-06, + "loss": 0.1932, + "step": 10084 + }, + { + "epoch": 0.7989700930877401, + "grad_norm": 1.4400455663919776, + "learning_rate": 2.0452315002222134e-06, + "loss": 0.2307, + "step": 10085 + }, + { + "epoch": 0.7990493166963755, + "grad_norm": 1.4977139210642982, + "learning_rate": 2.04367678801317e-06, + "loss": 0.235, + "step": 10086 + }, + { + "epoch": 0.7991285403050109, + "grad_norm": 1.286165988202563, + "learning_rate": 2.0421225996897243e-06, + "loss": 0.1891, + "step": 10087 + }, + { + "epoch": 0.7992077639136462, + "grad_norm": 1.2970440831640693, + "learning_rate": 2.0405689353542204e-06, + "loss": 0.2039, + "step": 10088 + }, + { + "epoch": 0.7992869875222817, + "grad_norm": 1.5789173845812898, + "learning_rate": 2.0390157951089506e-06, + "loss": 0.3052, + "step": 10089 + }, + { + "epoch": 0.799366211130917, + "grad_norm": 1.1536288192742066, + "learning_rate": 2.0374631790561815e-06, + "loss": 0.1806, + "step": 10090 + }, + { + "epoch": 0.7994454347395524, + "grad_norm": 1.2498738727402872, + "learning_rate": 2.0359110872981526e-06, + "loss": 0.2317, + "step": 10091 + }, + { + "epoch": 0.7995246583481878, + "grad_norm": 1.4342050112591576, + "learning_rate": 2.034359519937057e-06, + "loss": 0.3094, + "step": 10092 + }, + { + "epoch": 0.7996038819568231, + "grad_norm": 1.1459214232825665, + "learning_rate": 2.032808477075057e-06, + "loss": 0.1456, + "step": 10093 + }, + { + "epoch": 0.7996831055654585, + "grad_norm": 1.2717047424352017, + "learning_rate": 2.0312579588142846e-06, + "loss": 0.1807, + "step": 10094 + }, + { + "epoch": 0.7997623291740938, + "grad_norm": 1.6236080733459077, + "learning_rate": 2.029707965256833e-06, + "loss": 0.3838, + "step": 10095 + }, + { + "epoch": 0.7998415527827293, + "grad_norm": 1.261159963505592, + "learning_rate": 2.0281584965047585e-06, + "loss": 0.1919, + "step": 10096 + }, + { + "epoch": 0.7999207763913646, + "grad_norm": 1.3827902907310599, + "learning_rate": 2.0266095526600925e-06, + "loss": 0.2018, + "step": 10097 + }, + { + "epoch": 0.8, + "grad_norm": 1.3708064203720107, + "learning_rate": 2.0250611338248215e-06, + "loss": 0.2607, + "step": 10098 + }, + { + "epoch": 0.8000792236086354, + "grad_norm": 1.0277386201776315, + "learning_rate": 2.0235132401008985e-06, + "loss": 0.1426, + "step": 10099 + }, + { + "epoch": 0.8001584472172707, + "grad_norm": 1.1300879985948629, + "learning_rate": 2.0219658715902514e-06, + "loss": 0.1674, + "step": 10100 + }, + { + "epoch": 0.8002376708259061, + "grad_norm": 1.2743524301711278, + "learning_rate": 2.0204190283947645e-06, + "loss": 0.2482, + "step": 10101 + }, + { + "epoch": 0.8003168944345415, + "grad_norm": 1.101843815938498, + "learning_rate": 2.0188727106162874e-06, + "loss": 0.1358, + "step": 10102 + }, + { + "epoch": 0.8003961180431769, + "grad_norm": 1.8307161788054416, + "learning_rate": 2.017326918356639e-06, + "loss": 0.3069, + "step": 10103 + }, + { + "epoch": 0.8004753416518122, + "grad_norm": 1.2863919198324845, + "learning_rate": 2.0157816517176045e-06, + "loss": 0.2062, + "step": 10104 + }, + { + "epoch": 0.8005545652604477, + "grad_norm": 1.6485069544438204, + "learning_rate": 2.0142369108009306e-06, + "loss": 0.267, + "step": 10105 + }, + { + "epoch": 0.800633788869083, + "grad_norm": 1.1038291976889658, + "learning_rate": 2.012692695708328e-06, + "loss": 0.1501, + "step": 10106 + }, + { + "epoch": 0.8007130124777183, + "grad_norm": 1.4175338489884162, + "learning_rate": 2.011149006541483e-06, + "loss": 0.238, + "step": 10107 + }, + { + "epoch": 0.8007922360863537, + "grad_norm": 1.5039496538066082, + "learning_rate": 2.0096058434020348e-06, + "loss": 0.2327, + "step": 10108 + }, + { + "epoch": 0.8008714596949891, + "grad_norm": 1.308265541140065, + "learning_rate": 2.0080632063915927e-06, + "loss": 0.1959, + "step": 10109 + }, + { + "epoch": 0.8009506833036245, + "grad_norm": 1.1449948458269021, + "learning_rate": 2.0065210956117354e-06, + "loss": 0.2028, + "step": 10110 + }, + { + "epoch": 0.8010299069122598, + "grad_norm": 1.4949408616518625, + "learning_rate": 2.0049795111640023e-06, + "loss": 0.2555, + "step": 10111 + }, + { + "epoch": 0.8011091305208953, + "grad_norm": 1.0771438450015118, + "learning_rate": 2.0034384531498962e-06, + "loss": 0.1698, + "step": 10112 + }, + { + "epoch": 0.8011883541295306, + "grad_norm": 1.381539600164165, + "learning_rate": 2.0018979216708935e-06, + "loss": 0.2812, + "step": 10113 + }, + { + "epoch": 0.8012675777381659, + "grad_norm": 1.7045386541522283, + "learning_rate": 2.000357916828428e-06, + "loss": 0.3365, + "step": 10114 + }, + { + "epoch": 0.8013468013468014, + "grad_norm": 0.9927199053215623, + "learning_rate": 1.9988184387239027e-06, + "loss": 0.1396, + "step": 10115 + }, + { + "epoch": 0.8014260249554367, + "grad_norm": 1.2445765848041155, + "learning_rate": 1.9972794874586808e-06, + "loss": 0.2612, + "step": 10116 + }, + { + "epoch": 0.8015052485640721, + "grad_norm": 1.381229687809628, + "learning_rate": 1.9957410631341e-06, + "loss": 0.2502, + "step": 10117 + }, + { + "epoch": 0.8015844721727075, + "grad_norm": 1.1127587353477846, + "learning_rate": 1.9942031658514573e-06, + "loss": 0.2265, + "step": 10118 + }, + { + "epoch": 0.8016636957813429, + "grad_norm": 1.0196678179637952, + "learning_rate": 1.992665795712011e-06, + "loss": 0.1545, + "step": 10119 + }, + { + "epoch": 0.8017429193899782, + "grad_norm": 1.666506403482929, + "learning_rate": 1.991128952816996e-06, + "loss": 0.2651, + "step": 10120 + }, + { + "epoch": 0.8018221429986135, + "grad_norm": 1.324582785510361, + "learning_rate": 1.9895926372676042e-06, + "loss": 0.1995, + "step": 10121 + }, + { + "epoch": 0.801901366607249, + "grad_norm": 1.7086514642676323, + "learning_rate": 1.988056849164991e-06, + "loss": 0.2918, + "step": 10122 + }, + { + "epoch": 0.8019805902158843, + "grad_norm": 1.18088746654375, + "learning_rate": 1.986521588610285e-06, + "loss": 0.1739, + "step": 10123 + }, + { + "epoch": 0.8020598138245197, + "grad_norm": 1.2591856160721422, + "learning_rate": 1.9849868557045738e-06, + "loss": 0.2048, + "step": 10124 + }, + { + "epoch": 0.8021390374331551, + "grad_norm": 1.1179822318003738, + "learning_rate": 1.9834526505489105e-06, + "loss": 0.193, + "step": 10125 + }, + { + "epoch": 0.8022182610417905, + "grad_norm": 1.214455416425855, + "learning_rate": 1.9819189732443187e-06, + "loss": 0.2295, + "step": 10126 + }, + { + "epoch": 0.8022974846504258, + "grad_norm": 1.0561176382575492, + "learning_rate": 1.9803858238917826e-06, + "loss": 0.1751, + "step": 10127 + }, + { + "epoch": 0.8023767082590612, + "grad_norm": 1.2849639222606646, + "learning_rate": 1.97885320259225e-06, + "loss": 0.1984, + "step": 10128 + }, + { + "epoch": 0.8024559318676966, + "grad_norm": 1.5194907069407513, + "learning_rate": 1.9773211094466404e-06, + "loss": 0.2277, + "step": 10129 + }, + { + "epoch": 0.8025351554763319, + "grad_norm": 1.176551235848641, + "learning_rate": 1.975789544555834e-06, + "loss": 0.2234, + "step": 10130 + }, + { + "epoch": 0.8026143790849674, + "grad_norm": 0.7892352253899914, + "learning_rate": 1.9742585080206754e-06, + "loss": 0.1076, + "step": 10131 + }, + { + "epoch": 0.8026936026936027, + "grad_norm": 0.9758370650968448, + "learning_rate": 1.9727279999419745e-06, + "loss": 0.1279, + "step": 10132 + }, + { + "epoch": 0.8027728263022381, + "grad_norm": 1.1767196344829758, + "learning_rate": 1.9711980204205115e-06, + "loss": 0.2078, + "step": 10133 + }, + { + "epoch": 0.8028520499108734, + "grad_norm": 1.7160701199502668, + "learning_rate": 1.9696685695570285e-06, + "loss": 0.2065, + "step": 10134 + }, + { + "epoch": 0.8029312735195088, + "grad_norm": 1.0815104996295632, + "learning_rate": 1.9681396474522264e-06, + "loss": 0.1797, + "step": 10135 + }, + { + "epoch": 0.8030104971281442, + "grad_norm": 1.2219685433630565, + "learning_rate": 1.966611254206785e-06, + "loss": 0.2191, + "step": 10136 + }, + { + "epoch": 0.8030897207367795, + "grad_norm": 1.49034873828191, + "learning_rate": 1.9650833899213383e-06, + "loss": 0.302, + "step": 10137 + }, + { + "epoch": 0.803168944345415, + "grad_norm": 1.2803873712659504, + "learning_rate": 1.963556054696487e-06, + "loss": 0.1724, + "step": 10138 + }, + { + "epoch": 0.8032481679540503, + "grad_norm": 1.3334402168903694, + "learning_rate": 1.962029248632802e-06, + "loss": 0.1791, + "step": 10139 + }, + { + "epoch": 0.8033273915626857, + "grad_norm": 1.3447278612631246, + "learning_rate": 1.9605029718308156e-06, + "loss": 0.1735, + "step": 10140 + }, + { + "epoch": 0.8034066151713211, + "grad_norm": 1.2041955856036584, + "learning_rate": 1.958977224391021e-06, + "loss": 0.1815, + "step": 10141 + }, + { + "epoch": 0.8034858387799564, + "grad_norm": 1.2556963140595756, + "learning_rate": 1.957452006413889e-06, + "loss": 0.2488, + "step": 10142 + }, + { + "epoch": 0.8035650623885918, + "grad_norm": 1.357159096573162, + "learning_rate": 1.955927317999844e-06, + "loss": 0.2113, + "step": 10143 + }, + { + "epoch": 0.8036442859972271, + "grad_norm": 1.346643773313863, + "learning_rate": 1.9544031592492763e-06, + "loss": 0.2805, + "step": 10144 + }, + { + "epoch": 0.8037235096058626, + "grad_norm": 1.226808929998541, + "learning_rate": 1.9528795302625515e-06, + "loss": 0.2114, + "step": 10145 + }, + { + "epoch": 0.8038027332144979, + "grad_norm": 1.6681317747869757, + "learning_rate": 1.951356431139988e-06, + "loss": 0.2573, + "step": 10146 + }, + { + "epoch": 0.8038819568231333, + "grad_norm": 1.657185706385089, + "learning_rate": 1.949833861981877e-06, + "loss": 0.2374, + "step": 10147 + }, + { + "epoch": 0.8039611804317687, + "grad_norm": 1.562054000431535, + "learning_rate": 1.948311822888468e-06, + "loss": 0.3371, + "step": 10148 + }, + { + "epoch": 0.804040404040404, + "grad_norm": 1.066699671091112, + "learning_rate": 1.9467903139599853e-06, + "loss": 0.1858, + "step": 10149 + }, + { + "epoch": 0.8041196276490394, + "grad_norm": 1.4510025018380015, + "learning_rate": 1.945269335296611e-06, + "loss": 0.1538, + "step": 10150 + }, + { + "epoch": 0.8041988512576748, + "grad_norm": 1.2258303063238312, + "learning_rate": 1.943748886998492e-06, + "loss": 0.2048, + "step": 10151 + }, + { + "epoch": 0.8042780748663102, + "grad_norm": 1.401573926034788, + "learning_rate": 1.942228969165748e-06, + "loss": 0.2833, + "step": 10152 + }, + { + "epoch": 0.8043572984749455, + "grad_norm": 1.2453388144194748, + "learning_rate": 1.940709581898453e-06, + "loss": 0.22, + "step": 10153 + }, + { + "epoch": 0.8044365220835809, + "grad_norm": 1.2396365590750524, + "learning_rate": 1.9391907252966522e-06, + "loss": 0.2341, + "step": 10154 + }, + { + "epoch": 0.8045157456922163, + "grad_norm": 1.4914906871726465, + "learning_rate": 1.9376723994603574e-06, + "loss": 0.2877, + "step": 10155 + }, + { + "epoch": 0.8045949693008516, + "grad_norm": 1.2308230245266347, + "learning_rate": 1.936154604489543e-06, + "loss": 0.2273, + "step": 10156 + }, + { + "epoch": 0.804674192909487, + "grad_norm": 1.158251685662581, + "learning_rate": 1.9346373404841433e-06, + "loss": 0.2029, + "step": 10157 + }, + { + "epoch": 0.8047534165181224, + "grad_norm": 1.4578628459268346, + "learning_rate": 1.93312060754407e-06, + "loss": 0.2278, + "step": 10158 + }, + { + "epoch": 0.8048326401267578, + "grad_norm": 1.2239206864037653, + "learning_rate": 1.9316044057691886e-06, + "loss": 0.1838, + "step": 10159 + }, + { + "epoch": 0.8049118637353931, + "grad_norm": 1.2614866999120988, + "learning_rate": 1.9300887352593355e-06, + "loss": 0.2389, + "step": 10160 + }, + { + "epoch": 0.8049910873440285, + "grad_norm": 1.4627397531773647, + "learning_rate": 1.928573596114306e-06, + "loss": 0.2518, + "step": 10161 + }, + { + "epoch": 0.8050703109526639, + "grad_norm": 1.123008349440525, + "learning_rate": 1.9270589884338706e-06, + "loss": 0.1639, + "step": 10162 + }, + { + "epoch": 0.8051495345612992, + "grad_norm": 1.3749026691254196, + "learning_rate": 1.9255449123177563e-06, + "loss": 0.1867, + "step": 10163 + }, + { + "epoch": 0.8052287581699347, + "grad_norm": 1.679347130175776, + "learning_rate": 1.924031367865655e-06, + "loss": 0.3011, + "step": 10164 + }, + { + "epoch": 0.80530798177857, + "grad_norm": 1.1681616226210787, + "learning_rate": 1.922518355177232e-06, + "loss": 0.1981, + "step": 10165 + }, + { + "epoch": 0.8053872053872054, + "grad_norm": 1.6358109392165265, + "learning_rate": 1.921005874352109e-06, + "loss": 0.2858, + "step": 10166 + }, + { + "epoch": 0.8054664289958408, + "grad_norm": 1.4148121378431107, + "learning_rate": 1.9194939254898746e-06, + "loss": 0.2671, + "step": 10167 + }, + { + "epoch": 0.8055456526044761, + "grad_norm": 1.9709461917683746, + "learning_rate": 1.917982508690085e-06, + "loss": 0.257, + "step": 10168 + }, + { + "epoch": 0.8056248762131115, + "grad_norm": 1.3283857009639417, + "learning_rate": 1.916471624052256e-06, + "loss": 0.2407, + "step": 10169 + }, + { + "epoch": 0.8057040998217468, + "grad_norm": 1.7441939210997033, + "learning_rate": 1.914961271675879e-06, + "loss": 0.2595, + "step": 10170 + }, + { + "epoch": 0.8057833234303823, + "grad_norm": 1.3436681496543106, + "learning_rate": 1.9134514516603987e-06, + "loss": 0.1934, + "step": 10171 + }, + { + "epoch": 0.8058625470390176, + "grad_norm": 1.0126466646847239, + "learning_rate": 1.9119421641052294e-06, + "loss": 0.1765, + "step": 10172 + }, + { + "epoch": 0.805941770647653, + "grad_norm": 1.139293071200593, + "learning_rate": 1.91043340910975e-06, + "loss": 0.1723, + "step": 10173 + }, + { + "epoch": 0.8060209942562884, + "grad_norm": 1.024457390453502, + "learning_rate": 1.908925186773308e-06, + "loss": 0.1129, + "step": 10174 + }, + { + "epoch": 0.8061002178649237, + "grad_norm": 1.0863679127375503, + "learning_rate": 1.907417497195211e-06, + "loss": 0.1429, + "step": 10175 + }, + { + "epoch": 0.8061794414735591, + "grad_norm": 1.2456251375640532, + "learning_rate": 1.9059103404747303e-06, + "loss": 0.291, + "step": 10176 + }, + { + "epoch": 0.8062586650821945, + "grad_norm": 1.1604744518959167, + "learning_rate": 1.9044037167111096e-06, + "loss": 0.2226, + "step": 10177 + }, + { + "epoch": 0.8063378886908299, + "grad_norm": 1.2719263677423331, + "learning_rate": 1.9028976260035515e-06, + "loss": 0.1929, + "step": 10178 + }, + { + "epoch": 0.8064171122994652, + "grad_norm": 1.2327099428680226, + "learning_rate": 1.901392068451221e-06, + "loss": 0.1935, + "step": 10179 + }, + { + "epoch": 0.8064963359081007, + "grad_norm": 1.5565207733174424, + "learning_rate": 1.8998870441532569e-06, + "loss": 0.26, + "step": 10180 + }, + { + "epoch": 0.806575559516736, + "grad_norm": 1.3136776035339608, + "learning_rate": 1.8983825532087551e-06, + "loss": 0.1883, + "step": 10181 + }, + { + "epoch": 0.8066547831253713, + "grad_norm": 1.2064689701188687, + "learning_rate": 1.8968785957167779e-06, + "loss": 0.2026, + "step": 10182 + }, + { + "epoch": 0.8067340067340067, + "grad_norm": 1.181341101025212, + "learning_rate": 1.8953751717763592e-06, + "loss": 0.1685, + "step": 10183 + }, + { + "epoch": 0.8068132303426421, + "grad_norm": 1.3827006210285366, + "learning_rate": 1.8938722814864863e-06, + "loss": 0.2018, + "step": 10184 + }, + { + "epoch": 0.8068924539512775, + "grad_norm": 1.487161883281403, + "learning_rate": 1.8923699249461214e-06, + "loss": 0.3085, + "step": 10185 + }, + { + "epoch": 0.8069716775599128, + "grad_norm": 1.3983962053077565, + "learning_rate": 1.890868102254182e-06, + "loss": 0.2635, + "step": 10186 + }, + { + "epoch": 0.8070509011685483, + "grad_norm": 2.7845862684748783, + "learning_rate": 1.8893668135095611e-06, + "loss": 0.2555, + "step": 10187 + }, + { + "epoch": 0.8071301247771836, + "grad_norm": 1.5844006341350203, + "learning_rate": 1.8878660588111108e-06, + "loss": 0.2634, + "step": 10188 + }, + { + "epoch": 0.8072093483858189, + "grad_norm": 1.0989350790019177, + "learning_rate": 1.8863658382576444e-06, + "loss": 0.1618, + "step": 10189 + }, + { + "epoch": 0.8072885719944544, + "grad_norm": 1.301647239250617, + "learning_rate": 1.8848661519479504e-06, + "loss": 0.1981, + "step": 10190 + }, + { + "epoch": 0.8073677956030897, + "grad_norm": 1.1557738100831196, + "learning_rate": 1.8833669999807723e-06, + "loss": 0.1836, + "step": 10191 + }, + { + "epoch": 0.8074470192117251, + "grad_norm": 1.4667194456496075, + "learning_rate": 1.88186838245482e-06, + "loss": 0.2454, + "step": 10192 + }, + { + "epoch": 0.8075262428203605, + "grad_norm": 1.2149222998878704, + "learning_rate": 1.8803702994687755e-06, + "loss": 0.202, + "step": 10193 + }, + { + "epoch": 0.8076054664289959, + "grad_norm": 1.63789996797214, + "learning_rate": 1.8788727511212768e-06, + "loss": 0.2799, + "step": 10194 + }, + { + "epoch": 0.8076846900376312, + "grad_norm": 1.144414113429165, + "learning_rate": 1.8773757375109292e-06, + "loss": 0.1598, + "step": 10195 + }, + { + "epoch": 0.8077639136462665, + "grad_norm": 1.2403543074836227, + "learning_rate": 1.8758792587363084e-06, + "loss": 0.2009, + "step": 10196 + }, + { + "epoch": 0.807843137254902, + "grad_norm": 1.1765324395672558, + "learning_rate": 1.8743833148959479e-06, + "loss": 0.1989, + "step": 10197 + }, + { + "epoch": 0.8079223608635373, + "grad_norm": 1.2257385597523451, + "learning_rate": 1.8728879060883443e-06, + "loss": 0.2103, + "step": 10198 + }, + { + "epoch": 0.8080015844721727, + "grad_norm": 1.4361904065234052, + "learning_rate": 1.8713930324119711e-06, + "loss": 0.2105, + "step": 10199 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 1.2684809532953119, + "learning_rate": 1.869898693965253e-06, + "loss": 0.1979, + "step": 10200 + }, + { + "epoch": 0.8081600316894435, + "grad_norm": 1.4506908921812272, + "learning_rate": 1.868404890846587e-06, + "loss": 0.2237, + "step": 10201 + }, + { + "epoch": 0.8082392552980788, + "grad_norm": 1.4154582042485293, + "learning_rate": 1.8669116231543294e-06, + "loss": 0.1312, + "step": 10202 + }, + { + "epoch": 0.8083184789067142, + "grad_norm": 1.1846778449260267, + "learning_rate": 1.865418890986811e-06, + "loss": 0.2417, + "step": 10203 + }, + { + "epoch": 0.8083977025153496, + "grad_norm": 0.998375449218653, + "learning_rate": 1.8639266944423163e-06, + "loss": 0.1259, + "step": 10204 + }, + { + "epoch": 0.8084769261239849, + "grad_norm": 1.3296422310118596, + "learning_rate": 1.8624350336190977e-06, + "loss": 0.2569, + "step": 10205 + }, + { + "epoch": 0.8085561497326204, + "grad_norm": 1.5121592959466554, + "learning_rate": 1.8609439086153803e-06, + "loss": 0.2357, + "step": 10206 + }, + { + "epoch": 0.8086353733412557, + "grad_norm": 1.4511070352829922, + "learning_rate": 1.859453319529343e-06, + "loss": 0.2666, + "step": 10207 + }, + { + "epoch": 0.8087145969498911, + "grad_norm": 1.254892053827073, + "learning_rate": 1.857963266459133e-06, + "loss": 0.2159, + "step": 10208 + }, + { + "epoch": 0.8087938205585264, + "grad_norm": 1.2875845036910536, + "learning_rate": 1.8564737495028673e-06, + "loss": 0.1779, + "step": 10209 + }, + { + "epoch": 0.8088730441671618, + "grad_norm": 1.9037867781349262, + "learning_rate": 1.854984768758621e-06, + "loss": 0.3066, + "step": 10210 + }, + { + "epoch": 0.8089522677757972, + "grad_norm": 1.3034801439114339, + "learning_rate": 1.853496324324434e-06, + "loss": 0.1879, + "step": 10211 + }, + { + "epoch": 0.8090314913844325, + "grad_norm": 1.343642085044102, + "learning_rate": 1.8520084162983176e-06, + "loss": 0.2259, + "step": 10212 + }, + { + "epoch": 0.809110714993068, + "grad_norm": 1.115846241283189, + "learning_rate": 1.8505210447782418e-06, + "loss": 0.1711, + "step": 10213 + }, + { + "epoch": 0.8091899386017033, + "grad_norm": 1.287983609836565, + "learning_rate": 1.8490342098621395e-06, + "loss": 0.2299, + "step": 10214 + }, + { + "epoch": 0.8092691622103387, + "grad_norm": 1.2446600253159052, + "learning_rate": 1.8475479116479166e-06, + "loss": 0.1993, + "step": 10215 + }, + { + "epoch": 0.8093483858189741, + "grad_norm": 1.588182118207134, + "learning_rate": 1.8460621502334375e-06, + "loss": 0.2857, + "step": 10216 + }, + { + "epoch": 0.8094276094276094, + "grad_norm": 1.814212128768382, + "learning_rate": 1.8445769257165314e-06, + "loss": 0.3378, + "step": 10217 + }, + { + "epoch": 0.8095068330362448, + "grad_norm": 1.1729493010856318, + "learning_rate": 1.8430922381949912e-06, + "loss": 0.1747, + "step": 10218 + }, + { + "epoch": 0.8095860566448801, + "grad_norm": 1.2348952610724928, + "learning_rate": 1.84160808776658e-06, + "loss": 0.1726, + "step": 10219 + }, + { + "epoch": 0.8096652802535156, + "grad_norm": 1.272296879440776, + "learning_rate": 1.8401244745290214e-06, + "loss": 0.2311, + "step": 10220 + }, + { + "epoch": 0.8097445038621509, + "grad_norm": 1.4692163605441064, + "learning_rate": 1.838641398580001e-06, + "loss": 0.2233, + "step": 10221 + }, + { + "epoch": 0.8098237274707863, + "grad_norm": 1.1636044130383294, + "learning_rate": 1.8371588600171764e-06, + "loss": 0.2129, + "step": 10222 + }, + { + "epoch": 0.8099029510794217, + "grad_norm": 1.1582485878060729, + "learning_rate": 1.8356768589381646e-06, + "loss": 0.1838, + "step": 10223 + }, + { + "epoch": 0.809982174688057, + "grad_norm": 1.2179009344241836, + "learning_rate": 1.8341953954405434e-06, + "loss": 0.1652, + "step": 10224 + }, + { + "epoch": 0.8100613982966924, + "grad_norm": 1.3052393438348684, + "learning_rate": 1.832714469621868e-06, + "loss": 0.2035, + "step": 10225 + }, + { + "epoch": 0.8101406219053278, + "grad_norm": 1.2281412441058612, + "learning_rate": 1.8312340815796458e-06, + "loss": 0.2292, + "step": 10226 + }, + { + "epoch": 0.8102198455139632, + "grad_norm": 1.467045008574425, + "learning_rate": 1.8297542314113515e-06, + "loss": 0.2852, + "step": 10227 + }, + { + "epoch": 0.8102990691225985, + "grad_norm": 1.3758352800801341, + "learning_rate": 1.82827491921443e-06, + "loss": 0.2579, + "step": 10228 + }, + { + "epoch": 0.810378292731234, + "grad_norm": 1.29540385183792, + "learning_rate": 1.8267961450862859e-06, + "loss": 0.204, + "step": 10229 + }, + { + "epoch": 0.8104575163398693, + "grad_norm": 1.1650957366618704, + "learning_rate": 1.8253179091242868e-06, + "loss": 0.2194, + "step": 10230 + }, + { + "epoch": 0.8105367399485046, + "grad_norm": 1.1815721874418448, + "learning_rate": 1.8238402114257714e-06, + "loss": 0.1963, + "step": 10231 + }, + { + "epoch": 0.81061596355714, + "grad_norm": 1.3190540223575808, + "learning_rate": 1.8223630520880365e-06, + "loss": 0.2065, + "step": 10232 + }, + { + "epoch": 0.8106951871657754, + "grad_norm": 1.2907715712642336, + "learning_rate": 1.8208864312083462e-06, + "loss": 0.2418, + "step": 10233 + }, + { + "epoch": 0.8107744107744108, + "grad_norm": 1.0001408320805503, + "learning_rate": 1.8194103488839265e-06, + "loss": 0.1384, + "step": 10234 + }, + { + "epoch": 0.8108536343830461, + "grad_norm": 1.414418477532868, + "learning_rate": 1.817934805211976e-06, + "loss": 0.29, + "step": 10235 + }, + { + "epoch": 0.8109328579916815, + "grad_norm": 0.9326814359755048, + "learning_rate": 1.8164598002896484e-06, + "loss": 0.1112, + "step": 10236 + }, + { + "epoch": 0.8110120816003169, + "grad_norm": 1.4504881888123107, + "learning_rate": 1.8149853342140644e-06, + "loss": 0.2644, + "step": 10237 + }, + { + "epoch": 0.8110913052089522, + "grad_norm": 1.409400362517609, + "learning_rate": 1.8135114070823145e-06, + "loss": 0.1756, + "step": 10238 + }, + { + "epoch": 0.8111705288175877, + "grad_norm": 1.1782037114620536, + "learning_rate": 1.8120380189914476e-06, + "loss": 0.1884, + "step": 10239 + }, + { + "epoch": 0.811249752426223, + "grad_norm": 1.3316947943494324, + "learning_rate": 1.8105651700384764e-06, + "loss": 0.2428, + "step": 10240 + }, + { + "epoch": 0.8113289760348584, + "grad_norm": 1.3826143216831768, + "learning_rate": 1.8090928603203871e-06, + "loss": 0.2192, + "step": 10241 + }, + { + "epoch": 0.8114081996434938, + "grad_norm": 0.9404104605513147, + "learning_rate": 1.8076210899341196e-06, + "loss": 0.1084, + "step": 10242 + }, + { + "epoch": 0.8114874232521291, + "grad_norm": 1.230086720832417, + "learning_rate": 1.8061498589765824e-06, + "loss": 0.2075, + "step": 10243 + }, + { + "epoch": 0.8115666468607645, + "grad_norm": 1.2371368633193467, + "learning_rate": 1.804679167544655e-06, + "loss": 0.1727, + "step": 10244 + }, + { + "epoch": 0.8116458704693998, + "grad_norm": 1.7424896414268622, + "learning_rate": 1.8032090157351701e-06, + "loss": 0.2947, + "step": 10245 + }, + { + "epoch": 0.8117250940780353, + "grad_norm": 1.1787494076789051, + "learning_rate": 1.8017394036449276e-06, + "loss": 0.1903, + "step": 10246 + }, + { + "epoch": 0.8118043176866706, + "grad_norm": 1.34669875521707, + "learning_rate": 1.8002703313706993e-06, + "loss": 0.2065, + "step": 10247 + }, + { + "epoch": 0.811883541295306, + "grad_norm": 1.3281843386193821, + "learning_rate": 1.7988017990092167e-06, + "loss": 0.232, + "step": 10248 + }, + { + "epoch": 0.8119627649039414, + "grad_norm": 1.295546345942209, + "learning_rate": 1.797333806657171e-06, + "loss": 0.2509, + "step": 10249 + }, + { + "epoch": 0.8120419885125767, + "grad_norm": 1.3786530864373474, + "learning_rate": 1.7958663544112277e-06, + "loss": 0.257, + "step": 10250 + }, + { + "epoch": 0.8121212121212121, + "grad_norm": 1.3995634685977423, + "learning_rate": 1.794399442368009e-06, + "loss": 0.2737, + "step": 10251 + }, + { + "epoch": 0.8122004357298475, + "grad_norm": 1.6194665112494062, + "learning_rate": 1.7929330706241023e-06, + "loss": 0.2472, + "step": 10252 + }, + { + "epoch": 0.8122796593384829, + "grad_norm": 1.5776913207683405, + "learning_rate": 1.7914672392760645e-06, + "loss": 0.2712, + "step": 10253 + }, + { + "epoch": 0.8123588829471182, + "grad_norm": 1.189055709264225, + "learning_rate": 1.7900019484204135e-06, + "loss": 0.2346, + "step": 10254 + }, + { + "epoch": 0.8124381065557537, + "grad_norm": 1.6584302929940211, + "learning_rate": 1.788537198153627e-06, + "loss": 0.2967, + "step": 10255 + }, + { + "epoch": 0.812517330164389, + "grad_norm": 1.1198439538396494, + "learning_rate": 1.787072988572157e-06, + "loss": 0.2213, + "step": 10256 + }, + { + "epoch": 0.8125965537730243, + "grad_norm": 1.297205225142965, + "learning_rate": 1.7856093197724133e-06, + "loss": 0.2564, + "step": 10257 + }, + { + "epoch": 0.8126757773816597, + "grad_norm": 0.9705550236969642, + "learning_rate": 1.7841461918507708e-06, + "loss": 0.1685, + "step": 10258 + }, + { + "epoch": 0.8127550009902951, + "grad_norm": 1.90949201417659, + "learning_rate": 1.7826836049035655e-06, + "loss": 0.2706, + "step": 10259 + }, + { + "epoch": 0.8128342245989305, + "grad_norm": 1.2711044578456732, + "learning_rate": 1.7812215590271099e-06, + "loss": 0.1941, + "step": 10260 + }, + { + "epoch": 0.8129134482075658, + "grad_norm": 1.208488851403463, + "learning_rate": 1.7797600543176675e-06, + "loss": 0.2128, + "step": 10261 + }, + { + "epoch": 0.8129926718162013, + "grad_norm": 1.3401095932771625, + "learning_rate": 1.7782990908714703e-06, + "loss": 0.2443, + "step": 10262 + }, + { + "epoch": 0.8130718954248366, + "grad_norm": 1.1773150423768914, + "learning_rate": 1.7768386687847194e-06, + "loss": 0.2481, + "step": 10263 + }, + { + "epoch": 0.8131511190334719, + "grad_norm": 1.1317846167546541, + "learning_rate": 1.7753787881535757e-06, + "loss": 0.1617, + "step": 10264 + }, + { + "epoch": 0.8132303426421074, + "grad_norm": 1.2325199084524263, + "learning_rate": 1.7739194490741607e-06, + "loss": 0.2744, + "step": 10265 + }, + { + "epoch": 0.8133095662507427, + "grad_norm": 1.0459303766153756, + "learning_rate": 1.7724606516425724e-06, + "loss": 0.1311, + "step": 10266 + }, + { + "epoch": 0.8133887898593781, + "grad_norm": 1.4635184626565356, + "learning_rate": 1.7710023959548617e-06, + "loss": 0.3027, + "step": 10267 + }, + { + "epoch": 0.8134680134680135, + "grad_norm": 1.4666310331054864, + "learning_rate": 1.7695446821070438e-06, + "loss": 0.2131, + "step": 10268 + }, + { + "epoch": 0.8135472370766489, + "grad_norm": 1.0664895229219125, + "learning_rate": 1.76808751019511e-06, + "loss": 0.1703, + "step": 10269 + }, + { + "epoch": 0.8136264606852842, + "grad_norm": 1.4358776736052012, + "learning_rate": 1.7666308803150045e-06, + "loss": 0.2408, + "step": 10270 + }, + { + "epoch": 0.8137056842939195, + "grad_norm": 1.5802910115029503, + "learning_rate": 1.7651747925626383e-06, + "loss": 0.2877, + "step": 10271 + }, + { + "epoch": 0.813784907902555, + "grad_norm": 1.2365544352665416, + "learning_rate": 1.763719247033886e-06, + "loss": 0.184, + "step": 10272 + }, + { + "epoch": 0.8138641315111903, + "grad_norm": 1.3473060195080302, + "learning_rate": 1.762264243824594e-06, + "loss": 0.2266, + "step": 10273 + }, + { + "epoch": 0.8139433551198257, + "grad_norm": 1.4943388232597272, + "learning_rate": 1.7608097830305637e-06, + "loss": 0.1842, + "step": 10274 + }, + { + "epoch": 0.8140225787284611, + "grad_norm": 1.4123639575882567, + "learning_rate": 1.7593558647475627e-06, + "loss": 0.2691, + "step": 10275 + }, + { + "epoch": 0.8141018023370965, + "grad_norm": 1.1825159472120483, + "learning_rate": 1.7579024890713282e-06, + "loss": 0.1955, + "step": 10276 + }, + { + "epoch": 0.8141810259457318, + "grad_norm": 1.1835983889608375, + "learning_rate": 1.7564496560975574e-06, + "loss": 0.1859, + "step": 10277 + }, + { + "epoch": 0.8142602495543672, + "grad_norm": 1.1057596763373965, + "learning_rate": 1.7549973659219077e-06, + "loss": 0.1672, + "step": 10278 + }, + { + "epoch": 0.8143394731630026, + "grad_norm": 1.3260921355826003, + "learning_rate": 1.7535456186400123e-06, + "loss": 0.1994, + "step": 10279 + }, + { + "epoch": 0.8144186967716379, + "grad_norm": 1.0582699870264463, + "learning_rate": 1.7520944143474584e-06, + "loss": 0.1808, + "step": 10280 + }, + { + "epoch": 0.8144979203802734, + "grad_norm": 1.554432483448072, + "learning_rate": 1.750643753139798e-06, + "loss": 0.2899, + "step": 10281 + }, + { + "epoch": 0.8145771439889087, + "grad_norm": 1.28817796701486, + "learning_rate": 1.749193635112556e-06, + "loss": 0.2379, + "step": 10282 + }, + { + "epoch": 0.8146563675975441, + "grad_norm": 1.1402680726084276, + "learning_rate": 1.7477440603612127e-06, + "loss": 0.1677, + "step": 10283 + }, + { + "epoch": 0.8147355912061794, + "grad_norm": 1.5324841499773714, + "learning_rate": 1.746295028981213e-06, + "loss": 0.246, + "step": 10284 + }, + { + "epoch": 0.8148148148148148, + "grad_norm": 1.4081360060841852, + "learning_rate": 1.7448465410679737e-06, + "loss": 0.1446, + "step": 10285 + }, + { + "epoch": 0.8148940384234502, + "grad_norm": 1.5748660026528625, + "learning_rate": 1.7433985967168686e-06, + "loss": 0.2066, + "step": 10286 + }, + { + "epoch": 0.8149732620320855, + "grad_norm": 1.324420943349605, + "learning_rate": 1.7419511960232384e-06, + "loss": 0.2073, + "step": 10287 + }, + { + "epoch": 0.815052485640721, + "grad_norm": 1.2594131264642532, + "learning_rate": 1.7405043390823827e-06, + "loss": 0.1789, + "step": 10288 + }, + { + "epoch": 0.8151317092493563, + "grad_norm": 1.4520718884925568, + "learning_rate": 1.7390580259895783e-06, + "loss": 0.1849, + "step": 10289 + }, + { + "epoch": 0.8152109328579917, + "grad_norm": 1.3862950222123092, + "learning_rate": 1.7376122568400533e-06, + "loss": 0.2414, + "step": 10290 + }, + { + "epoch": 0.8152901564666271, + "grad_norm": 1.5026304541181141, + "learning_rate": 1.7361670317290014e-06, + "loss": 0.3038, + "step": 10291 + }, + { + "epoch": 0.8153693800752624, + "grad_norm": 1.0904410936026638, + "learning_rate": 1.7347223507515908e-06, + "loss": 0.2092, + "step": 10292 + }, + { + "epoch": 0.8154486036838978, + "grad_norm": 1.209528048456497, + "learning_rate": 1.7332782140029436e-06, + "loss": 0.234, + "step": 10293 + }, + { + "epoch": 0.8155278272925331, + "grad_norm": 1.3974957146397042, + "learning_rate": 1.7318346215781468e-06, + "loss": 0.1821, + "step": 10294 + }, + { + "epoch": 0.8156070509011686, + "grad_norm": 1.274340726574614, + "learning_rate": 1.7303915735722586e-06, + "loss": 0.2081, + "step": 10295 + }, + { + "epoch": 0.8156862745098039, + "grad_norm": 1.2421061391655035, + "learning_rate": 1.7289490700802947e-06, + "loss": 0.2373, + "step": 10296 + }, + { + "epoch": 0.8157654981184393, + "grad_norm": 1.4259941335948567, + "learning_rate": 1.727507111197233e-06, + "loss": 0.2602, + "step": 10297 + }, + { + "epoch": 0.8158447217270747, + "grad_norm": 1.192839607111933, + "learning_rate": 1.7260656970180268e-06, + "loss": 0.1846, + "step": 10298 + }, + { + "epoch": 0.81592394533571, + "grad_norm": 1.5370843421180753, + "learning_rate": 1.7246248276375832e-06, + "loss": 0.2546, + "step": 10299 + }, + { + "epoch": 0.8160031689443454, + "grad_norm": 1.2597798624620644, + "learning_rate": 1.7231845031507732e-06, + "loss": 0.2121, + "step": 10300 + }, + { + "epoch": 0.8160823925529808, + "grad_norm": 1.377613046561397, + "learning_rate": 1.72174472365244e-06, + "loss": 0.2565, + "step": 10301 + }, + { + "epoch": 0.8161616161616162, + "grad_norm": 1.785585502046287, + "learning_rate": 1.720305489237385e-06, + "loss": 0.2387, + "step": 10302 + }, + { + "epoch": 0.8162408397702515, + "grad_norm": 1.5014508382227219, + "learning_rate": 1.718866800000375e-06, + "loss": 0.2272, + "step": 10303 + }, + { + "epoch": 0.816320063378887, + "grad_norm": 1.4412576474118937, + "learning_rate": 1.7174286560361364e-06, + "loss": 0.3261, + "step": 10304 + }, + { + "epoch": 0.8163992869875223, + "grad_norm": 1.0957979987181663, + "learning_rate": 1.7159910574393702e-06, + "loss": 0.1618, + "step": 10305 + }, + { + "epoch": 0.8164785105961576, + "grad_norm": 1.209546413018405, + "learning_rate": 1.7145540043047327e-06, + "loss": 0.2248, + "step": 10306 + }, + { + "epoch": 0.816557734204793, + "grad_norm": 1.1621168537330795, + "learning_rate": 1.713117496726845e-06, + "loss": 0.1638, + "step": 10307 + }, + { + "epoch": 0.8166369578134284, + "grad_norm": 1.416173438535659, + "learning_rate": 1.711681534800298e-06, + "loss": 0.2776, + "step": 10308 + }, + { + "epoch": 0.8167161814220638, + "grad_norm": 1.437073638808075, + "learning_rate": 1.7102461186196418e-06, + "loss": 0.2706, + "step": 10309 + }, + { + "epoch": 0.8167954050306991, + "grad_norm": 1.5157392696702352, + "learning_rate": 1.7088112482793872e-06, + "loss": 0.2677, + "step": 10310 + }, + { + "epoch": 0.8168746286393346, + "grad_norm": 1.5381388140838068, + "learning_rate": 1.7073769238740213e-06, + "loss": 0.2769, + "step": 10311 + }, + { + "epoch": 0.8169538522479699, + "grad_norm": 1.3247262068757826, + "learning_rate": 1.7059431454979825e-06, + "loss": 0.1797, + "step": 10312 + }, + { + "epoch": 0.8170330758566052, + "grad_norm": 1.2728621172571906, + "learning_rate": 1.7045099132456766e-06, + "loss": 0.1784, + "step": 10313 + }, + { + "epoch": 0.8171122994652407, + "grad_norm": 2.1232685417655315, + "learning_rate": 1.7030772272114803e-06, + "loss": 0.3448, + "step": 10314 + }, + { + "epoch": 0.817191523073876, + "grad_norm": 1.4068864869705144, + "learning_rate": 1.7016450874897273e-06, + "loss": 0.1907, + "step": 10315 + }, + { + "epoch": 0.8172707466825114, + "grad_norm": 1.57852913028593, + "learning_rate": 1.7002134941747116e-06, + "loss": 0.2098, + "step": 10316 + }, + { + "epoch": 0.8173499702911468, + "grad_norm": 1.2301279745603875, + "learning_rate": 1.698782447360705e-06, + "loss": 0.1905, + "step": 10317 + }, + { + "epoch": 0.8174291938997821, + "grad_norm": 1.1207002454014958, + "learning_rate": 1.697351947141932e-06, + "loss": 0.1333, + "step": 10318 + }, + { + "epoch": 0.8175084175084175, + "grad_norm": 1.4359269579492713, + "learning_rate": 1.6959219936125827e-06, + "loss": 0.2583, + "step": 10319 + }, + { + "epoch": 0.8175876411170528, + "grad_norm": 1.3639522354032674, + "learning_rate": 1.6944925868668106e-06, + "loss": 0.2161, + "step": 10320 + }, + { + "epoch": 0.8176668647256883, + "grad_norm": 1.2383139379667367, + "learning_rate": 1.6930637269987415e-06, + "loss": 0.1981, + "step": 10321 + }, + { + "epoch": 0.8177460883343236, + "grad_norm": 1.3533050767149346, + "learning_rate": 1.691635414102455e-06, + "loss": 0.2228, + "step": 10322 + }, + { + "epoch": 0.817825311942959, + "grad_norm": 1.171941407824025, + "learning_rate": 1.6902076482719987e-06, + "loss": 0.1889, + "step": 10323 + }, + { + "epoch": 0.8179045355515944, + "grad_norm": 1.591989080662042, + "learning_rate": 1.6887804296013854e-06, + "loss": 0.2997, + "step": 10324 + }, + { + "epoch": 0.8179837591602297, + "grad_norm": 1.4794138470015934, + "learning_rate": 1.6873537581845866e-06, + "loss": 0.2466, + "step": 10325 + }, + { + "epoch": 0.8180629827688651, + "grad_norm": 1.6253032975568673, + "learning_rate": 1.6859276341155483e-06, + "loss": 0.2873, + "step": 10326 + }, + { + "epoch": 0.8181422063775005, + "grad_norm": 1.4149302691843844, + "learning_rate": 1.68450205748817e-06, + "loss": 0.2213, + "step": 10327 + }, + { + "epoch": 0.8182214299861359, + "grad_norm": 1.412223343932639, + "learning_rate": 1.6830770283963194e-06, + "loss": 0.2295, + "step": 10328 + }, + { + "epoch": 0.8183006535947712, + "grad_norm": 1.2809231024141314, + "learning_rate": 1.6816525469338252e-06, + "loss": 0.229, + "step": 10329 + }, + { + "epoch": 0.8183798772034067, + "grad_norm": 1.2995554972626846, + "learning_rate": 1.6802286131944889e-06, + "loss": 0.2339, + "step": 10330 + }, + { + "epoch": 0.818459100812042, + "grad_norm": 1.3471722232092467, + "learning_rate": 1.6788052272720656e-06, + "loss": 0.2197, + "step": 10331 + }, + { + "epoch": 0.8185383244206773, + "grad_norm": 1.2927139783459367, + "learning_rate": 1.677382389260277e-06, + "loss": 0.197, + "step": 10332 + }, + { + "epoch": 0.8186175480293127, + "grad_norm": 1.2592630129454887, + "learning_rate": 1.6759600992528147e-06, + "loss": 0.2238, + "step": 10333 + }, + { + "epoch": 0.8186967716379481, + "grad_norm": 1.213021850314714, + "learning_rate": 1.674538357343326e-06, + "loss": 0.2326, + "step": 10334 + }, + { + "epoch": 0.8187759952465835, + "grad_norm": 1.3713100036709034, + "learning_rate": 1.6731171636254263e-06, + "loss": 0.1972, + "step": 10335 + }, + { + "epoch": 0.8188552188552188, + "grad_norm": 1.0865475728156522, + "learning_rate": 1.6716965181926959e-06, + "loss": 0.1969, + "step": 10336 + }, + { + "epoch": 0.8189344424638543, + "grad_norm": 1.4639722932368535, + "learning_rate": 1.670276421138677e-06, + "loss": 0.23, + "step": 10337 + }, + { + "epoch": 0.8190136660724896, + "grad_norm": 1.36637401712028, + "learning_rate": 1.6688568725568732e-06, + "loss": 0.2396, + "step": 10338 + }, + { + "epoch": 0.8190928896811249, + "grad_norm": 1.1068080607427992, + "learning_rate": 1.6674378725407603e-06, + "loss": 0.1854, + "step": 10339 + }, + { + "epoch": 0.8191721132897604, + "grad_norm": 1.1373757725705118, + "learning_rate": 1.6660194211837687e-06, + "loss": 0.157, + "step": 10340 + }, + { + "epoch": 0.8192513368983957, + "grad_norm": 1.4529655792121996, + "learning_rate": 1.6646015185792963e-06, + "loss": 0.269, + "step": 10341 + }, + { + "epoch": 0.8193305605070311, + "grad_norm": 1.5709448244830584, + "learning_rate": 1.6631841648207092e-06, + "loss": 0.2288, + "step": 10342 + }, + { + "epoch": 0.8194097841156665, + "grad_norm": 1.2709831515115881, + "learning_rate": 1.6617673600013295e-06, + "loss": 0.2132, + "step": 10343 + }, + { + "epoch": 0.8194890077243019, + "grad_norm": 1.3817262216962494, + "learning_rate": 1.6603511042144494e-06, + "loss": 0.2113, + "step": 10344 + }, + { + "epoch": 0.8195682313329372, + "grad_norm": 1.4004216623460284, + "learning_rate": 1.6589353975533174e-06, + "loss": 0.2452, + "step": 10345 + }, + { + "epoch": 0.8196474549415725, + "grad_norm": 1.1397019793174794, + "learning_rate": 1.6575202401111578e-06, + "loss": 0.2117, + "step": 10346 + }, + { + "epoch": 0.819726678550208, + "grad_norm": 1.3005336089755082, + "learning_rate": 1.6561056319811497e-06, + "loss": 0.2337, + "step": 10347 + }, + { + "epoch": 0.8198059021588433, + "grad_norm": 1.2571241667943538, + "learning_rate": 1.654691573256434e-06, + "loss": 0.1912, + "step": 10348 + }, + { + "epoch": 0.8198851257674787, + "grad_norm": 1.4114978669902711, + "learning_rate": 1.653278064030126e-06, + "loss": 0.2705, + "step": 10349 + }, + { + "epoch": 0.8199643493761141, + "grad_norm": 2.1951536027815726, + "learning_rate": 1.651865104395296e-06, + "loss": 0.3389, + "step": 10350 + }, + { + "epoch": 0.8200435729847495, + "grad_norm": 1.4692628375697956, + "learning_rate": 1.6504526944449772e-06, + "loss": 0.283, + "step": 10351 + }, + { + "epoch": 0.8201227965933848, + "grad_norm": 0.9088173713682491, + "learning_rate": 1.6490408342721764e-06, + "loss": 0.1667, + "step": 10352 + }, + { + "epoch": 0.8202020202020202, + "grad_norm": 1.4165749674275787, + "learning_rate": 1.6476295239698537e-06, + "loss": 0.2307, + "step": 10353 + }, + { + "epoch": 0.8202812438106556, + "grad_norm": 1.5300314861445203, + "learning_rate": 1.6462187636309345e-06, + "loss": 0.2924, + "step": 10354 + }, + { + "epoch": 0.8203604674192909, + "grad_norm": 1.4401525327339801, + "learning_rate": 1.6448085533483172e-06, + "loss": 0.3099, + "step": 10355 + }, + { + "epoch": 0.8204396910279264, + "grad_norm": 1.3132576232091553, + "learning_rate": 1.6433988932148547e-06, + "loss": 0.1781, + "step": 10356 + }, + { + "epoch": 0.8205189146365617, + "grad_norm": 1.421611373861576, + "learning_rate": 1.6419897833233644e-06, + "loss": 0.2676, + "step": 10357 + }, + { + "epoch": 0.8205981382451971, + "grad_norm": 1.3529378320975918, + "learning_rate": 1.6405812237666296e-06, + "loss": 0.2506, + "step": 10358 + }, + { + "epoch": 0.8206773618538324, + "grad_norm": 1.2486852963495985, + "learning_rate": 1.6391732146373994e-06, + "loss": 0.2156, + "step": 10359 + }, + { + "epoch": 0.8207565854624678, + "grad_norm": 1.0582853093689502, + "learning_rate": 1.6377657560283844e-06, + "loss": 0.1363, + "step": 10360 + }, + { + "epoch": 0.8208358090711032, + "grad_norm": 1.1741922401650111, + "learning_rate": 1.6363588480322545e-06, + "loss": 0.1976, + "step": 10361 + }, + { + "epoch": 0.8209150326797385, + "grad_norm": 1.3491918239893574, + "learning_rate": 1.6349524907416536e-06, + "loss": 0.2448, + "step": 10362 + }, + { + "epoch": 0.820994256288374, + "grad_norm": 1.3061875093073403, + "learning_rate": 1.6335466842491821e-06, + "loss": 0.2547, + "step": 10363 + }, + { + "epoch": 0.8210734798970093, + "grad_norm": 1.2939148498516355, + "learning_rate": 1.6321414286474014e-06, + "loss": 0.2417, + "step": 10364 + }, + { + "epoch": 0.8211527035056447, + "grad_norm": 1.1108571738423376, + "learning_rate": 1.6307367240288463e-06, + "loss": 0.2071, + "step": 10365 + }, + { + "epoch": 0.8212319271142801, + "grad_norm": 1.4448227630754369, + "learning_rate": 1.6293325704860087e-06, + "loss": 0.2877, + "step": 10366 + }, + { + "epoch": 0.8213111507229154, + "grad_norm": 1.5158763692825543, + "learning_rate": 1.6279289681113407e-06, + "loss": 0.2142, + "step": 10367 + }, + { + "epoch": 0.8213903743315508, + "grad_norm": 1.146351910766024, + "learning_rate": 1.626525916997269e-06, + "loss": 0.1805, + "step": 10368 + }, + { + "epoch": 0.8214695979401861, + "grad_norm": 1.0646648279273332, + "learning_rate": 1.6251234172361763e-06, + "loss": 0.1766, + "step": 10369 + }, + { + "epoch": 0.8215488215488216, + "grad_norm": 1.1809710572867445, + "learning_rate": 1.623721468920405e-06, + "loss": 0.1829, + "step": 10370 + }, + { + "epoch": 0.8216280451574569, + "grad_norm": 1.3656010045319111, + "learning_rate": 1.6223200721422739e-06, + "loss": 0.2001, + "step": 10371 + }, + { + "epoch": 0.8217072687660923, + "grad_norm": 1.509409543101267, + "learning_rate": 1.6209192269940555e-06, + "loss": 0.1841, + "step": 10372 + }, + { + "epoch": 0.8217864923747277, + "grad_norm": 1.324564976827142, + "learning_rate": 1.6195189335679884e-06, + "loss": 0.2096, + "step": 10373 + }, + { + "epoch": 0.821865715983363, + "grad_norm": 1.5918482414211266, + "learning_rate": 1.6181191919562734e-06, + "loss": 0.3143, + "step": 10374 + }, + { + "epoch": 0.8219449395919984, + "grad_norm": 1.4284160694796504, + "learning_rate": 1.6167200022510799e-06, + "loss": 0.2241, + "step": 10375 + }, + { + "epoch": 0.8220241632006338, + "grad_norm": 1.438861035765178, + "learning_rate": 1.6153213645445376e-06, + "loss": 0.3243, + "step": 10376 + }, + { + "epoch": 0.8221033868092692, + "grad_norm": 1.2804668749894867, + "learning_rate": 1.613923278928735e-06, + "loss": 0.2377, + "step": 10377 + }, + { + "epoch": 0.8221826104179045, + "grad_norm": 1.640306674414265, + "learning_rate": 1.6125257454957365e-06, + "loss": 0.2982, + "step": 10378 + }, + { + "epoch": 0.82226183402654, + "grad_norm": 1.473771925798413, + "learning_rate": 1.6111287643375607e-06, + "loss": 0.2463, + "step": 10379 + }, + { + "epoch": 0.8223410576351753, + "grad_norm": 1.4404475061089161, + "learning_rate": 1.6097323355461869e-06, + "loss": 0.181, + "step": 10380 + }, + { + "epoch": 0.8224202812438106, + "grad_norm": 1.4478799335377857, + "learning_rate": 1.6083364592135708e-06, + "loss": 0.2052, + "step": 10381 + }, + { + "epoch": 0.822499504852446, + "grad_norm": 1.2013489423930401, + "learning_rate": 1.6069411354316212e-06, + "loss": 0.183, + "step": 10382 + }, + { + "epoch": 0.8225787284610814, + "grad_norm": 1.3995359702064103, + "learning_rate": 1.6055463642922098e-06, + "loss": 0.177, + "step": 10383 + }, + { + "epoch": 0.8226579520697168, + "grad_norm": 1.1100114667780525, + "learning_rate": 1.6041521458871812e-06, + "loss": 0.195, + "step": 10384 + }, + { + "epoch": 0.8227371756783521, + "grad_norm": 1.392688884767605, + "learning_rate": 1.6027584803083351e-06, + "loss": 0.2193, + "step": 10385 + }, + { + "epoch": 0.8228163992869876, + "grad_norm": 1.316965799772601, + "learning_rate": 1.6013653676474371e-06, + "loss": 0.2444, + "step": 10386 + }, + { + "epoch": 0.8228956228956229, + "grad_norm": 1.1463397689776766, + "learning_rate": 1.5999728079962197e-06, + "loss": 0.2095, + "step": 10387 + }, + { + "epoch": 0.8229748465042582, + "grad_norm": 1.318659053358065, + "learning_rate": 1.5985808014463745e-06, + "loss": 0.2093, + "step": 10388 + }, + { + "epoch": 0.8230540701128937, + "grad_norm": 1.4214008979548918, + "learning_rate": 1.5971893480895583e-06, + "loss": 0.1911, + "step": 10389 + }, + { + "epoch": 0.823133293721529, + "grad_norm": 1.1974444608587302, + "learning_rate": 1.5957984480173893e-06, + "loss": 0.2136, + "step": 10390 + }, + { + "epoch": 0.8232125173301644, + "grad_norm": 1.3597731862588451, + "learning_rate": 1.5944081013214575e-06, + "loss": 0.1795, + "step": 10391 + }, + { + "epoch": 0.8232917409387998, + "grad_norm": 1.4632313272608521, + "learning_rate": 1.593018308093306e-06, + "loss": 0.3382, + "step": 10392 + }, + { + "epoch": 0.8233709645474351, + "grad_norm": 1.6441841653193927, + "learning_rate": 1.5916290684244452e-06, + "loss": 0.2671, + "step": 10393 + }, + { + "epoch": 0.8234501881560705, + "grad_norm": 1.4790059493130998, + "learning_rate": 1.5902403824063539e-06, + "loss": 0.2519, + "step": 10394 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 1.5018856840644295, + "learning_rate": 1.5888522501304682e-06, + "loss": 0.2535, + "step": 10395 + }, + { + "epoch": 0.8236086353733413, + "grad_norm": 1.4279542940321752, + "learning_rate": 1.587464671688187e-06, + "loss": 0.205, + "step": 10396 + }, + { + "epoch": 0.8236878589819766, + "grad_norm": 1.4944454237399976, + "learning_rate": 1.5860776471708816e-06, + "loss": 0.2653, + "step": 10397 + }, + { + "epoch": 0.823767082590612, + "grad_norm": 1.52851382509781, + "learning_rate": 1.5846911766698781e-06, + "loss": 0.2545, + "step": 10398 + }, + { + "epoch": 0.8238463061992474, + "grad_norm": 1.5499167860420955, + "learning_rate": 1.5833052602764664e-06, + "loss": 0.2935, + "step": 10399 + }, + { + "epoch": 0.8239255298078827, + "grad_norm": 1.2656159569319236, + "learning_rate": 1.5819198980819096e-06, + "loss": 0.1989, + "step": 10400 + }, + { + "epoch": 0.8240047534165181, + "grad_norm": 1.2850325877446969, + "learning_rate": 1.5805350901774197e-06, + "loss": 0.2054, + "step": 10401 + }, + { + "epoch": 0.8240839770251535, + "grad_norm": 1.1260981729498543, + "learning_rate": 1.5791508366541797e-06, + "loss": 0.1571, + "step": 10402 + }, + { + "epoch": 0.8241632006337889, + "grad_norm": 1.4404055901653536, + "learning_rate": 1.577767137603341e-06, + "loss": 0.1775, + "step": 10403 + }, + { + "epoch": 0.8242424242424242, + "grad_norm": 0.9534078221552195, + "learning_rate": 1.5763839931160108e-06, + "loss": 0.1479, + "step": 10404 + }, + { + "epoch": 0.8243216478510597, + "grad_norm": 1.384704749144126, + "learning_rate": 1.5750014032832617e-06, + "loss": 0.2461, + "step": 10405 + }, + { + "epoch": 0.824400871459695, + "grad_norm": 1.3006642087200235, + "learning_rate": 1.5736193681961332e-06, + "loss": 0.1674, + "step": 10406 + }, + { + "epoch": 0.8244800950683303, + "grad_norm": 1.1089500614798569, + "learning_rate": 1.5722378879456234e-06, + "loss": 0.1854, + "step": 10407 + }, + { + "epoch": 0.8245593186769657, + "grad_norm": 1.095246332754987, + "learning_rate": 1.5708569626226954e-06, + "loss": 0.1716, + "step": 10408 + }, + { + "epoch": 0.8246385422856011, + "grad_norm": 1.174671687959912, + "learning_rate": 1.5694765923182798e-06, + "loss": 0.1643, + "step": 10409 + }, + { + "epoch": 0.8247177658942365, + "grad_norm": 1.2825065746484636, + "learning_rate": 1.5680967771232659e-06, + "loss": 0.2172, + "step": 10410 + }, + { + "epoch": 0.8247969895028718, + "grad_norm": 1.2968781985275724, + "learning_rate": 1.5667175171285054e-06, + "loss": 0.1824, + "step": 10411 + }, + { + "epoch": 0.8248762131115073, + "grad_norm": 1.0978913996852, + "learning_rate": 1.5653388124248203e-06, + "loss": 0.2001, + "step": 10412 + }, + { + "epoch": 0.8249554367201426, + "grad_norm": 1.377710707717662, + "learning_rate": 1.5639606631029892e-06, + "loss": 0.2097, + "step": 10413 + }, + { + "epoch": 0.8250346603287779, + "grad_norm": 1.5427707606580752, + "learning_rate": 1.5625830692537569e-06, + "loss": 0.3436, + "step": 10414 + }, + { + "epoch": 0.8251138839374134, + "grad_norm": 1.1786801147703365, + "learning_rate": 1.561206030967828e-06, + "loss": 0.1814, + "step": 10415 + }, + { + "epoch": 0.8251931075460487, + "grad_norm": 1.2274595711774268, + "learning_rate": 1.5598295483358804e-06, + "loss": 0.2014, + "step": 10416 + }, + { + "epoch": 0.8252723311546841, + "grad_norm": 1.235756359941025, + "learning_rate": 1.5584536214485457e-06, + "loss": 0.2043, + "step": 10417 + }, + { + "epoch": 0.8253515547633195, + "grad_norm": 1.788015719964634, + "learning_rate": 1.5570782503964188e-06, + "loss": 0.2822, + "step": 10418 + }, + { + "epoch": 0.8254307783719549, + "grad_norm": 1.223167146359783, + "learning_rate": 1.5557034352700672e-06, + "loss": 0.1676, + "step": 10419 + }, + { + "epoch": 0.8255100019805902, + "grad_norm": 1.3122266132947737, + "learning_rate": 1.5543291761600133e-06, + "loss": 0.2539, + "step": 10420 + }, + { + "epoch": 0.8255892255892255, + "grad_norm": 1.212045610154158, + "learning_rate": 1.552955473156742e-06, + "loss": 0.2231, + "step": 10421 + }, + { + "epoch": 0.825668449197861, + "grad_norm": 1.350331950579808, + "learning_rate": 1.5515823263507112e-06, + "loss": 0.2474, + "step": 10422 + }, + { + "epoch": 0.8257476728064963, + "grad_norm": 1.0968873116705355, + "learning_rate": 1.5502097358323321e-06, + "loss": 0.1462, + "step": 10423 + }, + { + "epoch": 0.8258268964151317, + "grad_norm": 1.1023080961360494, + "learning_rate": 1.548837701691983e-06, + "loss": 0.1638, + "step": 10424 + }, + { + "epoch": 0.8259061200237671, + "grad_norm": 1.3081675469735337, + "learning_rate": 1.547466224020009e-06, + "loss": 0.1831, + "step": 10425 + }, + { + "epoch": 0.8259853436324025, + "grad_norm": 1.5820404729924766, + "learning_rate": 1.5460953029067128e-06, + "loss": 0.1919, + "step": 10426 + }, + { + "epoch": 0.8260645672410378, + "grad_norm": 1.294956337815726, + "learning_rate": 1.5447249384423624e-06, + "loss": 0.1771, + "step": 10427 + }, + { + "epoch": 0.8261437908496732, + "grad_norm": 1.3373271013669448, + "learning_rate": 1.543355130717189e-06, + "loss": 0.2279, + "step": 10428 + }, + { + "epoch": 0.8262230144583086, + "grad_norm": 1.1791264852630465, + "learning_rate": 1.5419858798213928e-06, + "loss": 0.2001, + "step": 10429 + }, + { + "epoch": 0.8263022380669439, + "grad_norm": 1.2237456306618302, + "learning_rate": 1.540617185845128e-06, + "loss": 0.22, + "step": 10430 + }, + { + "epoch": 0.8263814616755794, + "grad_norm": 1.158662628670014, + "learning_rate": 1.5392490488785151e-06, + "loss": 0.1863, + "step": 10431 + }, + { + "epoch": 0.8264606852842147, + "grad_norm": 1.6245465078614232, + "learning_rate": 1.537881469011645e-06, + "loss": 0.3369, + "step": 10432 + }, + { + "epoch": 0.8265399088928501, + "grad_norm": 1.3985079170662937, + "learning_rate": 1.5365144463345627e-06, + "loss": 0.2179, + "step": 10433 + }, + { + "epoch": 0.8266191325014854, + "grad_norm": 1.6526039938845045, + "learning_rate": 1.5351479809372772e-06, + "loss": 0.3087, + "step": 10434 + }, + { + "epoch": 0.8266983561101208, + "grad_norm": 1.2990904626019235, + "learning_rate": 1.5337820729097697e-06, + "loss": 0.1781, + "step": 10435 + }, + { + "epoch": 0.8267775797187562, + "grad_norm": 1.4811222804168258, + "learning_rate": 1.5324167223419762e-06, + "loss": 0.2721, + "step": 10436 + }, + { + "epoch": 0.8268568033273915, + "grad_norm": 1.4557939736013057, + "learning_rate": 1.5310519293237958e-06, + "loss": 0.2832, + "step": 10437 + }, + { + "epoch": 0.826936026936027, + "grad_norm": 1.23819330798364, + "learning_rate": 1.5296876939450978e-06, + "loss": 0.1812, + "step": 10438 + }, + { + "epoch": 0.8270152505446623, + "grad_norm": 1.3790964660525777, + "learning_rate": 1.528324016295709e-06, + "loss": 0.2235, + "step": 10439 + }, + { + "epoch": 0.8270944741532977, + "grad_norm": 1.3793749855621042, + "learning_rate": 1.5269608964654181e-06, + "loss": 0.176, + "step": 10440 + }, + { + "epoch": 0.8271736977619331, + "grad_norm": 1.4205379380261078, + "learning_rate": 1.525598334543985e-06, + "loss": 0.1938, + "step": 10441 + }, + { + "epoch": 0.8272529213705684, + "grad_norm": 1.4264341725788607, + "learning_rate": 1.524236330621125e-06, + "loss": 0.2735, + "step": 10442 + }, + { + "epoch": 0.8273321449792038, + "grad_norm": 1.404979967365074, + "learning_rate": 1.5228748847865205e-06, + "loss": 0.219, + "step": 10443 + }, + { + "epoch": 0.8274113685878391, + "grad_norm": 1.3157075881798561, + "learning_rate": 1.5215139971298131e-06, + "loss": 0.2062, + "step": 10444 + }, + { + "epoch": 0.8274905921964746, + "grad_norm": 1.2271147826848667, + "learning_rate": 1.5201536677406147e-06, + "loss": 0.215, + "step": 10445 + }, + { + "epoch": 0.8275698158051099, + "grad_norm": 1.2056536752009905, + "learning_rate": 1.518793896708496e-06, + "loss": 0.1767, + "step": 10446 + }, + { + "epoch": 0.8276490394137453, + "grad_norm": 1.4052691483357655, + "learning_rate": 1.517434684122987e-06, + "loss": 0.1866, + "step": 10447 + }, + { + "epoch": 0.8277282630223807, + "grad_norm": 1.6209246354830482, + "learning_rate": 1.5160760300735911e-06, + "loss": 0.3082, + "step": 10448 + }, + { + "epoch": 0.827807486631016, + "grad_norm": 1.6872191124006044, + "learning_rate": 1.5147179346497665e-06, + "loss": 0.2915, + "step": 10449 + }, + { + "epoch": 0.8278867102396514, + "grad_norm": 1.616362676388551, + "learning_rate": 1.513360397940935e-06, + "loss": 0.364, + "step": 10450 + }, + { + "epoch": 0.8279659338482868, + "grad_norm": 1.6069752087735503, + "learning_rate": 1.5120034200364885e-06, + "loss": 0.2942, + "step": 10451 + }, + { + "epoch": 0.8280451574569222, + "grad_norm": 1.1670965955740882, + "learning_rate": 1.5106470010257758e-06, + "loss": 0.1975, + "step": 10452 + }, + { + "epoch": 0.8281243810655575, + "grad_norm": 1.218441728890744, + "learning_rate": 1.509291140998107e-06, + "loss": 0.1941, + "step": 10453 + }, + { + "epoch": 0.828203604674193, + "grad_norm": 1.4345412072207402, + "learning_rate": 1.5079358400427635e-06, + "loss": 0.2281, + "step": 10454 + }, + { + "epoch": 0.8282828282828283, + "grad_norm": 1.4721384706701444, + "learning_rate": 1.5065810982489849e-06, + "loss": 0.2506, + "step": 10455 + }, + { + "epoch": 0.8283620518914636, + "grad_norm": 1.3730560855914193, + "learning_rate": 1.5052269157059707e-06, + "loss": 0.2224, + "step": 10456 + }, + { + "epoch": 0.828441275500099, + "grad_norm": 1.0984627790630974, + "learning_rate": 1.503873292502892e-06, + "loss": 0.1761, + "step": 10457 + }, + { + "epoch": 0.8285204991087344, + "grad_norm": 1.276739895936656, + "learning_rate": 1.5025202287288764e-06, + "loss": 0.186, + "step": 10458 + }, + { + "epoch": 0.8285997227173698, + "grad_norm": 1.158664052023081, + "learning_rate": 1.501167724473016e-06, + "loss": 0.2495, + "step": 10459 + }, + { + "epoch": 0.8286789463260051, + "grad_norm": 1.1677818488737044, + "learning_rate": 1.499815779824365e-06, + "loss": 0.1792, + "step": 10460 + }, + { + "epoch": 0.8287581699346406, + "grad_norm": 1.2078443445862654, + "learning_rate": 1.4984643948719469e-06, + "loss": 0.178, + "step": 10461 + }, + { + "epoch": 0.8288373935432759, + "grad_norm": 1.2939661778658178, + "learning_rate": 1.4971135697047422e-06, + "loss": 0.2069, + "step": 10462 + }, + { + "epoch": 0.8289166171519112, + "grad_norm": 1.3709717316926502, + "learning_rate": 1.4957633044116925e-06, + "loss": 0.2213, + "step": 10463 + }, + { + "epoch": 0.8289958407605467, + "grad_norm": 1.2694239326792562, + "learning_rate": 1.4944135990817121e-06, + "loss": 0.2097, + "step": 10464 + }, + { + "epoch": 0.829075064369182, + "grad_norm": 1.1448841492351802, + "learning_rate": 1.4930644538036709e-06, + "loss": 0.1331, + "step": 10465 + }, + { + "epoch": 0.8291542879778174, + "grad_norm": 1.447332060384047, + "learning_rate": 1.4917158686663992e-06, + "loss": 0.209, + "step": 10466 + }, + { + "epoch": 0.8292335115864528, + "grad_norm": 1.1696458467418067, + "learning_rate": 1.490367843758701e-06, + "loss": 0.1956, + "step": 10467 + }, + { + "epoch": 0.8293127351950882, + "grad_norm": 1.2903550393989425, + "learning_rate": 1.4890203791693337e-06, + "loss": 0.2446, + "step": 10468 + }, + { + "epoch": 0.8293919588037235, + "grad_norm": 1.2929151922134108, + "learning_rate": 1.4876734749870213e-06, + "loss": 0.1887, + "step": 10469 + }, + { + "epoch": 0.8294711824123588, + "grad_norm": 1.3493758751616824, + "learning_rate": 1.4863271313004535e-06, + "loss": 0.2603, + "step": 10470 + }, + { + "epoch": 0.8295504060209943, + "grad_norm": 1.3427871768272837, + "learning_rate": 1.4849813481982788e-06, + "loss": 0.2038, + "step": 10471 + }, + { + "epoch": 0.8296296296296296, + "grad_norm": 1.09143748838598, + "learning_rate": 1.483636125769108e-06, + "loss": 0.1815, + "step": 10472 + }, + { + "epoch": 0.829708853238265, + "grad_norm": 1.5203669474670853, + "learning_rate": 1.482291464101523e-06, + "loss": 0.2375, + "step": 10473 + }, + { + "epoch": 0.8297880768469004, + "grad_norm": 1.3708184548188178, + "learning_rate": 1.480947363284061e-06, + "loss": 0.2453, + "step": 10474 + }, + { + "epoch": 0.8298673004555357, + "grad_norm": 1.64704832916517, + "learning_rate": 1.4796038234052235e-06, + "loss": 0.2428, + "step": 10475 + }, + { + "epoch": 0.8299465240641711, + "grad_norm": 1.4317492154301195, + "learning_rate": 1.4782608445534741e-06, + "loss": 0.2513, + "step": 10476 + }, + { + "epoch": 0.8300257476728065, + "grad_norm": 1.2600340543932351, + "learning_rate": 1.4769184268172465e-06, + "loss": 0.1873, + "step": 10477 + }, + { + "epoch": 0.8301049712814419, + "grad_norm": 1.4383762299639489, + "learning_rate": 1.4755765702849311e-06, + "loss": 0.247, + "step": 10478 + }, + { + "epoch": 0.8301841948900772, + "grad_norm": 1.4803630563481738, + "learning_rate": 1.4742352750448806e-06, + "loss": 0.2188, + "step": 10479 + }, + { + "epoch": 0.8302634184987127, + "grad_norm": 1.354573727373033, + "learning_rate": 1.4728945411854135e-06, + "loss": 0.2612, + "step": 10480 + }, + { + "epoch": 0.830342642107348, + "grad_norm": 1.4326339975414173, + "learning_rate": 1.4715543687948096e-06, + "loss": 0.2588, + "step": 10481 + }, + { + "epoch": 0.8304218657159833, + "grad_norm": 1.2164016612067483, + "learning_rate": 1.470214757961317e-06, + "loss": 0.198, + "step": 10482 + }, + { + "epoch": 0.8305010893246187, + "grad_norm": 1.1555759947209618, + "learning_rate": 1.4688757087731386e-06, + "loss": 0.2237, + "step": 10483 + }, + { + "epoch": 0.8305803129332541, + "grad_norm": 1.5733336407777812, + "learning_rate": 1.4675372213184458e-06, + "loss": 0.2592, + "step": 10484 + }, + { + "epoch": 0.8306595365418895, + "grad_norm": 1.8759869510185854, + "learning_rate": 1.4661992956853699e-06, + "loss": 0.259, + "step": 10485 + }, + { + "epoch": 0.8307387601505248, + "grad_norm": 1.5596886337090299, + "learning_rate": 1.4648619319620105e-06, + "loss": 0.2756, + "step": 10486 + }, + { + "epoch": 0.8308179837591603, + "grad_norm": 1.2608031732856335, + "learning_rate": 1.463525130236424e-06, + "loss": 0.1794, + "step": 10487 + }, + { + "epoch": 0.8308972073677956, + "grad_norm": 1.1529548292727665, + "learning_rate": 1.4621888905966308e-06, + "loss": 0.176, + "step": 10488 + }, + { + "epoch": 0.8309764309764309, + "grad_norm": 1.4339897861899062, + "learning_rate": 1.4608532131306198e-06, + "loss": 0.3178, + "step": 10489 + }, + { + "epoch": 0.8310556545850664, + "grad_norm": 1.509207170922193, + "learning_rate": 1.459518097926337e-06, + "loss": 0.2594, + "step": 10490 + }, + { + "epoch": 0.8311348781937017, + "grad_norm": 0.933489930944928, + "learning_rate": 1.4581835450716907e-06, + "loss": 0.122, + "step": 10491 + }, + { + "epoch": 0.8312141018023371, + "grad_norm": 1.3098267596961726, + "learning_rate": 1.4568495546545603e-06, + "loss": 0.2302, + "step": 10492 + }, + { + "epoch": 0.8312933254109725, + "grad_norm": 1.5673986074660746, + "learning_rate": 1.4555161267627793e-06, + "loss": 0.3526, + "step": 10493 + }, + { + "epoch": 0.8313725490196079, + "grad_norm": 1.3639975537414533, + "learning_rate": 1.4541832614841455e-06, + "loss": 0.2062, + "step": 10494 + }, + { + "epoch": 0.8314517726282432, + "grad_norm": 1.2594827228710086, + "learning_rate": 1.4528509589064276e-06, + "loss": 0.1386, + "step": 10495 + }, + { + "epoch": 0.8315309962368785, + "grad_norm": 1.5253252160816442, + "learning_rate": 1.4515192191173466e-06, + "loss": 0.2561, + "step": 10496 + }, + { + "epoch": 0.831610219845514, + "grad_norm": 1.2112868283898253, + "learning_rate": 1.45018804220459e-06, + "loss": 0.1939, + "step": 10497 + }, + { + "epoch": 0.8316894434541493, + "grad_norm": 1.0772623401697152, + "learning_rate": 1.4488574282558143e-06, + "loss": 0.1722, + "step": 10498 + }, + { + "epoch": 0.8317686670627847, + "grad_norm": 1.1517497412415536, + "learning_rate": 1.4475273773586319e-06, + "loss": 0.191, + "step": 10499 + }, + { + "epoch": 0.8318478906714201, + "grad_norm": 1.3576760029433559, + "learning_rate": 1.446197889600619e-06, + "loss": 0.2114, + "step": 10500 + }, + { + "epoch": 0.8319271142800555, + "grad_norm": 1.3456224959096212, + "learning_rate": 1.444868965069315e-06, + "loss": 0.2632, + "step": 10501 + }, + { + "epoch": 0.8320063378886908, + "grad_norm": 1.1746166613153766, + "learning_rate": 1.443540603852227e-06, + "loss": 0.1889, + "step": 10502 + }, + { + "epoch": 0.8320855614973262, + "grad_norm": 1.117846493387386, + "learning_rate": 1.4422128060368201e-06, + "loss": 0.15, + "step": 10503 + }, + { + "epoch": 0.8321647851059616, + "grad_norm": 1.7555418597228245, + "learning_rate": 1.4408855717105197e-06, + "loss": 0.2966, + "step": 10504 + }, + { + "epoch": 0.8322440087145969, + "grad_norm": 1.2560473747015137, + "learning_rate": 1.4395589009607225e-06, + "loss": 0.2138, + "step": 10505 + }, + { + "epoch": 0.8323232323232324, + "grad_norm": 1.3595679190079988, + "learning_rate": 1.4382327938747808e-06, + "loss": 0.2185, + "step": 10506 + }, + { + "epoch": 0.8324024559318677, + "grad_norm": 1.298919626404769, + "learning_rate": 1.4369072505400117e-06, + "loss": 0.1877, + "step": 10507 + }, + { + "epoch": 0.8324816795405031, + "grad_norm": 1.224169809273626, + "learning_rate": 1.4355822710436995e-06, + "loss": 0.174, + "step": 10508 + }, + { + "epoch": 0.8325609031491384, + "grad_norm": 1.2992214249532263, + "learning_rate": 1.4342578554730858e-06, + "loss": 0.1638, + "step": 10509 + }, + { + "epoch": 0.8326401267577738, + "grad_norm": 1.1051795468237293, + "learning_rate": 1.4329340039153738e-06, + "loss": 0.1907, + "step": 10510 + }, + { + "epoch": 0.8327193503664092, + "grad_norm": 1.307194867459968, + "learning_rate": 1.4316107164577376e-06, + "loss": 0.1835, + "step": 10511 + }, + { + "epoch": 0.8327985739750445, + "grad_norm": 1.1230924771616186, + "learning_rate": 1.430287993187307e-06, + "loss": 0.2111, + "step": 10512 + }, + { + "epoch": 0.83287779758368, + "grad_norm": 1.2699326232187946, + "learning_rate": 1.4289658341911782e-06, + "loss": 0.1635, + "step": 10513 + }, + { + "epoch": 0.8329570211923153, + "grad_norm": 1.46608894835311, + "learning_rate": 1.4276442395564049e-06, + "loss": 0.2441, + "step": 10514 + }, + { + "epoch": 0.8330362448009507, + "grad_norm": 1.2287292315959046, + "learning_rate": 1.426323209370014e-06, + "loss": 0.1739, + "step": 10515 + }, + { + "epoch": 0.8331154684095861, + "grad_norm": 1.5236128841183416, + "learning_rate": 1.425002743718985e-06, + "loss": 0.2452, + "step": 10516 + }, + { + "epoch": 0.8331946920182214, + "grad_norm": 1.5824350853744693, + "learning_rate": 1.4236828426902626e-06, + "loss": 0.271, + "step": 10517 + }, + { + "epoch": 0.8332739156268568, + "grad_norm": 1.0270909450230639, + "learning_rate": 1.4223635063707619e-06, + "loss": 0.1875, + "step": 10518 + }, + { + "epoch": 0.8333531392354921, + "grad_norm": 1.2362274115367011, + "learning_rate": 1.421044734847351e-06, + "loss": 0.2117, + "step": 10519 + }, + { + "epoch": 0.8334323628441276, + "grad_norm": 1.5682810614841907, + "learning_rate": 1.4197265282068618e-06, + "loss": 0.2766, + "step": 10520 + }, + { + "epoch": 0.8335115864527629, + "grad_norm": 1.8509526057611487, + "learning_rate": 1.4184088865360978e-06, + "loss": 0.3006, + "step": 10521 + }, + { + "epoch": 0.8335908100613983, + "grad_norm": 1.2981534492302385, + "learning_rate": 1.4170918099218166e-06, + "loss": 0.1939, + "step": 10522 + }, + { + "epoch": 0.8336700336700337, + "grad_norm": 1.6089274125683373, + "learning_rate": 1.41577529845074e-06, + "loss": 0.2716, + "step": 10523 + }, + { + "epoch": 0.833749257278669, + "grad_norm": 1.1815635085174374, + "learning_rate": 1.4144593522095563e-06, + "loss": 0.2106, + "step": 10524 + }, + { + "epoch": 0.8338284808873044, + "grad_norm": 2.0633184183539854, + "learning_rate": 1.4131439712849148e-06, + "loss": 0.2271, + "step": 10525 + }, + { + "epoch": 0.8339077044959398, + "grad_norm": 1.4039238887972836, + "learning_rate": 1.4118291557634223e-06, + "loss": 0.3216, + "step": 10526 + }, + { + "epoch": 0.8339869281045752, + "grad_norm": 1.5581597463961219, + "learning_rate": 1.410514905731658e-06, + "loss": 0.2781, + "step": 10527 + }, + { + "epoch": 0.8340661517132105, + "grad_norm": 1.2319396328229066, + "learning_rate": 1.4092012212761574e-06, + "loss": 0.2069, + "step": 10528 + }, + { + "epoch": 0.834145375321846, + "grad_norm": 1.0120120004353914, + "learning_rate": 1.4078881024834213e-06, + "loss": 0.1267, + "step": 10529 + }, + { + "epoch": 0.8342245989304813, + "grad_norm": 1.203910036146133, + "learning_rate": 1.406575549439907e-06, + "loss": 0.1761, + "step": 10530 + }, + { + "epoch": 0.8343038225391166, + "grad_norm": 1.3937406881915537, + "learning_rate": 1.4052635622320477e-06, + "loss": 0.2266, + "step": 10531 + }, + { + "epoch": 0.834383046147752, + "grad_norm": 1.4767836842521311, + "learning_rate": 1.4039521409462265e-06, + "loss": 0.2669, + "step": 10532 + }, + { + "epoch": 0.8344622697563874, + "grad_norm": 1.2466478366329732, + "learning_rate": 1.4026412856687931e-06, + "loss": 0.2587, + "step": 10533 + }, + { + "epoch": 0.8345414933650228, + "grad_norm": 1.2716867736926591, + "learning_rate": 1.4013309964860667e-06, + "loss": 0.1974, + "step": 10534 + }, + { + "epoch": 0.8346207169736581, + "grad_norm": 1.1441397814842527, + "learning_rate": 1.4000212734843187e-06, + "loss": 0.1749, + "step": 10535 + }, + { + "epoch": 0.8346999405822936, + "grad_norm": 1.8932301766464903, + "learning_rate": 1.3987121167497874e-06, + "loss": 0.2529, + "step": 10536 + }, + { + "epoch": 0.8347791641909289, + "grad_norm": 1.3135878958835159, + "learning_rate": 1.3974035263686792e-06, + "loss": 0.2206, + "step": 10537 + }, + { + "epoch": 0.8348583877995642, + "grad_norm": 0.9846565731925334, + "learning_rate": 1.396095502427155e-06, + "loss": 0.1067, + "step": 10538 + }, + { + "epoch": 0.8349376114081997, + "grad_norm": 1.2814572175289134, + "learning_rate": 1.3947880450113404e-06, + "loss": 0.2197, + "step": 10539 + }, + { + "epoch": 0.835016835016835, + "grad_norm": 1.3351611547655284, + "learning_rate": 1.39348115420733e-06, + "loss": 0.2049, + "step": 10540 + }, + { + "epoch": 0.8350960586254704, + "grad_norm": 1.3186450346978396, + "learning_rate": 1.392174830101174e-06, + "loss": 0.1883, + "step": 10541 + }, + { + "epoch": 0.8351752822341058, + "grad_norm": 1.0813180871141732, + "learning_rate": 1.3908690727788842e-06, + "loss": 0.1367, + "step": 10542 + }, + { + "epoch": 0.8352545058427412, + "grad_norm": 1.2404744643224725, + "learning_rate": 1.3895638823264447e-06, + "loss": 0.2313, + "step": 10543 + }, + { + "epoch": 0.8353337294513765, + "grad_norm": 1.470303528888458, + "learning_rate": 1.3882592588297917e-06, + "loss": 0.2156, + "step": 10544 + }, + { + "epoch": 0.8354129530600118, + "grad_norm": 1.1296365480557664, + "learning_rate": 1.38695520237483e-06, + "loss": 0.2032, + "step": 10545 + }, + { + "epoch": 0.8354921766686473, + "grad_norm": 1.323176135950088, + "learning_rate": 1.3856517130474235e-06, + "loss": 0.2389, + "step": 10546 + }, + { + "epoch": 0.8355714002772826, + "grad_norm": 1.3284572194270088, + "learning_rate": 1.384348790933403e-06, + "loss": 0.2813, + "step": 10547 + }, + { + "epoch": 0.835650623885918, + "grad_norm": 1.137063173093719, + "learning_rate": 1.3830464361185592e-06, + "loss": 0.158, + "step": 10548 + }, + { + "epoch": 0.8357298474945534, + "grad_norm": 1.3689786919127183, + "learning_rate": 1.3817446486886433e-06, + "loss": 0.1825, + "step": 10549 + }, + { + "epoch": 0.8358090711031888, + "grad_norm": 1.166034337622154, + "learning_rate": 1.3804434287293756e-06, + "loss": 0.1648, + "step": 10550 + }, + { + "epoch": 0.8358882947118241, + "grad_norm": 1.4022572638477493, + "learning_rate": 1.3791427763264342e-06, + "loss": 0.2287, + "step": 10551 + }, + { + "epoch": 0.8359675183204595, + "grad_norm": 1.6221468833774844, + "learning_rate": 1.3778426915654575e-06, + "loss": 0.2689, + "step": 10552 + }, + { + "epoch": 0.8360467419290949, + "grad_norm": 1.411561457489168, + "learning_rate": 1.3765431745320546e-06, + "loss": 0.1948, + "step": 10553 + }, + { + "epoch": 0.8361259655377302, + "grad_norm": 1.3625077478813943, + "learning_rate": 1.3752442253117903e-06, + "loss": 0.2672, + "step": 10554 + }, + { + "epoch": 0.8362051891463657, + "grad_norm": 1.1818437776152657, + "learning_rate": 1.373945843990192e-06, + "loss": 0.2, + "step": 10555 + }, + { + "epoch": 0.836284412755001, + "grad_norm": 1.393240285654648, + "learning_rate": 1.3726480306527578e-06, + "loss": 0.2114, + "step": 10556 + }, + { + "epoch": 0.8363636363636363, + "grad_norm": 1.3860434319826995, + "learning_rate": 1.3713507853849373e-06, + "loss": 0.2425, + "step": 10557 + }, + { + "epoch": 0.8364428599722717, + "grad_norm": 1.0172884683093302, + "learning_rate": 1.3700541082721464e-06, + "loss": 0.153, + "step": 10558 + }, + { + "epoch": 0.8365220835809071, + "grad_norm": 1.5240819100071796, + "learning_rate": 1.3687579993997703e-06, + "loss": 0.2846, + "step": 10559 + }, + { + "epoch": 0.8366013071895425, + "grad_norm": 1.5108697664713944, + "learning_rate": 1.3674624588531481e-06, + "loss": 0.2132, + "step": 10560 + }, + { + "epoch": 0.8366805307981778, + "grad_norm": 1.1935349047651764, + "learning_rate": 1.3661674867175844e-06, + "loss": 0.2076, + "step": 10561 + }, + { + "epoch": 0.8367597544068133, + "grad_norm": 1.2433805926004182, + "learning_rate": 1.3648730830783507e-06, + "loss": 0.2069, + "step": 10562 + }, + { + "epoch": 0.8368389780154486, + "grad_norm": 1.1183355725368211, + "learning_rate": 1.3635792480206744e-06, + "loss": 0.168, + "step": 10563 + }, + { + "epoch": 0.8369182016240839, + "grad_norm": 1.399026550174716, + "learning_rate": 1.3622859816297473e-06, + "loss": 0.2324, + "step": 10564 + }, + { + "epoch": 0.8369974252327194, + "grad_norm": 1.2565435085372032, + "learning_rate": 1.3609932839907281e-06, + "loss": 0.1816, + "step": 10565 + }, + { + "epoch": 0.8370766488413547, + "grad_norm": 1.4572249630085097, + "learning_rate": 1.3597011551887329e-06, + "loss": 0.2448, + "step": 10566 + }, + { + "epoch": 0.8371558724499901, + "grad_norm": 1.2350664164415799, + "learning_rate": 1.3584095953088405e-06, + "loss": 0.1937, + "step": 10567 + }, + { + "epoch": 0.8372350960586254, + "grad_norm": 0.9197494095088624, + "learning_rate": 1.3571186044360973e-06, + "loss": 0.1442, + "step": 10568 + }, + { + "epoch": 0.8373143196672609, + "grad_norm": 1.4518128342091754, + "learning_rate": 1.3558281826555065e-06, + "loss": 0.2605, + "step": 10569 + }, + { + "epoch": 0.8373935432758962, + "grad_norm": 2.0573082274836474, + "learning_rate": 1.3545383300520375e-06, + "loss": 0.3431, + "step": 10570 + }, + { + "epoch": 0.8374727668845315, + "grad_norm": 1.352905874637864, + "learning_rate": 1.3532490467106186e-06, + "loss": 0.2705, + "step": 10571 + }, + { + "epoch": 0.837551990493167, + "grad_norm": 1.3023197662109454, + "learning_rate": 1.3519603327161456e-06, + "loss": 0.2966, + "step": 10572 + }, + { + "epoch": 0.8376312141018023, + "grad_norm": 1.2107913018479692, + "learning_rate": 1.3506721881534734e-06, + "loss": 0.1523, + "step": 10573 + }, + { + "epoch": 0.8377104377104377, + "grad_norm": 1.132899724751293, + "learning_rate": 1.3493846131074173e-06, + "loss": 0.1437, + "step": 10574 + }, + { + "epoch": 0.8377896613190731, + "grad_norm": 1.5194909336116953, + "learning_rate": 1.3480976076627617e-06, + "loss": 0.2769, + "step": 10575 + }, + { + "epoch": 0.8378688849277085, + "grad_norm": 1.3932035569980017, + "learning_rate": 1.3468111719042497e-06, + "loss": 0.2719, + "step": 10576 + }, + { + "epoch": 0.8379481085363438, + "grad_norm": 1.035175483166696, + "learning_rate": 1.345525305916583e-06, + "loss": 0.1058, + "step": 10577 + }, + { + "epoch": 0.8380273321449792, + "grad_norm": 1.0872354805156903, + "learning_rate": 1.3442400097844344e-06, + "loss": 0.1681, + "step": 10578 + }, + { + "epoch": 0.8381065557536146, + "grad_norm": 1.201323635196354, + "learning_rate": 1.342955283592432e-06, + "loss": 0.1694, + "step": 10579 + }, + { + "epoch": 0.8381857793622499, + "grad_norm": 1.4367099800645067, + "learning_rate": 1.3416711274251671e-06, + "loss": 0.2622, + "step": 10580 + }, + { + "epoch": 0.8382650029708854, + "grad_norm": 1.1950444400949247, + "learning_rate": 1.3403875413671997e-06, + "loss": 0.2185, + "step": 10581 + }, + { + "epoch": 0.8383442265795207, + "grad_norm": 1.398346293874672, + "learning_rate": 1.3391045255030444e-06, + "loss": 0.1955, + "step": 10582 + }, + { + "epoch": 0.8384234501881561, + "grad_norm": 1.1508050543784236, + "learning_rate": 1.3378220799171815e-06, + "loss": 0.1867, + "step": 10583 + }, + { + "epoch": 0.8385026737967914, + "grad_norm": 1.4765358007275802, + "learning_rate": 1.3365402046940569e-06, + "loss": 0.2996, + "step": 10584 + }, + { + "epoch": 0.8385818974054268, + "grad_norm": 1.3182695830935403, + "learning_rate": 1.3352588999180726e-06, + "loss": 0.2307, + "step": 10585 + }, + { + "epoch": 0.8386611210140622, + "grad_norm": 1.394954662121768, + "learning_rate": 1.3339781656735995e-06, + "loss": 0.2425, + "step": 10586 + }, + { + "epoch": 0.8387403446226975, + "grad_norm": 1.2853709706661085, + "learning_rate": 1.3326980020449621e-06, + "loss": 0.1848, + "step": 10587 + }, + { + "epoch": 0.838819568231333, + "grad_norm": 1.445014214656466, + "learning_rate": 1.3314184091164605e-06, + "loss": 0.2581, + "step": 10588 + }, + { + "epoch": 0.8388987918399683, + "grad_norm": 1.5026310491922557, + "learning_rate": 1.3301393869723457e-06, + "loss": 0.2612, + "step": 10589 + }, + { + "epoch": 0.8389780154486037, + "grad_norm": 1.664314382806888, + "learning_rate": 1.328860935696833e-06, + "loss": 0.2179, + "step": 10590 + }, + { + "epoch": 0.8390572390572391, + "grad_norm": 1.0423492993377401, + "learning_rate": 1.3275830553741066e-06, + "loss": 0.1443, + "step": 10591 + }, + { + "epoch": 0.8391364626658744, + "grad_norm": 1.205682938725894, + "learning_rate": 1.3263057460883078e-06, + "loss": 0.1595, + "step": 10592 + }, + { + "epoch": 0.8392156862745098, + "grad_norm": 1.1464046454390402, + "learning_rate": 1.3250290079235383e-06, + "loss": 0.2173, + "step": 10593 + }, + { + "epoch": 0.8392949098831451, + "grad_norm": 1.3118975841036833, + "learning_rate": 1.3237528409638688e-06, + "loss": 0.2052, + "step": 10594 + }, + { + "epoch": 0.8393741334917806, + "grad_norm": 1.666342912248758, + "learning_rate": 1.3224772452933277e-06, + "loss": 0.2952, + "step": 10595 + }, + { + "epoch": 0.8394533571004159, + "grad_norm": 1.296324809678496, + "learning_rate": 1.321202220995904e-06, + "loss": 0.2191, + "step": 10596 + }, + { + "epoch": 0.8395325807090513, + "grad_norm": 0.9781787700115439, + "learning_rate": 1.3199277681555578e-06, + "loss": 0.171, + "step": 10597 + }, + { + "epoch": 0.8396118043176867, + "grad_norm": 1.3557343932237587, + "learning_rate": 1.3186538868562004e-06, + "loss": 0.216, + "step": 10598 + }, + { + "epoch": 0.839691027926322, + "grad_norm": 1.0757046240751333, + "learning_rate": 1.3173805771817138e-06, + "loss": 0.14, + "step": 10599 + }, + { + "epoch": 0.8397702515349574, + "grad_norm": 1.3844027667864627, + "learning_rate": 1.3161078392159355e-06, + "loss": 0.2697, + "step": 10600 + }, + { + "epoch": 0.8398494751435928, + "grad_norm": 1.2368137242644355, + "learning_rate": 1.3148356730426737e-06, + "loss": 0.1566, + "step": 10601 + }, + { + "epoch": 0.8399286987522282, + "grad_norm": 1.475173582252917, + "learning_rate": 1.3135640787456926e-06, + "loss": 0.2251, + "step": 10602 + }, + { + "epoch": 0.8400079223608635, + "grad_norm": 1.238176460589784, + "learning_rate": 1.312293056408719e-06, + "loss": 0.2369, + "step": 10603 + }, + { + "epoch": 0.840087145969499, + "grad_norm": 1.327112369471202, + "learning_rate": 1.3110226061154462e-06, + "loss": 0.2565, + "step": 10604 + }, + { + "epoch": 0.8401663695781343, + "grad_norm": 1.0032639526775977, + "learning_rate": 1.309752727949527e-06, + "loss": 0.13, + "step": 10605 + }, + { + "epoch": 0.8402455931867696, + "grad_norm": 0.9827589705352223, + "learning_rate": 1.3084834219945731e-06, + "loss": 0.1778, + "step": 10606 + }, + { + "epoch": 0.840324816795405, + "grad_norm": 1.2948837368552348, + "learning_rate": 1.3072146883341675e-06, + "loss": 0.2109, + "step": 10607 + }, + { + "epoch": 0.8404040404040404, + "grad_norm": 1.3589571744552542, + "learning_rate": 1.3059465270518469e-06, + "loss": 0.2164, + "step": 10608 + }, + { + "epoch": 0.8404832640126758, + "grad_norm": 1.3001982952294369, + "learning_rate": 1.3046789382311132e-06, + "loss": 0.1718, + "step": 10609 + }, + { + "epoch": 0.8405624876213111, + "grad_norm": 1.273119143220718, + "learning_rate": 1.3034119219554341e-06, + "loss": 0.1829, + "step": 10610 + }, + { + "epoch": 0.8406417112299466, + "grad_norm": 0.9759152295223514, + "learning_rate": 1.3021454783082344e-06, + "loss": 0.1378, + "step": 10611 + }, + { + "epoch": 0.8407209348385819, + "grad_norm": 1.405025925539301, + "learning_rate": 1.3008796073729013e-06, + "loss": 0.2789, + "step": 10612 + }, + { + "epoch": 0.8408001584472172, + "grad_norm": 1.076075224282842, + "learning_rate": 1.2996143092327906e-06, + "loss": 0.1881, + "step": 10613 + }, + { + "epoch": 0.8408793820558527, + "grad_norm": 1.9342460123087852, + "learning_rate": 1.2983495839712146e-06, + "loss": 0.399, + "step": 10614 + }, + { + "epoch": 0.840958605664488, + "grad_norm": 1.663021999341558, + "learning_rate": 1.2970854316714477e-06, + "loss": 0.2585, + "step": 10615 + }, + { + "epoch": 0.8410378292731234, + "grad_norm": 1.2706319104530757, + "learning_rate": 1.2958218524167288e-06, + "loss": 0.1883, + "step": 10616 + }, + { + "epoch": 0.8411170528817588, + "grad_norm": 1.3156460761936768, + "learning_rate": 1.2945588462902603e-06, + "loss": 0.2403, + "step": 10617 + }, + { + "epoch": 0.8411962764903942, + "grad_norm": 1.1583838390951062, + "learning_rate": 1.2932964133752036e-06, + "loss": 0.2221, + "step": 10618 + }, + { + "epoch": 0.8412755000990295, + "grad_norm": 1.3189785389125046, + "learning_rate": 1.292034553754683e-06, + "loss": 0.1825, + "step": 10619 + }, + { + "epoch": 0.8413547237076648, + "grad_norm": 1.4760290426911606, + "learning_rate": 1.2907732675117878e-06, + "loss": 0.2502, + "step": 10620 + }, + { + "epoch": 0.8414339473163003, + "grad_norm": 1.0927255835844973, + "learning_rate": 1.2895125547295672e-06, + "loss": 0.166, + "step": 10621 + }, + { + "epoch": 0.8415131709249356, + "grad_norm": 1.4071028198388331, + "learning_rate": 1.2882524154910314e-06, + "loss": 0.2524, + "step": 10622 + }, + { + "epoch": 0.841592394533571, + "grad_norm": 1.2915084599875934, + "learning_rate": 1.2869928498791572e-06, + "loss": 0.1916, + "step": 10623 + }, + { + "epoch": 0.8416716181422064, + "grad_norm": 1.093551277096425, + "learning_rate": 1.2857338579768796e-06, + "loss": 0.1659, + "step": 10624 + }, + { + "epoch": 0.8417508417508418, + "grad_norm": 1.5699565818615442, + "learning_rate": 1.2844754398670954e-06, + "loss": 0.2519, + "step": 10625 + }, + { + "epoch": 0.8418300653594771, + "grad_norm": 1.2934471388179427, + "learning_rate": 1.2832175956326686e-06, + "loss": 0.1598, + "step": 10626 + }, + { + "epoch": 0.8419092889681125, + "grad_norm": 1.4060391609400043, + "learning_rate": 1.2819603253564206e-06, + "loss": 0.2468, + "step": 10627 + }, + { + "epoch": 0.8419885125767479, + "grad_norm": 1.5633422596616775, + "learning_rate": 1.280703629121135e-06, + "loss": 0.25, + "step": 10628 + }, + { + "epoch": 0.8420677361853832, + "grad_norm": 1.5726215066220275, + "learning_rate": 1.2794475070095624e-06, + "loss": 0.2799, + "step": 10629 + }, + { + "epoch": 0.8421469597940187, + "grad_norm": 1.62906028959627, + "learning_rate": 1.2781919591044113e-06, + "loss": 0.3179, + "step": 10630 + }, + { + "epoch": 0.842226183402654, + "grad_norm": 1.338619486709502, + "learning_rate": 1.2769369854883528e-06, + "loss": 0.1927, + "step": 10631 + }, + { + "epoch": 0.8423054070112893, + "grad_norm": 1.3976863179057673, + "learning_rate": 1.2756825862440192e-06, + "loss": 0.2424, + "step": 10632 + }, + { + "epoch": 0.8423846306199247, + "grad_norm": 1.304948150923471, + "learning_rate": 1.2744287614540108e-06, + "loss": 0.2239, + "step": 10633 + }, + { + "epoch": 0.8424638542285601, + "grad_norm": 1.2964213021949842, + "learning_rate": 1.2731755112008838e-06, + "loss": 0.2133, + "step": 10634 + }, + { + "epoch": 0.8425430778371955, + "grad_norm": 1.2249409427594202, + "learning_rate": 1.2719228355671576e-06, + "loss": 0.1603, + "step": 10635 + }, + { + "epoch": 0.8426223014458308, + "grad_norm": 1.103714235205618, + "learning_rate": 1.2706707346353165e-06, + "loss": 0.1268, + "step": 10636 + }, + { + "epoch": 0.8427015250544663, + "grad_norm": 1.0457767186940763, + "learning_rate": 1.2694192084878032e-06, + "loss": 0.1748, + "step": 10637 + }, + { + "epoch": 0.8427807486631016, + "grad_norm": 1.4361624679253757, + "learning_rate": 1.2681682572070275e-06, + "loss": 0.2433, + "step": 10638 + }, + { + "epoch": 0.8428599722717369, + "grad_norm": 1.2751138282489736, + "learning_rate": 1.2669178808753568e-06, + "loss": 0.2055, + "step": 10639 + }, + { + "epoch": 0.8429391958803724, + "grad_norm": 1.4395160062166592, + "learning_rate": 1.265668079575124e-06, + "loss": 0.2665, + "step": 10640 + }, + { + "epoch": 0.8430184194890077, + "grad_norm": 1.0921868742369334, + "learning_rate": 1.264418853388618e-06, + "loss": 0.1701, + "step": 10641 + }, + { + "epoch": 0.8430976430976431, + "grad_norm": 1.1583797119288237, + "learning_rate": 1.2631702023980997e-06, + "loss": 0.1385, + "step": 10642 + }, + { + "epoch": 0.8431768667062784, + "grad_norm": 0.9456458598052674, + "learning_rate": 1.2619221266857851e-06, + "loss": 0.132, + "step": 10643 + }, + { + "epoch": 0.8432560903149139, + "grad_norm": 1.2293824359425345, + "learning_rate": 1.260674626333851e-06, + "loss": 0.2095, + "step": 10644 + }, + { + "epoch": 0.8433353139235492, + "grad_norm": 1.560850569085244, + "learning_rate": 1.259427701424445e-06, + "loss": 0.2813, + "step": 10645 + }, + { + "epoch": 0.8434145375321845, + "grad_norm": 1.1257942561351288, + "learning_rate": 1.2581813520396668e-06, + "loss": 0.1729, + "step": 10646 + }, + { + "epoch": 0.84349376114082, + "grad_norm": 1.3370231647024697, + "learning_rate": 1.256935578261581e-06, + "loss": 0.1739, + "step": 10647 + }, + { + "epoch": 0.8435729847494553, + "grad_norm": 1.6987855992331826, + "learning_rate": 1.255690380172222e-06, + "loss": 0.2112, + "step": 10648 + }, + { + "epoch": 0.8436522083580907, + "grad_norm": 1.7299513439146876, + "learning_rate": 1.2544457578535764e-06, + "loss": 0.3077, + "step": 10649 + }, + { + "epoch": 0.8437314319667261, + "grad_norm": 1.1309325695999808, + "learning_rate": 1.253201711387594e-06, + "loss": 0.1584, + "step": 10650 + }, + { + "epoch": 0.8438106555753615, + "grad_norm": 1.268727736909756, + "learning_rate": 1.2519582408561936e-06, + "loss": 0.2085, + "step": 10651 + }, + { + "epoch": 0.8438898791839968, + "grad_norm": 1.0684854951617462, + "learning_rate": 1.2507153463412513e-06, + "loss": 0.1463, + "step": 10652 + }, + { + "epoch": 0.8439691027926322, + "grad_norm": 1.1138381582997856, + "learning_rate": 1.2494730279246014e-06, + "loss": 0.1918, + "step": 10653 + }, + { + "epoch": 0.8440483264012676, + "grad_norm": 1.1550342154532884, + "learning_rate": 1.2482312856880506e-06, + "loss": 0.1768, + "step": 10654 + }, + { + "epoch": 0.8441275500099029, + "grad_norm": 1.4054152283280172, + "learning_rate": 1.2469901197133582e-06, + "loss": 0.2941, + "step": 10655 + }, + { + "epoch": 0.8442067736185384, + "grad_norm": 1.3322906002812531, + "learning_rate": 1.2457495300822497e-06, + "loss": 0.2482, + "step": 10656 + }, + { + "epoch": 0.8442859972271737, + "grad_norm": 1.1861196564050724, + "learning_rate": 1.244509516876411e-06, + "loss": 0.1405, + "step": 10657 + }, + { + "epoch": 0.8443652208358091, + "grad_norm": 1.4745334860123076, + "learning_rate": 1.2432700801774923e-06, + "loss": 0.3082, + "step": 10658 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 1.4664337802225427, + "learning_rate": 1.2420312200671048e-06, + "loss": 0.2245, + "step": 10659 + }, + { + "epoch": 0.8445236680530798, + "grad_norm": 1.4975500575346707, + "learning_rate": 1.240792936626819e-06, + "loss": 0.2828, + "step": 10660 + }, + { + "epoch": 0.8446028916617152, + "grad_norm": 1.3022236180420792, + "learning_rate": 1.2395552299381742e-06, + "loss": 0.2191, + "step": 10661 + }, + { + "epoch": 0.8446821152703505, + "grad_norm": 1.10817952657081, + "learning_rate": 1.238318100082664e-06, + "loss": 0.1236, + "step": 10662 + }, + { + "epoch": 0.844761338878986, + "grad_norm": 1.4630776430584254, + "learning_rate": 1.2370815471417464e-06, + "loss": 0.307, + "step": 10663 + }, + { + "epoch": 0.8448405624876213, + "grad_norm": 1.1510462680703422, + "learning_rate": 1.2358455711968463e-06, + "loss": 0.1879, + "step": 10664 + }, + { + "epoch": 0.8449197860962567, + "grad_norm": 1.2881976419268701, + "learning_rate": 1.2346101723293457e-06, + "loss": 0.2054, + "step": 10665 + }, + { + "epoch": 0.8449990097048921, + "grad_norm": 1.0295500437155463, + "learning_rate": 1.233375350620587e-06, + "loss": 0.1668, + "step": 10666 + }, + { + "epoch": 0.8450782333135274, + "grad_norm": 1.2843133297267746, + "learning_rate": 1.2321411061518807e-06, + "loss": 0.1966, + "step": 10667 + }, + { + "epoch": 0.8451574569221628, + "grad_norm": 1.6546456667167602, + "learning_rate": 1.2309074390044939e-06, + "loss": 0.2458, + "step": 10668 + }, + { + "epoch": 0.8452366805307981, + "grad_norm": 1.3914471160023159, + "learning_rate": 1.2296743492596587e-06, + "loss": 0.1884, + "step": 10669 + }, + { + "epoch": 0.8453159041394336, + "grad_norm": 1.3961436854037759, + "learning_rate": 1.2284418369985651e-06, + "loss": 0.2313, + "step": 10670 + }, + { + "epoch": 0.8453951277480689, + "grad_norm": 1.3242315802655225, + "learning_rate": 1.227209902302372e-06, + "loss": 0.1847, + "step": 10671 + }, + { + "epoch": 0.8454743513567043, + "grad_norm": 1.3552325434137533, + "learning_rate": 1.2259785452521956e-06, + "loss": 0.2196, + "step": 10672 + }, + { + "epoch": 0.8455535749653397, + "grad_norm": 1.2257620498050938, + "learning_rate": 1.2247477659291118e-06, + "loss": 0.2352, + "step": 10673 + }, + { + "epoch": 0.845632798573975, + "grad_norm": 1.196068629668826, + "learning_rate": 1.223517564414166e-06, + "loss": 0.1463, + "step": 10674 + }, + { + "epoch": 0.8457120221826104, + "grad_norm": 1.2538461054055066, + "learning_rate": 1.2222879407883592e-06, + "loss": 0.2, + "step": 10675 + }, + { + "epoch": 0.8457912457912458, + "grad_norm": 1.3831341506769856, + "learning_rate": 1.2210588951326542e-06, + "loss": 0.282, + "step": 10676 + }, + { + "epoch": 0.8458704693998812, + "grad_norm": 1.542175791200493, + "learning_rate": 1.2198304275279805e-06, + "loss": 0.2721, + "step": 10677 + }, + { + "epoch": 0.8459496930085165, + "grad_norm": 1.6004554091306153, + "learning_rate": 1.2186025380552259e-06, + "loss": 0.276, + "step": 10678 + }, + { + "epoch": 0.846028916617152, + "grad_norm": 1.6496607656213018, + "learning_rate": 1.2173752267952376e-06, + "loss": 0.3147, + "step": 10679 + }, + { + "epoch": 0.8461081402257873, + "grad_norm": 1.2859315189884697, + "learning_rate": 1.2161484938288348e-06, + "loss": 0.2388, + "step": 10680 + }, + { + "epoch": 0.8461873638344226, + "grad_norm": 1.4647078284338793, + "learning_rate": 1.214922339236788e-06, + "loss": 0.2573, + "step": 10681 + }, + { + "epoch": 0.846266587443058, + "grad_norm": 1.1542581593233001, + "learning_rate": 1.213696763099832e-06, + "loss": 0.187, + "step": 10682 + }, + { + "epoch": 0.8463458110516934, + "grad_norm": 1.2747255009741134, + "learning_rate": 1.2124717654986695e-06, + "loss": 0.1758, + "step": 10683 + }, + { + "epoch": 0.8464250346603288, + "grad_norm": 1.6450273815709602, + "learning_rate": 1.2112473465139586e-06, + "loss": 0.3133, + "step": 10684 + }, + { + "epoch": 0.8465042582689641, + "grad_norm": 1.7697703806848302, + "learning_rate": 1.210023506226321e-06, + "loss": 0.2668, + "step": 10685 + }, + { + "epoch": 0.8465834818775996, + "grad_norm": 1.4710091514712196, + "learning_rate": 1.2088002447163383e-06, + "loss": 0.2547, + "step": 10686 + }, + { + "epoch": 0.8466627054862349, + "grad_norm": 1.2707976982564826, + "learning_rate": 1.2075775620645613e-06, + "loss": 0.1906, + "step": 10687 + }, + { + "epoch": 0.8467419290948702, + "grad_norm": 1.3854969085595397, + "learning_rate": 1.2063554583514947e-06, + "loss": 0.2446, + "step": 10688 + }, + { + "epoch": 0.8468211527035057, + "grad_norm": 1.1481183761858849, + "learning_rate": 1.2051339336576074e-06, + "loss": 0.1665, + "step": 10689 + }, + { + "epoch": 0.846900376312141, + "grad_norm": 1.2568425692457097, + "learning_rate": 1.203912988063335e-06, + "loss": 0.2257, + "step": 10690 + }, + { + "epoch": 0.8469795999207764, + "grad_norm": 1.4102680321309617, + "learning_rate": 1.2026926216490675e-06, + "loss": 0.2875, + "step": 10691 + }, + { + "epoch": 0.8470588235294118, + "grad_norm": 1.339973681016876, + "learning_rate": 1.2014728344951587e-06, + "loss": 0.2373, + "step": 10692 + }, + { + "epoch": 0.8471380471380472, + "grad_norm": 1.1499839158549152, + "learning_rate": 1.2002536266819309e-06, + "loss": 0.1823, + "step": 10693 + }, + { + "epoch": 0.8472172707466825, + "grad_norm": 1.794664091498243, + "learning_rate": 1.1990349982896598e-06, + "loss": 0.3417, + "step": 10694 + }, + { + "epoch": 0.8472964943553178, + "grad_norm": 1.1466486229571127, + "learning_rate": 1.1978169493985836e-06, + "loss": 0.2294, + "step": 10695 + }, + { + "epoch": 0.8473757179639533, + "grad_norm": 1.1792678650295658, + "learning_rate": 1.1965994800889113e-06, + "loss": 0.2056, + "step": 10696 + }, + { + "epoch": 0.8474549415725886, + "grad_norm": 1.4172078551720109, + "learning_rate": 1.1953825904408033e-06, + "loss": 0.2114, + "step": 10697 + }, + { + "epoch": 0.847534165181224, + "grad_norm": 1.557836188046826, + "learning_rate": 1.1941662805343846e-06, + "loss": 0.2279, + "step": 10698 + }, + { + "epoch": 0.8476133887898594, + "grad_norm": 1.1792012710797648, + "learning_rate": 1.1929505504497464e-06, + "loss": 0.2183, + "step": 10699 + }, + { + "epoch": 0.8476926123984948, + "grad_norm": 1.6411063481845027, + "learning_rate": 1.191735400266939e-06, + "loss": 0.2786, + "step": 10700 + }, + { + "epoch": 0.8477718360071301, + "grad_norm": 1.6415613835517755, + "learning_rate": 1.190520830065972e-06, + "loss": 0.2377, + "step": 10701 + }, + { + "epoch": 0.8478510596157655, + "grad_norm": 1.0543882664512312, + "learning_rate": 1.189306839926818e-06, + "loss": 0.1523, + "step": 10702 + }, + { + "epoch": 0.8479302832244009, + "grad_norm": 1.1176252181289879, + "learning_rate": 1.1880934299294167e-06, + "loss": 0.1836, + "step": 10703 + }, + { + "epoch": 0.8480095068330362, + "grad_norm": 1.3566949125195122, + "learning_rate": 1.1868806001536625e-06, + "loss": 0.2121, + "step": 10704 + }, + { + "epoch": 0.8480887304416717, + "grad_norm": 1.1986825866417639, + "learning_rate": 1.185668350679413e-06, + "loss": 0.1572, + "step": 10705 + }, + { + "epoch": 0.848167954050307, + "grad_norm": 1.3463912639868627, + "learning_rate": 1.1844566815864921e-06, + "loss": 0.1892, + "step": 10706 + }, + { + "epoch": 0.8482471776589424, + "grad_norm": 1.0282939041168102, + "learning_rate": 1.1832455929546827e-06, + "loss": 0.1428, + "step": 10707 + }, + { + "epoch": 0.8483264012675777, + "grad_norm": 1.469974731763603, + "learning_rate": 1.182035084863724e-06, + "loss": 0.2123, + "step": 10708 + }, + { + "epoch": 0.8484056248762131, + "grad_norm": 1.2432520620284864, + "learning_rate": 1.1808251573933272e-06, + "loss": 0.2121, + "step": 10709 + }, + { + "epoch": 0.8484848484848485, + "grad_norm": 1.3296086953019453, + "learning_rate": 1.1796158106231603e-06, + "loss": 0.2026, + "step": 10710 + }, + { + "epoch": 0.8485640720934838, + "grad_norm": 1.020753444595949, + "learning_rate": 1.1784070446328477e-06, + "loss": 0.132, + "step": 10711 + }, + { + "epoch": 0.8486432957021193, + "grad_norm": 1.5155898658947995, + "learning_rate": 1.177198859501989e-06, + "loss": 0.2387, + "step": 10712 + }, + { + "epoch": 0.8487225193107546, + "grad_norm": 1.3879226373955473, + "learning_rate": 1.1759912553101316e-06, + "loss": 0.2754, + "step": 10713 + }, + { + "epoch": 0.8488017429193899, + "grad_norm": 1.0574272218745024, + "learning_rate": 1.1747842321367886e-06, + "loss": 0.1898, + "step": 10714 + }, + { + "epoch": 0.8488809665280254, + "grad_norm": 1.0685039234016367, + "learning_rate": 1.173577790061442e-06, + "loss": 0.2158, + "step": 10715 + }, + { + "epoch": 0.8489601901366607, + "grad_norm": 1.4135611138027584, + "learning_rate": 1.1723719291635272e-06, + "loss": 0.2401, + "step": 10716 + }, + { + "epoch": 0.8490394137452961, + "grad_norm": 1.5241627760829384, + "learning_rate": 1.171166649522444e-06, + "loss": 0.2714, + "step": 10717 + }, + { + "epoch": 0.8491186373539314, + "grad_norm": 1.3776374019313784, + "learning_rate": 1.1699619512175563e-06, + "loss": 0.2293, + "step": 10718 + }, + { + "epoch": 0.8491978609625669, + "grad_norm": 1.3711509282708116, + "learning_rate": 1.168757834328188e-06, + "loss": 0.2008, + "step": 10719 + }, + { + "epoch": 0.8492770845712022, + "grad_norm": 1.858661881400827, + "learning_rate": 1.1675542989336208e-06, + "loss": 0.2804, + "step": 10720 + }, + { + "epoch": 0.8493563081798375, + "grad_norm": 1.3261155102879394, + "learning_rate": 1.1663513451131047e-06, + "loss": 0.2105, + "step": 10721 + }, + { + "epoch": 0.849435531788473, + "grad_norm": 1.1569439477608126, + "learning_rate": 1.1651489729458487e-06, + "loss": 0.1522, + "step": 10722 + }, + { + "epoch": 0.8495147553971083, + "grad_norm": 1.1642124932773206, + "learning_rate": 1.1639471825110205e-06, + "loss": 0.1896, + "step": 10723 + }, + { + "epoch": 0.8495939790057437, + "grad_norm": 1.3718722293412, + "learning_rate": 1.1627459738877557e-06, + "loss": 0.2307, + "step": 10724 + }, + { + "epoch": 0.8496732026143791, + "grad_norm": 1.4593997512460624, + "learning_rate": 1.1615453471551462e-06, + "loss": 0.2057, + "step": 10725 + }, + { + "epoch": 0.8497524262230145, + "grad_norm": 1.2000380410783604, + "learning_rate": 1.1603453023922473e-06, + "loss": 0.1736, + "step": 10726 + }, + { + "epoch": 0.8498316498316498, + "grad_norm": 1.6404312788650104, + "learning_rate": 1.1591458396780753e-06, + "loss": 0.3527, + "step": 10727 + }, + { + "epoch": 0.8499108734402852, + "grad_norm": 1.2913847806829117, + "learning_rate": 1.1579469590916125e-06, + "loss": 0.2476, + "step": 10728 + }, + { + "epoch": 0.8499900970489206, + "grad_norm": 1.3501326816108088, + "learning_rate": 1.156748660711796e-06, + "loss": 0.2673, + "step": 10729 + }, + { + "epoch": 0.8500693206575559, + "grad_norm": 1.7066732642213205, + "learning_rate": 1.1555509446175284e-06, + "loss": 0.3191, + "step": 10730 + }, + { + "epoch": 0.8501485442661914, + "grad_norm": 1.7246599741825346, + "learning_rate": 1.1543538108876751e-06, + "loss": 0.3213, + "step": 10731 + }, + { + "epoch": 0.8502277678748267, + "grad_norm": 1.365497405581261, + "learning_rate": 1.153157259601062e-06, + "loss": 0.2253, + "step": 10732 + }, + { + "epoch": 0.8503069914834621, + "grad_norm": 1.3103151829961655, + "learning_rate": 1.1519612908364718e-06, + "loss": 0.2409, + "step": 10733 + }, + { + "epoch": 0.8503862150920974, + "grad_norm": 0.9441718809920354, + "learning_rate": 1.1507659046726605e-06, + "loss": 0.1385, + "step": 10734 + }, + { + "epoch": 0.8504654387007328, + "grad_norm": 1.4687952641057023, + "learning_rate": 1.1495711011883325e-06, + "loss": 0.2359, + "step": 10735 + }, + { + "epoch": 0.8505446623093682, + "grad_norm": 1.4886311133744696, + "learning_rate": 1.148376880462161e-06, + "loss": 0.1923, + "step": 10736 + }, + { + "epoch": 0.8506238859180035, + "grad_norm": 1.20044250197672, + "learning_rate": 1.1471832425727825e-06, + "loss": 0.2101, + "step": 10737 + }, + { + "epoch": 0.850703109526639, + "grad_norm": 1.6085337752774558, + "learning_rate": 1.14599018759879e-06, + "loss": 0.2954, + "step": 10738 + }, + { + "epoch": 0.8507823331352743, + "grad_norm": 1.1242902995263895, + "learning_rate": 1.1447977156187395e-06, + "loss": 0.1499, + "step": 10739 + }, + { + "epoch": 0.8508615567439097, + "grad_norm": 1.2822827991080508, + "learning_rate": 1.1436058267111527e-06, + "loss": 0.192, + "step": 10740 + }, + { + "epoch": 0.8509407803525451, + "grad_norm": 1.2429910285604242, + "learning_rate": 1.1424145209545079e-06, + "loss": 0.2398, + "step": 10741 + }, + { + "epoch": 0.8510200039611804, + "grad_norm": 1.1699639202459249, + "learning_rate": 1.1412237984272467e-06, + "loss": 0.146, + "step": 10742 + }, + { + "epoch": 0.8510992275698158, + "grad_norm": 1.3107355181733198, + "learning_rate": 1.140033659207771e-06, + "loss": 0.1699, + "step": 10743 + }, + { + "epoch": 0.8511784511784511, + "grad_norm": 1.665860324643064, + "learning_rate": 1.1388441033744502e-06, + "loss": 0.2261, + "step": 10744 + }, + { + "epoch": 0.8512576747870866, + "grad_norm": 1.325005151797313, + "learning_rate": 1.1376551310056073e-06, + "loss": 0.2353, + "step": 10745 + }, + { + "epoch": 0.8513368983957219, + "grad_norm": 1.1877119154206783, + "learning_rate": 1.1364667421795283e-06, + "loss": 0.2207, + "step": 10746 + }, + { + "epoch": 0.8514161220043573, + "grad_norm": 1.1700524083376647, + "learning_rate": 1.1352789369744688e-06, + "loss": 0.1673, + "step": 10747 + }, + { + "epoch": 0.8514953456129927, + "grad_norm": 1.3556005380129177, + "learning_rate": 1.134091715468636e-06, + "loss": 0.2347, + "step": 10748 + }, + { + "epoch": 0.851574569221628, + "grad_norm": 1.640276486104036, + "learning_rate": 1.132905077740203e-06, + "loss": 0.2507, + "step": 10749 + }, + { + "epoch": 0.8516537928302634, + "grad_norm": 1.938001337935845, + "learning_rate": 1.131719023867306e-06, + "loss": 0.2857, + "step": 10750 + }, + { + "epoch": 0.8517330164388988, + "grad_norm": 1.346372329412587, + "learning_rate": 1.1305335539280392e-06, + "loss": 0.2302, + "step": 10751 + }, + { + "epoch": 0.8518122400475342, + "grad_norm": 1.209952336573794, + "learning_rate": 1.1293486680004607e-06, + "loss": 0.1839, + "step": 10752 + }, + { + "epoch": 0.8518914636561695, + "grad_norm": 1.5534520465790997, + "learning_rate": 1.1281643661625896e-06, + "loss": 0.2731, + "step": 10753 + }, + { + "epoch": 0.851970687264805, + "grad_norm": 1.4358241634171365, + "learning_rate": 1.1269806484924072e-06, + "loss": 0.2255, + "step": 10754 + }, + { + "epoch": 0.8520499108734403, + "grad_norm": 1.6615862166193753, + "learning_rate": 1.1257975150678557e-06, + "loss": 0.2497, + "step": 10755 + }, + { + "epoch": 0.8521291344820756, + "grad_norm": 1.6042851753083125, + "learning_rate": 1.124614965966835e-06, + "loss": 0.2753, + "step": 10756 + }, + { + "epoch": 0.852208358090711, + "grad_norm": 1.1296527761233508, + "learning_rate": 1.1234330012672146e-06, + "loss": 0.1435, + "step": 10757 + }, + { + "epoch": 0.8522875816993464, + "grad_norm": 1.4188548771779212, + "learning_rate": 1.1222516210468204e-06, + "loss": 0.2143, + "step": 10758 + }, + { + "epoch": 0.8523668053079818, + "grad_norm": 1.212223472158802, + "learning_rate": 1.121070825383438e-06, + "loss": 0.1516, + "step": 10759 + }, + { + "epoch": 0.8524460289166171, + "grad_norm": 1.1545776919341748, + "learning_rate": 1.1198906143548216e-06, + "loss": 0.1561, + "step": 10760 + }, + { + "epoch": 0.8525252525252526, + "grad_norm": 1.1155731830344275, + "learning_rate": 1.1187109880386794e-06, + "loss": 0.1724, + "step": 10761 + }, + { + "epoch": 0.8526044761338879, + "grad_norm": 1.2005002629477377, + "learning_rate": 1.117531946512682e-06, + "loss": 0.1787, + "step": 10762 + }, + { + "epoch": 0.8526836997425232, + "grad_norm": 1.2581184976662154, + "learning_rate": 1.1163534898544692e-06, + "loss": 0.2097, + "step": 10763 + }, + { + "epoch": 0.8527629233511587, + "grad_norm": 1.1736652767290678, + "learning_rate": 1.1151756181416328e-06, + "loss": 0.1914, + "step": 10764 + }, + { + "epoch": 0.852842146959794, + "grad_norm": 1.3530846440988304, + "learning_rate": 1.1139983314517288e-06, + "loss": 0.1443, + "step": 10765 + }, + { + "epoch": 0.8529213705684294, + "grad_norm": 1.096651598235833, + "learning_rate": 1.1128216298622808e-06, + "loss": 0.1463, + "step": 10766 + }, + { + "epoch": 0.8530005941770648, + "grad_norm": 1.2058003063193818, + "learning_rate": 1.1116455134507665e-06, + "loss": 0.1839, + "step": 10767 + }, + { + "epoch": 0.8530798177857002, + "grad_norm": 1.813538853319796, + "learning_rate": 1.110469982294624e-06, + "loss": 0.2721, + "step": 10768 + }, + { + "epoch": 0.8531590413943355, + "grad_norm": 1.1984717591463547, + "learning_rate": 1.1092950364712617e-06, + "loss": 0.1572, + "step": 10769 + }, + { + "epoch": 0.8532382650029708, + "grad_norm": 1.5130929658460166, + "learning_rate": 1.1081206760580422e-06, + "loss": 0.2544, + "step": 10770 + }, + { + "epoch": 0.8533174886116063, + "grad_norm": 1.32466074032802, + "learning_rate": 1.1069469011322908e-06, + "loss": 0.1893, + "step": 10771 + }, + { + "epoch": 0.8533967122202416, + "grad_norm": 1.2814813183187004, + "learning_rate": 1.1057737117712941e-06, + "loss": 0.239, + "step": 10772 + }, + { + "epoch": 0.853475935828877, + "grad_norm": 1.187383699768313, + "learning_rate": 1.1046011080523034e-06, + "loss": 0.1888, + "step": 10773 + }, + { + "epoch": 0.8535551594375124, + "grad_norm": 1.3382170301882963, + "learning_rate": 1.1034290900525279e-06, + "loss": 0.2381, + "step": 10774 + }, + { + "epoch": 0.8536343830461478, + "grad_norm": 1.0991829217487112, + "learning_rate": 1.1022576578491372e-06, + "loss": 0.1784, + "step": 10775 + }, + { + "epoch": 0.8537136066547831, + "grad_norm": 1.3932733986841541, + "learning_rate": 1.1010868115192696e-06, + "loss": 0.2344, + "step": 10776 + }, + { + "epoch": 0.8537928302634185, + "grad_norm": 1.259474974714991, + "learning_rate": 1.0999165511400157e-06, + "loss": 0.1932, + "step": 10777 + }, + { + "epoch": 0.8538720538720539, + "grad_norm": 1.3768474669474018, + "learning_rate": 1.09874687678843e-06, + "loss": 0.1795, + "step": 10778 + }, + { + "epoch": 0.8539512774806892, + "grad_norm": 1.5358201494201102, + "learning_rate": 1.097577788541535e-06, + "loss": 0.266, + "step": 10779 + }, + { + "epoch": 0.8540305010893247, + "grad_norm": 1.4309717707934504, + "learning_rate": 1.0964092864763065e-06, + "loss": 0.2343, + "step": 10780 + }, + { + "epoch": 0.85410972469796, + "grad_norm": 1.558988023293492, + "learning_rate": 1.095241370669684e-06, + "loss": 0.2719, + "step": 10781 + }, + { + "epoch": 0.8541889483065954, + "grad_norm": 0.9901205071029419, + "learning_rate": 1.0940740411985718e-06, + "loss": 0.155, + "step": 10782 + }, + { + "epoch": 0.8542681719152307, + "grad_norm": 1.2218146976745325, + "learning_rate": 1.0929072981398313e-06, + "loss": 0.1641, + "step": 10783 + }, + { + "epoch": 0.8543473955238661, + "grad_norm": 1.6441616583651038, + "learning_rate": 1.091741141570285e-06, + "loss": 0.2327, + "step": 10784 + }, + { + "epoch": 0.8544266191325015, + "grad_norm": 1.324895351116022, + "learning_rate": 1.0905755715667222e-06, + "loss": 0.2386, + "step": 10785 + }, + { + "epoch": 0.8545058427411368, + "grad_norm": 1.5434456112929702, + "learning_rate": 1.0894105882058891e-06, + "loss": 0.2321, + "step": 10786 + }, + { + "epoch": 0.8545850663497723, + "grad_norm": 1.4569959671375003, + "learning_rate": 1.0882461915644936e-06, + "loss": 0.187, + "step": 10787 + }, + { + "epoch": 0.8546642899584076, + "grad_norm": 1.6230809011020064, + "learning_rate": 1.0870823817192045e-06, + "loss": 0.3151, + "step": 10788 + }, + { + "epoch": 0.854743513567043, + "grad_norm": 1.159258523030389, + "learning_rate": 1.0859191587466556e-06, + "loss": 0.1643, + "step": 10789 + }, + { + "epoch": 0.8548227371756784, + "grad_norm": 1.3520949967532052, + "learning_rate": 1.0847565227234392e-06, + "loss": 0.214, + "step": 10790 + }, + { + "epoch": 0.8549019607843137, + "grad_norm": 1.308914513355494, + "learning_rate": 1.0835944737261072e-06, + "loss": 0.2241, + "step": 10791 + }, + { + "epoch": 0.8549811843929491, + "grad_norm": 1.3050524273336934, + "learning_rate": 1.0824330118311765e-06, + "loss": 0.2001, + "step": 10792 + }, + { + "epoch": 0.8550604080015844, + "grad_norm": 1.3552041608815961, + "learning_rate": 1.0812721371151213e-06, + "loss": 0.212, + "step": 10793 + }, + { + "epoch": 0.8551396316102199, + "grad_norm": 1.0439392340847686, + "learning_rate": 1.080111849654384e-06, + "loss": 0.1531, + "step": 10794 + }, + { + "epoch": 0.8552188552188552, + "grad_norm": 1.2531789351974598, + "learning_rate": 1.078952149525362e-06, + "loss": 0.193, + "step": 10795 + }, + { + "epoch": 0.8552980788274905, + "grad_norm": 1.1245551316692843, + "learning_rate": 1.0777930368044143e-06, + "loss": 0.1734, + "step": 10796 + }, + { + "epoch": 0.855377302436126, + "grad_norm": 1.4592722183116267, + "learning_rate": 1.0766345115678633e-06, + "loss": 0.2249, + "step": 10797 + }, + { + "epoch": 0.8554565260447613, + "grad_norm": 1.3809091395215236, + "learning_rate": 1.0754765738919947e-06, + "loss": 0.2493, + "step": 10798 + }, + { + "epoch": 0.8555357496533967, + "grad_norm": 1.3542926354254743, + "learning_rate": 1.074319223853052e-06, + "loss": 0.2727, + "step": 10799 + }, + { + "epoch": 0.8556149732620321, + "grad_norm": 1.1583382970650118, + "learning_rate": 1.0731624615272385e-06, + "loss": 0.1644, + "step": 10800 + }, + { + "epoch": 0.8556941968706675, + "grad_norm": 1.201177784010263, + "learning_rate": 1.0720062869907255e-06, + "loss": 0.1814, + "step": 10801 + }, + { + "epoch": 0.8557734204793028, + "grad_norm": 1.1681444882922902, + "learning_rate": 1.07085070031964e-06, + "loss": 0.1708, + "step": 10802 + }, + { + "epoch": 0.8558526440879382, + "grad_norm": 1.3391259673757796, + "learning_rate": 1.06969570159007e-06, + "loss": 0.2817, + "step": 10803 + }, + { + "epoch": 0.8559318676965736, + "grad_norm": 1.9584411057971687, + "learning_rate": 1.0685412908780702e-06, + "loss": 0.3552, + "step": 10804 + }, + { + "epoch": 0.8560110913052089, + "grad_norm": 1.4923312382295966, + "learning_rate": 1.0673874682596497e-06, + "loss": 0.321, + "step": 10805 + }, + { + "epoch": 0.8560903149138444, + "grad_norm": 1.0416914219625322, + "learning_rate": 1.0662342338107823e-06, + "loss": 0.1369, + "step": 10806 + }, + { + "epoch": 0.8561695385224797, + "grad_norm": 1.642057393170728, + "learning_rate": 1.065081587607406e-06, + "loss": 0.3597, + "step": 10807 + }, + { + "epoch": 0.8562487621311151, + "grad_norm": 1.1766502998251729, + "learning_rate": 1.0639295297254149e-06, + "loss": 0.1882, + "step": 10808 + }, + { + "epoch": 0.8563279857397504, + "grad_norm": 1.1971173970989457, + "learning_rate": 1.0627780602406656e-06, + "loss": 0.2041, + "step": 10809 + }, + { + "epoch": 0.8564072093483858, + "grad_norm": 1.1447713291366683, + "learning_rate": 1.061627179228979e-06, + "loss": 0.205, + "step": 10810 + }, + { + "epoch": 0.8564864329570212, + "grad_norm": 1.6830221584923502, + "learning_rate": 1.0604768867661342e-06, + "loss": 0.2666, + "step": 10811 + }, + { + "epoch": 0.8565656565656565, + "grad_norm": 1.5616123161453341, + "learning_rate": 1.0593271829278718e-06, + "loss": 0.3028, + "step": 10812 + }, + { + "epoch": 0.856644880174292, + "grad_norm": 1.456097720113925, + "learning_rate": 1.0581780677898924e-06, + "loss": 0.2219, + "step": 10813 + }, + { + "epoch": 0.8567241037829273, + "grad_norm": 1.3718421469481186, + "learning_rate": 1.0570295414278642e-06, + "loss": 0.2132, + "step": 10814 + }, + { + "epoch": 0.8568033273915627, + "grad_norm": 1.2548612172729923, + "learning_rate": 1.0558816039174102e-06, + "loss": 0.1891, + "step": 10815 + }, + { + "epoch": 0.8568825510001981, + "grad_norm": 1.2579132836272071, + "learning_rate": 1.0547342553341144e-06, + "loss": 0.2169, + "step": 10816 + }, + { + "epoch": 0.8569617746088334, + "grad_norm": 1.087808205142531, + "learning_rate": 1.0535874957535275e-06, + "loss": 0.1935, + "step": 10817 + }, + { + "epoch": 0.8570409982174688, + "grad_norm": 1.3092224128447962, + "learning_rate": 1.0524413252511567e-06, + "loss": 0.1711, + "step": 10818 + }, + { + "epoch": 0.8571202218261041, + "grad_norm": 1.239999444578251, + "learning_rate": 1.0512957439024697e-06, + "loss": 0.2013, + "step": 10819 + }, + { + "epoch": 0.8571994454347396, + "grad_norm": 1.3046931658884642, + "learning_rate": 1.0501507517829012e-06, + "loss": 0.185, + "step": 10820 + }, + { + "epoch": 0.8572786690433749, + "grad_norm": 1.3807128633644161, + "learning_rate": 1.0490063489678427e-06, + "loss": 0.2016, + "step": 10821 + }, + { + "epoch": 0.8573578926520103, + "grad_norm": 1.465615348103796, + "learning_rate": 1.0478625355326445e-06, + "loss": 0.2251, + "step": 10822 + }, + { + "epoch": 0.8574371162606457, + "grad_norm": 1.117290017974565, + "learning_rate": 1.0467193115526254e-06, + "loss": 0.1679, + "step": 10823 + }, + { + "epoch": 0.857516339869281, + "grad_norm": 1.4638689416898418, + "learning_rate": 1.0455766771030585e-06, + "loss": 0.2718, + "step": 10824 + }, + { + "epoch": 0.8575955634779164, + "grad_norm": 1.6226504992825306, + "learning_rate": 1.0444346322591804e-06, + "loss": 0.2856, + "step": 10825 + }, + { + "epoch": 0.8576747870865518, + "grad_norm": 1.5801297461942128, + "learning_rate": 1.0432931770961907e-06, + "loss": 0.2234, + "step": 10826 + }, + { + "epoch": 0.8577540106951872, + "grad_norm": 1.3818127549970014, + "learning_rate": 1.0421523116892496e-06, + "loss": 0.1918, + "step": 10827 + }, + { + "epoch": 0.8578332343038225, + "grad_norm": 1.2535384563720011, + "learning_rate": 1.0410120361134767e-06, + "loss": 0.1995, + "step": 10828 + }, + { + "epoch": 0.857912457912458, + "grad_norm": 1.4460981075519885, + "learning_rate": 1.0398723504439512e-06, + "loss": 0.2676, + "step": 10829 + }, + { + "epoch": 0.8579916815210933, + "grad_norm": 1.5803351343786376, + "learning_rate": 1.0387332547557194e-06, + "loss": 0.1971, + "step": 10830 + }, + { + "epoch": 0.8580709051297286, + "grad_norm": 1.4507286026817658, + "learning_rate": 1.0375947491237836e-06, + "loss": 0.2111, + "step": 10831 + }, + { + "epoch": 0.858150128738364, + "grad_norm": 1.4267644205594114, + "learning_rate": 1.0364568336231085e-06, + "loss": 0.247, + "step": 10832 + }, + { + "epoch": 0.8582293523469994, + "grad_norm": 1.1324892138545073, + "learning_rate": 1.0353195083286226e-06, + "loss": 0.1267, + "step": 10833 + }, + { + "epoch": 0.8583085759556348, + "grad_norm": 1.2884812199234947, + "learning_rate": 1.034182773315211e-06, + "loss": 0.2113, + "step": 10834 + }, + { + "epoch": 0.8583877995642701, + "grad_norm": 1.439443762761076, + "learning_rate": 1.0330466286577224e-06, + "loss": 0.2463, + "step": 10835 + }, + { + "epoch": 0.8584670231729056, + "grad_norm": 1.5279724390550258, + "learning_rate": 1.031911074430968e-06, + "loss": 0.2062, + "step": 10836 + }, + { + "epoch": 0.8585462467815409, + "grad_norm": 1.3292679260055595, + "learning_rate": 1.030776110709718e-06, + "loss": 0.2448, + "step": 10837 + }, + { + "epoch": 0.8586254703901762, + "grad_norm": 1.2214399069232247, + "learning_rate": 1.0296417375687017e-06, + "loss": 0.2057, + "step": 10838 + }, + { + "epoch": 0.8587046939988117, + "grad_norm": 1.1564201755316976, + "learning_rate": 1.0285079550826172e-06, + "loss": 0.2187, + "step": 10839 + }, + { + "epoch": 0.858783917607447, + "grad_norm": 1.2406853934459663, + "learning_rate": 1.0273747633261144e-06, + "loss": 0.1961, + "step": 10840 + }, + { + "epoch": 0.8588631412160824, + "grad_norm": 1.3732701770371192, + "learning_rate": 1.0262421623738105e-06, + "loss": 0.2393, + "step": 10841 + }, + { + "epoch": 0.8589423648247178, + "grad_norm": 1.5730219129167005, + "learning_rate": 1.0251101523002805e-06, + "loss": 0.1727, + "step": 10842 + }, + { + "epoch": 0.8590215884333532, + "grad_norm": 1.118679662629517, + "learning_rate": 1.0239787331800632e-06, + "loss": 0.158, + "step": 10843 + }, + { + "epoch": 0.8591008120419885, + "grad_norm": 1.1631830075625404, + "learning_rate": 1.022847905087656e-06, + "loss": 0.1615, + "step": 10844 + }, + { + "epoch": 0.8591800356506238, + "grad_norm": 1.6087535526755574, + "learning_rate": 1.0217176680975183e-06, + "loss": 0.2423, + "step": 10845 + }, + { + "epoch": 0.8592592592592593, + "grad_norm": 1.3370618555695546, + "learning_rate": 1.0205880222840726e-06, + "loss": 0.2383, + "step": 10846 + }, + { + "epoch": 0.8593384828678946, + "grad_norm": 1.3318962256469504, + "learning_rate": 1.0194589677216992e-06, + "loss": 0.185, + "step": 10847 + }, + { + "epoch": 0.85941770647653, + "grad_norm": 1.692345563064196, + "learning_rate": 1.0183305044847402e-06, + "loss": 0.2175, + "step": 10848 + }, + { + "epoch": 0.8594969300851654, + "grad_norm": 1.2978267033185473, + "learning_rate": 1.0172026326475016e-06, + "loss": 0.2141, + "step": 10849 + }, + { + "epoch": 0.8595761536938008, + "grad_norm": 1.2196388175762076, + "learning_rate": 1.0160753522842482e-06, + "loss": 0.231, + "step": 10850 + }, + { + "epoch": 0.8596553773024361, + "grad_norm": 1.6921766287279663, + "learning_rate": 1.0149486634692019e-06, + "loss": 0.3379, + "step": 10851 + }, + { + "epoch": 0.8597346009110715, + "grad_norm": 1.47423904976091, + "learning_rate": 1.0138225662765555e-06, + "loss": 0.2367, + "step": 10852 + }, + { + "epoch": 0.8598138245197069, + "grad_norm": 1.7374752846928612, + "learning_rate": 1.0126970607804532e-06, + "loss": 0.1667, + "step": 10853 + }, + { + "epoch": 0.8598930481283422, + "grad_norm": 1.0170818877706014, + "learning_rate": 1.0115721470550045e-06, + "loss": 0.1631, + "step": 10854 + }, + { + "epoch": 0.8599722717369777, + "grad_norm": 1.5170281319712673, + "learning_rate": 1.0104478251742822e-06, + "loss": 0.2714, + "step": 10855 + }, + { + "epoch": 0.860051495345613, + "grad_norm": 1.5913677264856463, + "learning_rate": 1.009324095212315e-06, + "loss": 0.2403, + "step": 10856 + }, + { + "epoch": 0.8601307189542484, + "grad_norm": 1.3704028689403447, + "learning_rate": 1.0082009572430963e-06, + "loss": 0.2547, + "step": 10857 + }, + { + "epoch": 0.8602099425628837, + "grad_norm": 1.4796370476274001, + "learning_rate": 1.0070784113405763e-06, + "loss": 0.2507, + "step": 10858 + }, + { + "epoch": 0.8602891661715191, + "grad_norm": 1.3055146840303782, + "learning_rate": 1.005956457578675e-06, + "loss": 0.1884, + "step": 10859 + }, + { + "epoch": 0.8603683897801545, + "grad_norm": 1.2035464800675986, + "learning_rate": 1.0048350960312637e-06, + "loss": 0.1684, + "step": 10860 + }, + { + "epoch": 0.8604476133887898, + "grad_norm": 1.1532555328066032, + "learning_rate": 1.003714326772176e-06, + "loss": 0.1463, + "step": 10861 + }, + { + "epoch": 0.8605268369974253, + "grad_norm": 1.3399555225281417, + "learning_rate": 1.0025941498752167e-06, + "loss": 0.2066, + "step": 10862 + }, + { + "epoch": 0.8606060606060606, + "grad_norm": 1.6773949674236435, + "learning_rate": 1.001474565414139e-06, + "loss": 0.2937, + "step": 10863 + }, + { + "epoch": 0.860685284214696, + "grad_norm": 0.9738684746189048, + "learning_rate": 1.0003555734626603e-06, + "loss": 0.1537, + "step": 10864 + }, + { + "epoch": 0.8607645078233314, + "grad_norm": 1.1208106012646089, + "learning_rate": 9.992371740944663e-07, + "loss": 0.1972, + "step": 10865 + }, + { + "epoch": 0.8608437314319667, + "grad_norm": 1.0273624656252895, + "learning_rate": 9.981193673831946e-07, + "loss": 0.1622, + "step": 10866 + }, + { + "epoch": 0.8609229550406021, + "grad_norm": 1.6657120843540014, + "learning_rate": 9.970021534024476e-07, + "loss": 0.2752, + "step": 10867 + }, + { + "epoch": 0.8610021786492374, + "grad_norm": 1.1879822947993006, + "learning_rate": 9.958855322257922e-07, + "loss": 0.2283, + "step": 10868 + }, + { + "epoch": 0.8610814022578729, + "grad_norm": 1.41161644193935, + "learning_rate": 9.94769503926748e-07, + "loss": 0.2205, + "step": 10869 + }, + { + "epoch": 0.8611606258665082, + "grad_norm": 0.9753072853180768, + "learning_rate": 9.936540685787998e-07, + "loss": 0.1499, + "step": 10870 + }, + { + "epoch": 0.8612398494751435, + "grad_norm": 1.3430045506099497, + "learning_rate": 9.925392262553968e-07, + "loss": 0.2134, + "step": 10871 + }, + { + "epoch": 0.861319073083779, + "grad_norm": 1.319930473486728, + "learning_rate": 9.914249770299445e-07, + "loss": 0.1776, + "step": 10872 + }, + { + "epoch": 0.8613982966924143, + "grad_norm": 1.1837601979409187, + "learning_rate": 9.903113209758098e-07, + "loss": 0.2013, + "step": 10873 + }, + { + "epoch": 0.8614775203010497, + "grad_norm": 1.1854408675832382, + "learning_rate": 9.89198258166324e-07, + "loss": 0.2166, + "step": 10874 + }, + { + "epoch": 0.8615567439096851, + "grad_norm": 1.6716272760210298, + "learning_rate": 9.880857886747753e-07, + "loss": 0.2477, + "step": 10875 + }, + { + "epoch": 0.8616359675183205, + "grad_norm": 1.7497373625726915, + "learning_rate": 9.869739125744138e-07, + "loss": 0.2253, + "step": 10876 + }, + { + "epoch": 0.8617151911269558, + "grad_norm": 1.3116517658669882, + "learning_rate": 9.858626299384532e-07, + "loss": 0.193, + "step": 10877 + }, + { + "epoch": 0.8617944147355912, + "grad_norm": 1.3303498927127198, + "learning_rate": 9.847519408400663e-07, + "loss": 0.1933, + "step": 10878 + }, + { + "epoch": 0.8618736383442266, + "grad_norm": 1.134190258981955, + "learning_rate": 9.836418453523833e-07, + "loss": 0.1556, + "step": 10879 + }, + { + "epoch": 0.8619528619528619, + "grad_norm": 1.2408127782636982, + "learning_rate": 9.825323435485024e-07, + "loss": 0.1865, + "step": 10880 + }, + { + "epoch": 0.8620320855614974, + "grad_norm": 1.1714285081219094, + "learning_rate": 9.814234355014774e-07, + "loss": 0.2211, + "step": 10881 + }, + { + "epoch": 0.8621113091701327, + "grad_norm": 1.3132939582467724, + "learning_rate": 9.803151212843253e-07, + "loss": 0.2494, + "step": 10882 + }, + { + "epoch": 0.8621905327787681, + "grad_norm": 1.1852329333289056, + "learning_rate": 9.792074009700192e-07, + "loss": 0.1878, + "step": 10883 + }, + { + "epoch": 0.8622697563874034, + "grad_norm": 1.168486320282818, + "learning_rate": 9.781002746315039e-07, + "loss": 0.218, + "step": 10884 + }, + { + "epoch": 0.8623489799960388, + "grad_norm": 1.2109875371880268, + "learning_rate": 9.769937423416741e-07, + "loss": 0.1728, + "step": 10885 + }, + { + "epoch": 0.8624282036046742, + "grad_norm": 1.543794392817461, + "learning_rate": 9.758878041733877e-07, + "loss": 0.235, + "step": 10886 + }, + { + "epoch": 0.8625074272133095, + "grad_norm": 1.2262341573831652, + "learning_rate": 9.747824601994715e-07, + "loss": 0.1972, + "step": 10887 + }, + { + "epoch": 0.862586650821945, + "grad_norm": 1.5004806187741724, + "learning_rate": 9.73677710492703e-07, + "loss": 0.247, + "step": 10888 + }, + { + "epoch": 0.8626658744305803, + "grad_norm": 1.1547087878482098, + "learning_rate": 9.725735551258241e-07, + "loss": 0.143, + "step": 10889 + }, + { + "epoch": 0.8627450980392157, + "grad_norm": 1.4221493170959851, + "learning_rate": 9.7146999417154e-07, + "loss": 0.2801, + "step": 10890 + }, + { + "epoch": 0.8628243216478511, + "grad_norm": 1.3623810348968333, + "learning_rate": 9.703670277025158e-07, + "loss": 0.2082, + "step": 10891 + }, + { + "epoch": 0.8629035452564864, + "grad_norm": 1.2665455746022487, + "learning_rate": 9.69264655791372e-07, + "loss": 0.1748, + "step": 10892 + }, + { + "epoch": 0.8629827688651218, + "grad_norm": 1.3590893187993502, + "learning_rate": 9.681628785107e-07, + "loss": 0.1834, + "step": 10893 + }, + { + "epoch": 0.8630619924737571, + "grad_norm": 1.5082112304555808, + "learning_rate": 9.670616959330437e-07, + "loss": 0.259, + "step": 10894 + }, + { + "epoch": 0.8631412160823926, + "grad_norm": 1.3962149660803789, + "learning_rate": 9.659611081309095e-07, + "loss": 0.2243, + "step": 10895 + }, + { + "epoch": 0.8632204396910279, + "grad_norm": 1.240801038383661, + "learning_rate": 9.648611151767683e-07, + "loss": 0.1844, + "step": 10896 + }, + { + "epoch": 0.8632996632996633, + "grad_norm": 1.2873919308560537, + "learning_rate": 9.637617171430492e-07, + "loss": 0.1974, + "step": 10897 + }, + { + "epoch": 0.8633788869082987, + "grad_norm": 1.2778640224426707, + "learning_rate": 9.626629141021414e-07, + "loss": 0.2188, + "step": 10898 + }, + { + "epoch": 0.863458110516934, + "grad_norm": 1.606348585382887, + "learning_rate": 9.615647061263933e-07, + "loss": 0.2703, + "step": 10899 + }, + { + "epoch": 0.8635373341255694, + "grad_norm": 1.3798048352331447, + "learning_rate": 9.604670932881211e-07, + "loss": 0.2375, + "step": 10900 + }, + { + "epoch": 0.8636165577342048, + "grad_norm": 1.211770339108247, + "learning_rate": 9.593700756595958e-07, + "loss": 0.1788, + "step": 10901 + }, + { + "epoch": 0.8636957813428402, + "grad_norm": 1.0797022505941447, + "learning_rate": 9.582736533130488e-07, + "loss": 0.167, + "step": 10902 + }, + { + "epoch": 0.8637750049514755, + "grad_norm": 1.4780619163654718, + "learning_rate": 9.571778263206767e-07, + "loss": 0.2458, + "step": 10903 + }, + { + "epoch": 0.863854228560111, + "grad_norm": 1.043049188107328, + "learning_rate": 9.560825947546337e-07, + "loss": 0.1337, + "step": 10904 + }, + { + "epoch": 0.8639334521687463, + "grad_norm": 1.2947803000906086, + "learning_rate": 9.549879586870336e-07, + "loss": 0.185, + "step": 10905 + }, + { + "epoch": 0.8640126757773816, + "grad_norm": 1.5344780614419649, + "learning_rate": 9.538939181899565e-07, + "loss": 0.2566, + "step": 10906 + }, + { + "epoch": 0.864091899386017, + "grad_norm": 1.5942962347850829, + "learning_rate": 9.528004733354379e-07, + "loss": 0.3098, + "step": 10907 + }, + { + "epoch": 0.8641711229946524, + "grad_norm": 1.326237347161247, + "learning_rate": 9.517076241954737e-07, + "loss": 0.211, + "step": 10908 + }, + { + "epoch": 0.8642503466032878, + "grad_norm": 1.356001450969405, + "learning_rate": 9.506153708420263e-07, + "loss": 0.2195, + "step": 10909 + }, + { + "epoch": 0.8643295702119231, + "grad_norm": 1.6521792743959813, + "learning_rate": 9.495237133470148e-07, + "loss": 0.2301, + "step": 10910 + }, + { + "epoch": 0.8644087938205586, + "grad_norm": 1.5180080303497498, + "learning_rate": 9.484326517823173e-07, + "loss": 0.1946, + "step": 10911 + }, + { + "epoch": 0.8644880174291939, + "grad_norm": 1.1322782186896199, + "learning_rate": 9.473421862197751e-07, + "loss": 0.1524, + "step": 10912 + }, + { + "epoch": 0.8645672410378292, + "grad_norm": 1.4243448995952706, + "learning_rate": 9.462523167311943e-07, + "loss": 0.1872, + "step": 10913 + }, + { + "epoch": 0.8646464646464647, + "grad_norm": 1.1732463340075379, + "learning_rate": 9.45163043388333e-07, + "loss": 0.207, + "step": 10914 + }, + { + "epoch": 0.8647256882551, + "grad_norm": 1.2158997976338526, + "learning_rate": 9.440743662629149e-07, + "loss": 0.1815, + "step": 10915 + }, + { + "epoch": 0.8648049118637354, + "grad_norm": 1.3248290534335512, + "learning_rate": 9.429862854266281e-07, + "loss": 0.2098, + "step": 10916 + }, + { + "epoch": 0.8648841354723708, + "grad_norm": 1.2034546424848298, + "learning_rate": 9.418988009511143e-07, + "loss": 0.2353, + "step": 10917 + }, + { + "epoch": 0.8649633590810062, + "grad_norm": 1.2895565116565588, + "learning_rate": 9.408119129079774e-07, + "loss": 0.183, + "step": 10918 + }, + { + "epoch": 0.8650425826896415, + "grad_norm": 1.554008069256629, + "learning_rate": 9.397256213687877e-07, + "loss": 0.2293, + "step": 10919 + }, + { + "epoch": 0.8651218062982768, + "grad_norm": 1.2394962198928428, + "learning_rate": 9.386399264050705e-07, + "loss": 0.2105, + "step": 10920 + }, + { + "epoch": 0.8652010299069123, + "grad_norm": 1.339429508837962, + "learning_rate": 9.375548280883129e-07, + "loss": 0.2375, + "step": 10921 + }, + { + "epoch": 0.8652802535155476, + "grad_norm": 0.9124102492944345, + "learning_rate": 9.364703264899655e-07, + "loss": 0.1065, + "step": 10922 + }, + { + "epoch": 0.865359477124183, + "grad_norm": 1.1151787840672538, + "learning_rate": 9.353864216814356e-07, + "loss": 0.1661, + "step": 10923 + }, + { + "epoch": 0.8654387007328184, + "grad_norm": 1.4177055426984204, + "learning_rate": 9.34303113734093e-07, + "loss": 0.2484, + "step": 10924 + }, + { + "epoch": 0.8655179243414538, + "grad_norm": 1.5578039263984262, + "learning_rate": 9.332204027192693e-07, + "loss": 0.2164, + "step": 10925 + }, + { + "epoch": 0.8655971479500891, + "grad_norm": 1.6193992303596956, + "learning_rate": 9.321382887082564e-07, + "loss": 0.2751, + "step": 10926 + }, + { + "epoch": 0.8656763715587245, + "grad_norm": 1.1964422362871079, + "learning_rate": 9.310567717723063e-07, + "loss": 0.1943, + "step": 10927 + }, + { + "epoch": 0.8657555951673599, + "grad_norm": 1.6999540086970655, + "learning_rate": 9.299758519826274e-07, + "loss": 0.2348, + "step": 10928 + }, + { + "epoch": 0.8658348187759952, + "grad_norm": 1.5117679335126262, + "learning_rate": 9.288955294103996e-07, + "loss": 0.2438, + "step": 10929 + }, + { + "epoch": 0.8659140423846307, + "grad_norm": 1.1995615499375527, + "learning_rate": 9.278158041267526e-07, + "loss": 0.1496, + "step": 10930 + }, + { + "epoch": 0.865993265993266, + "grad_norm": 1.1587269927524892, + "learning_rate": 9.267366762027818e-07, + "loss": 0.1585, + "step": 10931 + }, + { + "epoch": 0.8660724896019014, + "grad_norm": 0.9650094265975712, + "learning_rate": 9.256581457095437e-07, + "loss": 0.1245, + "step": 10932 + }, + { + "epoch": 0.8661517132105367, + "grad_norm": 1.380043514146755, + "learning_rate": 9.245802127180547e-07, + "loss": 0.1904, + "step": 10933 + }, + { + "epoch": 0.8662309368191721, + "grad_norm": 1.5037731371741119, + "learning_rate": 9.235028772992883e-07, + "loss": 0.285, + "step": 10934 + }, + { + "epoch": 0.8663101604278075, + "grad_norm": 1.4442399337657974, + "learning_rate": 9.224261395241862e-07, + "loss": 0.1578, + "step": 10935 + }, + { + "epoch": 0.8663893840364428, + "grad_norm": 1.1398482578251599, + "learning_rate": 9.213499994636443e-07, + "loss": 0.1862, + "step": 10936 + }, + { + "epoch": 0.8664686076450783, + "grad_norm": 1.2647795753196418, + "learning_rate": 9.202744571885191e-07, + "loss": 0.2002, + "step": 10937 + }, + { + "epoch": 0.8665478312537136, + "grad_norm": 1.1883335094521148, + "learning_rate": 9.19199512769634e-07, + "loss": 0.2333, + "step": 10938 + }, + { + "epoch": 0.866627054862349, + "grad_norm": 1.1945691477103177, + "learning_rate": 9.181251662777668e-07, + "loss": 0.2093, + "step": 10939 + }, + { + "epoch": 0.8667062784709844, + "grad_norm": 1.1913456661792743, + "learning_rate": 9.170514177836565e-07, + "loss": 0.1837, + "step": 10940 + }, + { + "epoch": 0.8667855020796197, + "grad_norm": 1.2243469031713343, + "learning_rate": 9.159782673580075e-07, + "loss": 0.118, + "step": 10941 + }, + { + "epoch": 0.8668647256882551, + "grad_norm": 1.2756906533400505, + "learning_rate": 9.149057150714802e-07, + "loss": 0.212, + "step": 10942 + }, + { + "epoch": 0.8669439492968904, + "grad_norm": 1.1134171618131126, + "learning_rate": 9.138337609946979e-07, + "loss": 0.2144, + "step": 10943 + }, + { + "epoch": 0.8670231729055259, + "grad_norm": 1.4768432506185165, + "learning_rate": 9.127624051982398e-07, + "loss": 0.2339, + "step": 10944 + }, + { + "epoch": 0.8671023965141612, + "grad_norm": 1.263897220311466, + "learning_rate": 9.116916477526539e-07, + "loss": 0.1851, + "step": 10945 + }, + { + "epoch": 0.8671816201227966, + "grad_norm": 1.5791891811291119, + "learning_rate": 9.106214887284437e-07, + "loss": 0.2852, + "step": 10946 + }, + { + "epoch": 0.867260843731432, + "grad_norm": 1.2386945625987698, + "learning_rate": 9.095519281960729e-07, + "loss": 0.2138, + "step": 10947 + }, + { + "epoch": 0.8673400673400673, + "grad_norm": 1.3290999891055273, + "learning_rate": 9.084829662259665e-07, + "loss": 0.2024, + "step": 10948 + }, + { + "epoch": 0.8674192909487027, + "grad_norm": 1.861921203918453, + "learning_rate": 9.0741460288851e-07, + "loss": 0.342, + "step": 10949 + }, + { + "epoch": 0.8674985145573381, + "grad_norm": 1.305999328023627, + "learning_rate": 9.06346838254053e-07, + "loss": 0.1971, + "step": 10950 + }, + { + "epoch": 0.8675777381659735, + "grad_norm": 1.2046897929786464, + "learning_rate": 9.052796723929002e-07, + "loss": 0.1916, + "step": 10951 + }, + { + "epoch": 0.8676569617746088, + "grad_norm": 1.3205026075550685, + "learning_rate": 9.042131053753211e-07, + "loss": 0.2048, + "step": 10952 + }, + { + "epoch": 0.8677361853832442, + "grad_norm": 1.4058997461848455, + "learning_rate": 9.031471372715405e-07, + "loss": 0.266, + "step": 10953 + }, + { + "epoch": 0.8678154089918796, + "grad_norm": 1.316529575663865, + "learning_rate": 9.020817681517513e-07, + "loss": 0.2145, + "step": 10954 + }, + { + "epoch": 0.8678946326005149, + "grad_norm": 1.2794858968322589, + "learning_rate": 9.010169980861005e-07, + "loss": 0.207, + "step": 10955 + }, + { + "epoch": 0.8679738562091504, + "grad_norm": 1.2287307359967574, + "learning_rate": 8.999528271446989e-07, + "loss": 0.1669, + "step": 10956 + }, + { + "epoch": 0.8680530798177857, + "grad_norm": 1.0903229590726147, + "learning_rate": 8.988892553976169e-07, + "loss": 0.1455, + "step": 10957 + }, + { + "epoch": 0.8681323034264211, + "grad_norm": 1.582330315197623, + "learning_rate": 8.978262829148876e-07, + "loss": 0.2678, + "step": 10958 + }, + { + "epoch": 0.8682115270350564, + "grad_norm": 1.5708438063577659, + "learning_rate": 8.96763909766497e-07, + "loss": 0.2695, + "step": 10959 + }, + { + "epoch": 0.8682907506436918, + "grad_norm": 1.3148791979664767, + "learning_rate": 8.957021360224039e-07, + "loss": 0.2388, + "step": 10960 + }, + { + "epoch": 0.8683699742523272, + "grad_norm": 1.7123619185406738, + "learning_rate": 8.946409617525175e-07, + "loss": 0.2949, + "step": 10961 + }, + { + "epoch": 0.8684491978609625, + "grad_norm": 1.4435241513846273, + "learning_rate": 8.935803870267101e-07, + "loss": 0.2522, + "step": 10962 + }, + { + "epoch": 0.868528421469598, + "grad_norm": 1.120034147840874, + "learning_rate": 8.925204119148189e-07, + "loss": 0.1554, + "step": 10963 + }, + { + "epoch": 0.8686076450782333, + "grad_norm": 1.3501466941465314, + "learning_rate": 8.914610364866361e-07, + "loss": 0.2862, + "step": 10964 + }, + { + "epoch": 0.8686868686868687, + "grad_norm": 1.452284692523353, + "learning_rate": 8.904022608119145e-07, + "loss": 0.2429, + "step": 10965 + }, + { + "epoch": 0.868766092295504, + "grad_norm": 1.2375972280288476, + "learning_rate": 8.89344084960374e-07, + "loss": 0.2069, + "step": 10966 + }, + { + "epoch": 0.8688453159041394, + "grad_norm": 1.2389605328924405, + "learning_rate": 8.882865090016868e-07, + "loss": 0.1876, + "step": 10967 + }, + { + "epoch": 0.8689245395127748, + "grad_norm": 1.8458958577789004, + "learning_rate": 8.872295330054915e-07, + "loss": 0.3134, + "step": 10968 + }, + { + "epoch": 0.8690037631214101, + "grad_norm": 1.371580727712424, + "learning_rate": 8.861731570413801e-07, + "loss": 0.225, + "step": 10969 + }, + { + "epoch": 0.8690829867300456, + "grad_norm": 1.7226551739868343, + "learning_rate": 8.85117381178916e-07, + "loss": 0.2482, + "step": 10970 + }, + { + "epoch": 0.8691622103386809, + "grad_norm": 1.2290738992879762, + "learning_rate": 8.840622054876147e-07, + "loss": 0.1513, + "step": 10971 + }, + { + "epoch": 0.8692414339473163, + "grad_norm": 1.2683605262144488, + "learning_rate": 8.830076300369517e-07, + "loss": 0.1634, + "step": 10972 + }, + { + "epoch": 0.8693206575559517, + "grad_norm": 1.34435966259007, + "learning_rate": 8.819536548963703e-07, + "loss": 0.1668, + "step": 10973 + }, + { + "epoch": 0.869399881164587, + "grad_norm": 1.386717789523184, + "learning_rate": 8.809002801352673e-07, + "loss": 0.1848, + "step": 10974 + }, + { + "epoch": 0.8694791047732224, + "grad_norm": 0.9654778197765824, + "learning_rate": 8.798475058230005e-07, + "loss": 0.1142, + "step": 10975 + }, + { + "epoch": 0.8695583283818578, + "grad_norm": 1.6760387777734673, + "learning_rate": 8.787953320288945e-07, + "loss": 0.2834, + "step": 10976 + }, + { + "epoch": 0.8696375519904932, + "grad_norm": 1.3551031098416466, + "learning_rate": 8.777437588222271e-07, + "loss": 0.2072, + "step": 10977 + }, + { + "epoch": 0.8697167755991285, + "grad_norm": 1.4616504531889354, + "learning_rate": 8.766927862722374e-07, + "loss": 0.2546, + "step": 10978 + }, + { + "epoch": 0.869795999207764, + "grad_norm": 1.3499854907353181, + "learning_rate": 8.756424144481313e-07, + "loss": 0.2135, + "step": 10979 + }, + { + "epoch": 0.8698752228163993, + "grad_norm": 1.322560336155635, + "learning_rate": 8.745926434190688e-07, + "loss": 0.1567, + "step": 10980 + }, + { + "epoch": 0.8699544464250346, + "grad_norm": 1.4502476581345753, + "learning_rate": 8.735434732541704e-07, + "loss": 0.2297, + "step": 10981 + }, + { + "epoch": 0.87003367003367, + "grad_norm": 1.1899944458752492, + "learning_rate": 8.724949040225217e-07, + "loss": 0.1604, + "step": 10982 + }, + { + "epoch": 0.8701128936423054, + "grad_norm": 1.001816183839164, + "learning_rate": 8.714469357931654e-07, + "loss": 0.1413, + "step": 10983 + }, + { + "epoch": 0.8701921172509408, + "grad_norm": 1.3237821533786718, + "learning_rate": 8.703995686351041e-07, + "loss": 0.2083, + "step": 10984 + }, + { + "epoch": 0.8702713408595761, + "grad_norm": 1.4795948710707902, + "learning_rate": 8.693528026173015e-07, + "loss": 0.2303, + "step": 10985 + }, + { + "epoch": 0.8703505644682116, + "grad_norm": 1.5103329626783464, + "learning_rate": 8.683066378086846e-07, + "loss": 0.2453, + "step": 10986 + }, + { + "epoch": 0.8704297880768469, + "grad_norm": 1.3303880241439068, + "learning_rate": 8.672610742781363e-07, + "loss": 0.266, + "step": 10987 + }, + { + "epoch": 0.8705090116854822, + "grad_norm": 1.428034222228885, + "learning_rate": 8.662161120945e-07, + "loss": 0.2029, + "step": 10988 + }, + { + "epoch": 0.8705882352941177, + "grad_norm": 1.5814420767899395, + "learning_rate": 8.651717513265867e-07, + "loss": 0.2601, + "step": 10989 + }, + { + "epoch": 0.870667458902753, + "grad_norm": 1.2654750184815742, + "learning_rate": 8.641279920431589e-07, + "loss": 0.2102, + "step": 10990 + }, + { + "epoch": 0.8707466825113884, + "grad_norm": 1.1956528303822411, + "learning_rate": 8.630848343129417e-07, + "loss": 0.1529, + "step": 10991 + }, + { + "epoch": 0.8708259061200238, + "grad_norm": 1.432123432044092, + "learning_rate": 8.620422782046268e-07, + "loss": 0.2574, + "step": 10992 + }, + { + "epoch": 0.8709051297286592, + "grad_norm": 1.556229503727342, + "learning_rate": 8.61000323786858e-07, + "loss": 0.2348, + "step": 10993 + }, + { + "epoch": 0.8709843533372945, + "grad_norm": 1.536150081819509, + "learning_rate": 8.599589711282419e-07, + "loss": 0.2741, + "step": 10994 + }, + { + "epoch": 0.8710635769459298, + "grad_norm": 1.326657506866106, + "learning_rate": 8.589182202973512e-07, + "loss": 0.1705, + "step": 10995 + }, + { + "epoch": 0.8711428005545653, + "grad_norm": 1.4198472440024905, + "learning_rate": 8.578780713627111e-07, + "loss": 0.1942, + "step": 10996 + }, + { + "epoch": 0.8712220241632006, + "grad_norm": 1.5086303009041653, + "learning_rate": 8.568385243928112e-07, + "loss": 0.2148, + "step": 10997 + }, + { + "epoch": 0.871301247771836, + "grad_norm": 1.5810375505062402, + "learning_rate": 8.55799579456098e-07, + "loss": 0.2061, + "step": 10998 + }, + { + "epoch": 0.8713804713804714, + "grad_norm": 1.407463848256604, + "learning_rate": 8.547612366209856e-07, + "loss": 0.2059, + "step": 10999 + }, + { + "epoch": 0.8714596949891068, + "grad_norm": 1.4260066276032026, + "learning_rate": 8.537234959558416e-07, + "loss": 0.249, + "step": 11000 + }, + { + "epoch": 0.8715389185977421, + "grad_norm": 1.2726993720274564, + "learning_rate": 8.526863575289945e-07, + "loss": 0.1818, + "step": 11001 + }, + { + "epoch": 0.8716181422063775, + "grad_norm": 1.3746304664130624, + "learning_rate": 8.516498214087387e-07, + "loss": 0.2604, + "step": 11002 + }, + { + "epoch": 0.8716973658150129, + "grad_norm": 1.3874968294741086, + "learning_rate": 8.50613887663323e-07, + "loss": 0.211, + "step": 11003 + }, + { + "epoch": 0.8717765894236482, + "grad_norm": 1.6073515607700228, + "learning_rate": 8.495785563609571e-07, + "loss": 0.2727, + "step": 11004 + }, + { + "epoch": 0.8718558130322837, + "grad_norm": 1.091733494208886, + "learning_rate": 8.485438275698154e-07, + "loss": 0.1515, + "step": 11005 + }, + { + "epoch": 0.871935036640919, + "grad_norm": 1.598684502948701, + "learning_rate": 8.475097013580292e-07, + "loss": 0.3159, + "step": 11006 + }, + { + "epoch": 0.8720142602495544, + "grad_norm": 1.5456911200046748, + "learning_rate": 8.46476177793688e-07, + "loss": 0.257, + "step": 11007 + }, + { + "epoch": 0.8720934838581897, + "grad_norm": 1.4604479451342425, + "learning_rate": 8.454432569448489e-07, + "loss": 0.2196, + "step": 11008 + }, + { + "epoch": 0.8721727074668251, + "grad_norm": 1.1876870982750651, + "learning_rate": 8.444109388795218e-07, + "loss": 0.1993, + "step": 11009 + }, + { + "epoch": 0.8722519310754605, + "grad_norm": 1.4787841642784216, + "learning_rate": 8.43379223665679e-07, + "loss": 0.2092, + "step": 11010 + }, + { + "epoch": 0.8723311546840958, + "grad_norm": 1.8490366852740983, + "learning_rate": 8.423481113712573e-07, + "loss": 0.2833, + "step": 11011 + }, + { + "epoch": 0.8724103782927313, + "grad_norm": 1.4103347143508123, + "learning_rate": 8.413176020641489e-07, + "loss": 0.2437, + "step": 11012 + }, + { + "epoch": 0.8724896019013666, + "grad_norm": 1.7664250082128186, + "learning_rate": 8.402876958122075e-07, + "loss": 0.2252, + "step": 11013 + }, + { + "epoch": 0.872568825510002, + "grad_norm": 1.0938766565396143, + "learning_rate": 8.392583926832454e-07, + "loss": 0.1693, + "step": 11014 + }, + { + "epoch": 0.8726480491186374, + "grad_norm": 0.9905668909971334, + "learning_rate": 8.382296927450417e-07, + "loss": 0.1774, + "step": 11015 + }, + { + "epoch": 0.8727272727272727, + "grad_norm": 1.233948719998692, + "learning_rate": 8.37201596065329e-07, + "loss": 0.2105, + "step": 11016 + }, + { + "epoch": 0.8728064963359081, + "grad_norm": 1.3127308656656749, + "learning_rate": 8.361741027118009e-07, + "loss": 0.1875, + "step": 11017 + }, + { + "epoch": 0.8728857199445434, + "grad_norm": 1.232155174580531, + "learning_rate": 8.351472127521166e-07, + "loss": 0.2242, + "step": 11018 + }, + { + "epoch": 0.8729649435531789, + "grad_norm": 1.0847082777662558, + "learning_rate": 8.341209262538896e-07, + "loss": 0.1817, + "step": 11019 + }, + { + "epoch": 0.8730441671618142, + "grad_norm": 1.1706559251979154, + "learning_rate": 8.330952432846939e-07, + "loss": 0.1875, + "step": 11020 + }, + { + "epoch": 0.8731233907704496, + "grad_norm": 1.8639322309976276, + "learning_rate": 8.320701639120709e-07, + "loss": 0.2501, + "step": 11021 + }, + { + "epoch": 0.873202614379085, + "grad_norm": 1.2378732219655406, + "learning_rate": 8.310456882035145e-07, + "loss": 0.2084, + "step": 11022 + }, + { + "epoch": 0.8732818379877203, + "grad_norm": 1.6296874187400598, + "learning_rate": 8.300218162264783e-07, + "loss": 0.2419, + "step": 11023 + }, + { + "epoch": 0.8733610615963557, + "grad_norm": 1.6257912942697956, + "learning_rate": 8.289985480483864e-07, + "loss": 0.245, + "step": 11024 + }, + { + "epoch": 0.8734402852049911, + "grad_norm": 1.5562493278331457, + "learning_rate": 8.279758837366103e-07, + "loss": 0.3024, + "step": 11025 + }, + { + "epoch": 0.8735195088136265, + "grad_norm": 1.0670702602443827, + "learning_rate": 8.269538233584884e-07, + "loss": 0.1337, + "step": 11026 + }, + { + "epoch": 0.8735987324222618, + "grad_norm": 1.197466529482208, + "learning_rate": 8.259323669813202e-07, + "loss": 0.1636, + "step": 11027 + }, + { + "epoch": 0.8736779560308973, + "grad_norm": 1.0730691179579321, + "learning_rate": 8.24911514672363e-07, + "loss": 0.182, + "step": 11028 + }, + { + "epoch": 0.8737571796395326, + "grad_norm": 1.2043224340468681, + "learning_rate": 8.23891266498833e-07, + "loss": 0.231, + "step": 11029 + }, + { + "epoch": 0.8738364032481679, + "grad_norm": 1.2343701467696153, + "learning_rate": 8.228716225279121e-07, + "loss": 0.1678, + "step": 11030 + }, + { + "epoch": 0.8739156268568034, + "grad_norm": 1.2574325458080249, + "learning_rate": 8.218525828267377e-07, + "loss": 0.2006, + "step": 11031 + }, + { + "epoch": 0.8739948504654387, + "grad_norm": 1.3345528043643402, + "learning_rate": 8.208341474624071e-07, + "loss": 0.2395, + "step": 11032 + }, + { + "epoch": 0.8740740740740741, + "grad_norm": 1.5516198060829922, + "learning_rate": 8.198163165019812e-07, + "loss": 0.2861, + "step": 11033 + }, + { + "epoch": 0.8741532976827094, + "grad_norm": 1.2017585778556426, + "learning_rate": 8.187990900124787e-07, + "loss": 0.1844, + "step": 11034 + }, + { + "epoch": 0.8742325212913448, + "grad_norm": 1.2394218360158256, + "learning_rate": 8.177824680608781e-07, + "loss": 0.202, + "step": 11035 + }, + { + "epoch": 0.8743117448999802, + "grad_norm": 1.3411817428891513, + "learning_rate": 8.167664507141215e-07, + "loss": 0.2764, + "step": 11036 + }, + { + "epoch": 0.8743909685086155, + "grad_norm": 1.3824592149289336, + "learning_rate": 8.157510380391065e-07, + "loss": 0.2176, + "step": 11037 + }, + { + "epoch": 0.874470192117251, + "grad_norm": 1.52509665045312, + "learning_rate": 8.14736230102694e-07, + "loss": 0.2434, + "step": 11038 + }, + { + "epoch": 0.8745494157258863, + "grad_norm": 1.3003068545630025, + "learning_rate": 8.137220269717028e-07, + "loss": 0.1838, + "step": 11039 + }, + { + "epoch": 0.8746286393345217, + "grad_norm": 1.6021208927992185, + "learning_rate": 8.127084287129161e-07, + "loss": 0.2531, + "step": 11040 + }, + { + "epoch": 0.874707862943157, + "grad_norm": 1.8677573615930845, + "learning_rate": 8.116954353930728e-07, + "loss": 0.302, + "step": 11041 + }, + { + "epoch": 0.8747870865517924, + "grad_norm": 1.401093329763425, + "learning_rate": 8.106830470788729e-07, + "loss": 0.1962, + "step": 11042 + }, + { + "epoch": 0.8748663101604278, + "grad_norm": 1.2534528435668444, + "learning_rate": 8.096712638369797e-07, + "loss": 0.2002, + "step": 11043 + }, + { + "epoch": 0.8749455337690631, + "grad_norm": 1.0708341198687092, + "learning_rate": 8.086600857340121e-07, + "loss": 0.1855, + "step": 11044 + }, + { + "epoch": 0.8750247573776986, + "grad_norm": 1.1905031722178174, + "learning_rate": 8.076495128365502e-07, + "loss": 0.1736, + "step": 11045 + }, + { + "epoch": 0.8751039809863339, + "grad_norm": 1.1261796938267055, + "learning_rate": 8.066395452111387e-07, + "loss": 0.1873, + "step": 11046 + }, + { + "epoch": 0.8751832045949693, + "grad_norm": 1.271527684546844, + "learning_rate": 8.056301829242785e-07, + "loss": 0.2105, + "step": 11047 + }, + { + "epoch": 0.8752624282036047, + "grad_norm": 1.4334977524564723, + "learning_rate": 8.046214260424279e-07, + "loss": 0.2136, + "step": 11048 + }, + { + "epoch": 0.87534165181224, + "grad_norm": 1.1686400051912618, + "learning_rate": 8.036132746320125e-07, + "loss": 0.1832, + "step": 11049 + }, + { + "epoch": 0.8754208754208754, + "grad_norm": 1.3117493178899653, + "learning_rate": 8.026057287594136e-07, + "loss": 0.2429, + "step": 11050 + }, + { + "epoch": 0.8755000990295108, + "grad_norm": 1.22526825990827, + "learning_rate": 8.015987884909692e-07, + "loss": 0.2219, + "step": 11051 + }, + { + "epoch": 0.8755793226381462, + "grad_norm": 1.2637910742561942, + "learning_rate": 8.005924538929877e-07, + "loss": 0.2317, + "step": 11052 + }, + { + "epoch": 0.8756585462467815, + "grad_norm": 1.2336321698790642, + "learning_rate": 7.99586725031728e-07, + "loss": 0.1879, + "step": 11053 + }, + { + "epoch": 0.875737769855417, + "grad_norm": 1.4825172356737804, + "learning_rate": 7.985816019734127e-07, + "loss": 0.2367, + "step": 11054 + }, + { + "epoch": 0.8758169934640523, + "grad_norm": 1.0725833525479378, + "learning_rate": 7.975770847842234e-07, + "loss": 0.1663, + "step": 11055 + }, + { + "epoch": 0.8758962170726876, + "grad_norm": 1.4873619822004873, + "learning_rate": 7.965731735303051e-07, + "loss": 0.2388, + "step": 11056 + }, + { + "epoch": 0.875975440681323, + "grad_norm": 1.2603667465022275, + "learning_rate": 7.955698682777601e-07, + "loss": 0.1762, + "step": 11057 + }, + { + "epoch": 0.8760546642899584, + "grad_norm": 1.155112415391232, + "learning_rate": 7.945671690926471e-07, + "loss": 0.1318, + "step": 11058 + }, + { + "epoch": 0.8761338878985938, + "grad_norm": 1.3153989966272523, + "learning_rate": 7.935650760409952e-07, + "loss": 0.2959, + "step": 11059 + }, + { + "epoch": 0.8762131115072291, + "grad_norm": 1.3061923356999392, + "learning_rate": 7.925635891887839e-07, + "loss": 0.2124, + "step": 11060 + }, + { + "epoch": 0.8762923351158646, + "grad_norm": 1.406229190156455, + "learning_rate": 7.915627086019561e-07, + "loss": 0.1997, + "step": 11061 + }, + { + "epoch": 0.8763715587244999, + "grad_norm": 1.4943950331766096, + "learning_rate": 7.905624343464169e-07, + "loss": 0.209, + "step": 11062 + }, + { + "epoch": 0.8764507823331352, + "grad_norm": 1.1755314891839619, + "learning_rate": 7.895627664880278e-07, + "loss": 0.1829, + "step": 11063 + }, + { + "epoch": 0.8765300059417707, + "grad_norm": 1.049558416720703, + "learning_rate": 7.88563705092612e-07, + "loss": 0.1286, + "step": 11064 + }, + { + "epoch": 0.876609229550406, + "grad_norm": 1.156833231199455, + "learning_rate": 7.875652502259545e-07, + "loss": 0.204, + "step": 11065 + }, + { + "epoch": 0.8766884531590414, + "grad_norm": 1.1483347128609238, + "learning_rate": 7.865674019537983e-07, + "loss": 0.1588, + "step": 11066 + }, + { + "epoch": 0.8767676767676768, + "grad_norm": 1.240745984158369, + "learning_rate": 7.855701603418442e-07, + "loss": 0.2374, + "step": 11067 + }, + { + "epoch": 0.8768469003763122, + "grad_norm": 1.126659633137706, + "learning_rate": 7.845735254557608e-07, + "loss": 0.1643, + "step": 11068 + }, + { + "epoch": 0.8769261239849475, + "grad_norm": 1.3738534308841979, + "learning_rate": 7.835774973611687e-07, + "loss": 0.2059, + "step": 11069 + }, + { + "epoch": 0.8770053475935828, + "grad_norm": 1.7316568053771537, + "learning_rate": 7.825820761236514e-07, + "loss": 0.2723, + "step": 11070 + }, + { + "epoch": 0.8770845712022183, + "grad_norm": 1.4668483675071118, + "learning_rate": 7.815872618087506e-07, + "loss": 0.2109, + "step": 11071 + }, + { + "epoch": 0.8771637948108536, + "grad_norm": 1.255684583962485, + "learning_rate": 7.805930544819751e-07, + "loss": 0.2296, + "step": 11072 + }, + { + "epoch": 0.877243018419489, + "grad_norm": 1.479811680000921, + "learning_rate": 7.795994542087859e-07, + "loss": 0.2477, + "step": 11073 + }, + { + "epoch": 0.8773222420281244, + "grad_norm": 1.307791566773147, + "learning_rate": 7.786064610546051e-07, + "loss": 0.2363, + "step": 11074 + }, + { + "epoch": 0.8774014656367598, + "grad_norm": 1.4106382168016707, + "learning_rate": 7.776140750848205e-07, + "loss": 0.2341, + "step": 11075 + }, + { + "epoch": 0.8774806892453951, + "grad_norm": 1.4019740144926003, + "learning_rate": 7.766222963647729e-07, + "loss": 0.2185, + "step": 11076 + }, + { + "epoch": 0.8775599128540305, + "grad_norm": 1.361092548799947, + "learning_rate": 7.756311249597659e-07, + "loss": 0.1828, + "step": 11077 + }, + { + "epoch": 0.8776391364626659, + "grad_norm": 1.0306974626227006, + "learning_rate": 7.746405609350661e-07, + "loss": 0.1323, + "step": 11078 + }, + { + "epoch": 0.8777183600713012, + "grad_norm": 1.461759794181452, + "learning_rate": 7.736506043558956e-07, + "loss": 0.2763, + "step": 11079 + }, + { + "epoch": 0.8777975836799367, + "grad_norm": 1.5478319559655647, + "learning_rate": 7.726612552874368e-07, + "loss": 0.2292, + "step": 11080 + }, + { + "epoch": 0.877876807288572, + "grad_norm": 1.3286755972548578, + "learning_rate": 7.716725137948366e-07, + "loss": 0.1805, + "step": 11081 + }, + { + "epoch": 0.8779560308972074, + "grad_norm": 1.3462673078352407, + "learning_rate": 7.706843799431985e-07, + "loss": 0.2273, + "step": 11082 + }, + { + "epoch": 0.8780352545058427, + "grad_norm": 1.3847279568819344, + "learning_rate": 7.696968537975847e-07, + "loss": 0.2084, + "step": 11083 + }, + { + "epoch": 0.8781144781144781, + "grad_norm": 1.5835594301569003, + "learning_rate": 7.687099354230177e-07, + "loss": 0.2529, + "step": 11084 + }, + { + "epoch": 0.8781937017231135, + "grad_norm": 1.4199092441760521, + "learning_rate": 7.677236248844855e-07, + "loss": 0.1946, + "step": 11085 + }, + { + "epoch": 0.8782729253317488, + "grad_norm": 1.3960097555945583, + "learning_rate": 7.667379222469295e-07, + "loss": 0.2542, + "step": 11086 + }, + { + "epoch": 0.8783521489403843, + "grad_norm": 1.3809982901171423, + "learning_rate": 7.657528275752524e-07, + "loss": 0.1681, + "step": 11087 + }, + { + "epoch": 0.8784313725490196, + "grad_norm": 1.322361657260325, + "learning_rate": 7.647683409343198e-07, + "loss": 0.2203, + "step": 11088 + }, + { + "epoch": 0.878510596157655, + "grad_norm": 1.2374436235833877, + "learning_rate": 7.637844623889557e-07, + "loss": 0.171, + "step": 11089 + }, + { + "epoch": 0.8785898197662904, + "grad_norm": 1.0636589319077459, + "learning_rate": 7.628011920039414e-07, + "loss": 0.1251, + "step": 11090 + }, + { + "epoch": 0.8786690433749257, + "grad_norm": 1.2797663851150352, + "learning_rate": 7.618185298440239e-07, + "loss": 0.2013, + "step": 11091 + }, + { + "epoch": 0.8787482669835611, + "grad_norm": 1.017909437188796, + "learning_rate": 7.608364759739039e-07, + "loss": 0.1548, + "step": 11092 + }, + { + "epoch": 0.8788274905921964, + "grad_norm": 1.2060453997920095, + "learning_rate": 7.598550304582453e-07, + "loss": 0.1451, + "step": 11093 + }, + { + "epoch": 0.8789067142008319, + "grad_norm": 1.4554247336312824, + "learning_rate": 7.588741933616728e-07, + "loss": 0.2478, + "step": 11094 + }, + { + "epoch": 0.8789859378094672, + "grad_norm": 1.139426926125611, + "learning_rate": 7.578939647487705e-07, + "loss": 0.199, + "step": 11095 + }, + { + "epoch": 0.8790651614181026, + "grad_norm": 1.7164346720472863, + "learning_rate": 7.569143446840776e-07, + "loss": 0.1908, + "step": 11096 + }, + { + "epoch": 0.879144385026738, + "grad_norm": 1.2241552551552792, + "learning_rate": 7.559353332321029e-07, + "loss": 0.1579, + "step": 11097 + }, + { + "epoch": 0.8792236086353733, + "grad_norm": 1.3042312548615864, + "learning_rate": 7.549569304573057e-07, + "loss": 0.2983, + "step": 11098 + }, + { + "epoch": 0.8793028322440087, + "grad_norm": 1.4597314264982106, + "learning_rate": 7.539791364241111e-07, + "loss": 0.2171, + "step": 11099 + }, + { + "epoch": 0.8793820558526441, + "grad_norm": 1.288960280731306, + "learning_rate": 7.530019511969e-07, + "loss": 0.2132, + "step": 11100 + }, + { + "epoch": 0.8794612794612795, + "grad_norm": 1.2335618199467266, + "learning_rate": 7.520253748400175e-07, + "loss": 0.1876, + "step": 11101 + }, + { + "epoch": 0.8795405030699148, + "grad_norm": 1.282500744679882, + "learning_rate": 7.510494074177666e-07, + "loss": 0.1572, + "step": 11102 + }, + { + "epoch": 0.8796197266785503, + "grad_norm": 1.3745829898359752, + "learning_rate": 7.500740489944092e-07, + "loss": 0.204, + "step": 11103 + }, + { + "epoch": 0.8796989502871856, + "grad_norm": 1.2433566767216206, + "learning_rate": 7.490992996341662e-07, + "loss": 0.1751, + "step": 11104 + }, + { + "epoch": 0.8797781738958209, + "grad_norm": 1.2242354687551547, + "learning_rate": 7.481251594012218e-07, + "loss": 0.2253, + "step": 11105 + }, + { + "epoch": 0.8798573975044564, + "grad_norm": 1.0540689391313347, + "learning_rate": 7.471516283597191e-07, + "loss": 0.1301, + "step": 11106 + }, + { + "epoch": 0.8799366211130917, + "grad_norm": 1.1384716469699712, + "learning_rate": 7.461787065737602e-07, + "loss": 0.1699, + "step": 11107 + }, + { + "epoch": 0.8800158447217271, + "grad_norm": 1.1431374500923583, + "learning_rate": 7.452063941074073e-07, + "loss": 0.1847, + "step": 11108 + }, + { + "epoch": 0.8800950683303624, + "grad_norm": 1.3455507058059666, + "learning_rate": 7.442346910246801e-07, + "loss": 0.2071, + "step": 11109 + }, + { + "epoch": 0.8801742919389978, + "grad_norm": 1.0702238151840346, + "learning_rate": 7.432635973895652e-07, + "loss": 0.1506, + "step": 11110 + }, + { + "epoch": 0.8802535155476332, + "grad_norm": 1.4738004171473327, + "learning_rate": 7.422931132660005e-07, + "loss": 0.2722, + "step": 11111 + }, + { + "epoch": 0.8803327391562685, + "grad_norm": 1.1988007343533718, + "learning_rate": 7.413232387178882e-07, + "loss": 0.1566, + "step": 11112 + }, + { + "epoch": 0.880411962764904, + "grad_norm": 1.5629736332022217, + "learning_rate": 7.403539738090914e-07, + "loss": 0.182, + "step": 11113 + }, + { + "epoch": 0.8804911863735393, + "grad_norm": 1.2045602113079472, + "learning_rate": 7.393853186034316e-07, + "loss": 0.2437, + "step": 11114 + }, + { + "epoch": 0.8805704099821747, + "grad_norm": 1.7718334320501854, + "learning_rate": 7.384172731646877e-07, + "loss": 0.2216, + "step": 11115 + }, + { + "epoch": 0.88064963359081, + "grad_norm": 1.4815659028967725, + "learning_rate": 7.374498375566042e-07, + "loss": 0.2767, + "step": 11116 + }, + { + "epoch": 0.8807288571994454, + "grad_norm": 1.3618822420471255, + "learning_rate": 7.364830118428801e-07, + "loss": 0.3079, + "step": 11117 + }, + { + "epoch": 0.8808080808080808, + "grad_norm": 1.319939752152183, + "learning_rate": 7.355167960871745e-07, + "loss": 0.2413, + "step": 11118 + }, + { + "epoch": 0.8808873044167161, + "grad_norm": 1.3448470650454571, + "learning_rate": 7.345511903531122e-07, + "loss": 0.206, + "step": 11119 + }, + { + "epoch": 0.8809665280253516, + "grad_norm": 1.199780251106472, + "learning_rate": 7.335861947042711e-07, + "loss": 0.1903, + "step": 11120 + }, + { + "epoch": 0.8810457516339869, + "grad_norm": 1.0882751935282775, + "learning_rate": 7.326218092041903e-07, + "loss": 0.1897, + "step": 11121 + }, + { + "epoch": 0.8811249752426223, + "grad_norm": 1.5217220129699023, + "learning_rate": 7.316580339163736e-07, + "loss": 0.2356, + "step": 11122 + }, + { + "epoch": 0.8812041988512577, + "grad_norm": 1.0117488945426965, + "learning_rate": 7.306948689042792e-07, + "loss": 0.1478, + "step": 11123 + }, + { + "epoch": 0.881283422459893, + "grad_norm": 1.3570468389853543, + "learning_rate": 7.297323142313262e-07, + "loss": 0.1979, + "step": 11124 + }, + { + "epoch": 0.8813626460685284, + "grad_norm": 1.1616341187651766, + "learning_rate": 7.287703699608928e-07, + "loss": 0.2282, + "step": 11125 + }, + { + "epoch": 0.8814418696771638, + "grad_norm": 1.222054044275733, + "learning_rate": 7.278090361563228e-07, + "loss": 0.199, + "step": 11126 + }, + { + "epoch": 0.8815210932857992, + "grad_norm": 1.6931377696165042, + "learning_rate": 7.268483128809122e-07, + "loss": 0.3056, + "step": 11127 + }, + { + "epoch": 0.8816003168944345, + "grad_norm": 1.358530680217993, + "learning_rate": 7.258882001979184e-07, + "loss": 0.2068, + "step": 11128 + }, + { + "epoch": 0.88167954050307, + "grad_norm": 1.2980385919963116, + "learning_rate": 7.24928698170565e-07, + "loss": 0.2191, + "step": 11129 + }, + { + "epoch": 0.8817587641117053, + "grad_norm": 1.7320359056148045, + "learning_rate": 7.239698068620272e-07, + "loss": 0.3471, + "step": 11130 + }, + { + "epoch": 0.8818379877203406, + "grad_norm": 1.1214908432589366, + "learning_rate": 7.230115263354431e-07, + "loss": 0.1731, + "step": 11131 + }, + { + "epoch": 0.881917211328976, + "grad_norm": 1.4544838823560668, + "learning_rate": 7.220538566539137e-07, + "loss": 0.2368, + "step": 11132 + }, + { + "epoch": 0.8819964349376114, + "grad_norm": 1.444508196116555, + "learning_rate": 7.21096797880495e-07, + "loss": 0.242, + "step": 11133 + }, + { + "epoch": 0.8820756585462468, + "grad_norm": 1.5998714020825704, + "learning_rate": 7.201403500782034e-07, + "loss": 0.258, + "step": 11134 + }, + { + "epoch": 0.8821548821548821, + "grad_norm": 1.5047038343701382, + "learning_rate": 7.191845133100195e-07, + "loss": 0.2553, + "step": 11135 + }, + { + "epoch": 0.8822341057635176, + "grad_norm": 1.2335738143808541, + "learning_rate": 7.182292876388785e-07, + "loss": 0.1876, + "step": 11136 + }, + { + "epoch": 0.8823133293721529, + "grad_norm": 1.2675700310218028, + "learning_rate": 7.17274673127677e-07, + "loss": 0.1927, + "step": 11137 + }, + { + "epoch": 0.8823925529807882, + "grad_norm": 1.6746170357596029, + "learning_rate": 7.163206698392744e-07, + "loss": 0.235, + "step": 11138 + }, + { + "epoch": 0.8824717765894237, + "grad_norm": 1.112452350595933, + "learning_rate": 7.153672778364851e-07, + "loss": 0.1865, + "step": 11139 + }, + { + "epoch": 0.882551000198059, + "grad_norm": 1.3476963583302197, + "learning_rate": 7.144144971820855e-07, + "loss": 0.2492, + "step": 11140 + }, + { + "epoch": 0.8826302238066944, + "grad_norm": 1.3745601291651905, + "learning_rate": 7.134623279388098e-07, + "loss": 0.2108, + "step": 11141 + }, + { + "epoch": 0.8827094474153298, + "grad_norm": 1.4471829751991212, + "learning_rate": 7.12510770169359e-07, + "loss": 0.1942, + "step": 11142 + }, + { + "epoch": 0.8827886710239652, + "grad_norm": 1.3579750508280246, + "learning_rate": 7.115598239363842e-07, + "loss": 0.2415, + "step": 11143 + }, + { + "epoch": 0.8828678946326005, + "grad_norm": 1.649660353080689, + "learning_rate": 7.106094893025006e-07, + "loss": 0.3616, + "step": 11144 + }, + { + "epoch": 0.8829471182412358, + "grad_norm": 1.4083689544411717, + "learning_rate": 7.096597663302862e-07, + "loss": 0.2494, + "step": 11145 + }, + { + "epoch": 0.8830263418498713, + "grad_norm": 1.1868097209900177, + "learning_rate": 7.087106550822731e-07, + "loss": 0.2271, + "step": 11146 + }, + { + "epoch": 0.8831055654585066, + "grad_norm": 0.9655119414661911, + "learning_rate": 7.077621556209557e-07, + "loss": 0.1362, + "step": 11147 + }, + { + "epoch": 0.883184789067142, + "grad_norm": 1.25980641754946, + "learning_rate": 7.068142680087909e-07, + "loss": 0.2124, + "step": 11148 + }, + { + "epoch": 0.8832640126757774, + "grad_norm": 1.382570708612095, + "learning_rate": 7.058669923081896e-07, + "loss": 0.2326, + "step": 11149 + }, + { + "epoch": 0.8833432362844128, + "grad_norm": 1.2965121182964578, + "learning_rate": 7.049203285815253e-07, + "loss": 0.1973, + "step": 11150 + }, + { + "epoch": 0.8834224598930481, + "grad_norm": 1.2962697860733432, + "learning_rate": 7.03974276891134e-07, + "loss": 0.1832, + "step": 11151 + }, + { + "epoch": 0.8835016835016835, + "grad_norm": 1.1769949871104188, + "learning_rate": 7.030288372993066e-07, + "loss": 0.1942, + "step": 11152 + }, + { + "epoch": 0.8835809071103189, + "grad_norm": 1.3493706930594893, + "learning_rate": 7.020840098682968e-07, + "loss": 0.1892, + "step": 11153 + }, + { + "epoch": 0.8836601307189542, + "grad_norm": 1.19527452146034, + "learning_rate": 7.011397946603138e-07, + "loss": 0.1981, + "step": 11154 + }, + { + "epoch": 0.8837393543275897, + "grad_norm": 1.2802627183424293, + "learning_rate": 7.001961917375344e-07, + "loss": 0.2223, + "step": 11155 + }, + { + "epoch": 0.883818577936225, + "grad_norm": 1.0112715127758398, + "learning_rate": 6.992532011620878e-07, + "loss": 0.1678, + "step": 11156 + }, + { + "epoch": 0.8838978015448604, + "grad_norm": 1.3056125498038613, + "learning_rate": 6.983108229960633e-07, + "loss": 0.1819, + "step": 11157 + }, + { + "epoch": 0.8839770251534957, + "grad_norm": 1.6220393942856794, + "learning_rate": 6.973690573015168e-07, + "loss": 0.3556, + "step": 11158 + }, + { + "epoch": 0.8840562487621311, + "grad_norm": 1.2363436001711858, + "learning_rate": 6.964279041404553e-07, + "loss": 0.2301, + "step": 11159 + }, + { + "epoch": 0.8841354723707665, + "grad_norm": 1.2124722647500983, + "learning_rate": 6.954873635748493e-07, + "loss": 0.1897, + "step": 11160 + }, + { + "epoch": 0.8842146959794018, + "grad_norm": 1.5778933884048083, + "learning_rate": 6.945474356666326e-07, + "loss": 0.2883, + "step": 11161 + }, + { + "epoch": 0.8842939195880373, + "grad_norm": 1.5878407158219214, + "learning_rate": 6.936081204776913e-07, + "loss": 0.243, + "step": 11162 + }, + { + "epoch": 0.8843731431966726, + "grad_norm": 1.2327575442565035, + "learning_rate": 6.926694180698734e-07, + "loss": 0.2355, + "step": 11163 + }, + { + "epoch": 0.884452366805308, + "grad_norm": 1.7790062040204306, + "learning_rate": 6.917313285049931e-07, + "loss": 0.2953, + "step": 11164 + }, + { + "epoch": 0.8845315904139434, + "grad_norm": 0.9657329218742188, + "learning_rate": 6.907938518448154e-07, + "loss": 0.1193, + "step": 11165 + }, + { + "epoch": 0.8846108140225787, + "grad_norm": 1.7615270238150564, + "learning_rate": 6.898569881510686e-07, + "loss": 0.2776, + "step": 11166 + }, + { + "epoch": 0.8846900376312141, + "grad_norm": 1.0801892530572057, + "learning_rate": 6.889207374854434e-07, + "loss": 0.144, + "step": 11167 + }, + { + "epoch": 0.8847692612398494, + "grad_norm": 1.3636255150494567, + "learning_rate": 6.879850999095849e-07, + "loss": 0.1833, + "step": 11168 + }, + { + "epoch": 0.8848484848484849, + "grad_norm": 1.2873457344877273, + "learning_rate": 6.870500754851017e-07, + "loss": 0.2284, + "step": 11169 + }, + { + "epoch": 0.8849277084571202, + "grad_norm": 1.3261903324981807, + "learning_rate": 6.861156642735578e-07, + "loss": 0.2459, + "step": 11170 + }, + { + "epoch": 0.8850069320657556, + "grad_norm": 1.3679142563892701, + "learning_rate": 6.851818663364839e-07, + "loss": 0.2227, + "step": 11171 + }, + { + "epoch": 0.885086155674391, + "grad_norm": 1.259947387383908, + "learning_rate": 6.842486817353633e-07, + "loss": 0.1846, + "step": 11172 + }, + { + "epoch": 0.8851653792830263, + "grad_norm": 1.255798605486765, + "learning_rate": 6.833161105316421e-07, + "loss": 0.1529, + "step": 11173 + }, + { + "epoch": 0.8852446028916617, + "grad_norm": 1.3375287177979553, + "learning_rate": 6.823841527867259e-07, + "loss": 0.1565, + "step": 11174 + }, + { + "epoch": 0.8853238265002971, + "grad_norm": 1.1555889860871866, + "learning_rate": 6.814528085619809e-07, + "loss": 0.2071, + "step": 11175 + }, + { + "epoch": 0.8854030501089325, + "grad_norm": 1.607548745541149, + "learning_rate": 6.805220779187293e-07, + "loss": 0.2745, + "step": 11176 + }, + { + "epoch": 0.8854822737175678, + "grad_norm": 1.562189023855713, + "learning_rate": 6.795919609182566e-07, + "loss": 0.2255, + "step": 11177 + }, + { + "epoch": 0.8855614973262033, + "grad_norm": 1.1739269051586865, + "learning_rate": 6.78662457621807e-07, + "loss": 0.171, + "step": 11178 + }, + { + "epoch": 0.8856407209348386, + "grad_norm": 1.4386673258686113, + "learning_rate": 6.777335680905817e-07, + "loss": 0.2452, + "step": 11179 + }, + { + "epoch": 0.8857199445434739, + "grad_norm": 1.4385564294416149, + "learning_rate": 6.768052923857482e-07, + "loss": 0.2663, + "step": 11180 + }, + { + "epoch": 0.8857991681521094, + "grad_norm": 1.2830687063579134, + "learning_rate": 6.758776305684245e-07, + "loss": 0.1993, + "step": 11181 + }, + { + "epoch": 0.8858783917607447, + "grad_norm": 1.2216385577401376, + "learning_rate": 6.749505826996927e-07, + "loss": 0.1905, + "step": 11182 + }, + { + "epoch": 0.8859576153693801, + "grad_norm": 1.2403824442901343, + "learning_rate": 6.740241488405963e-07, + "loss": 0.1887, + "step": 11183 + }, + { + "epoch": 0.8860368389780154, + "grad_norm": 1.481847442711561, + "learning_rate": 6.730983290521365e-07, + "loss": 0.2798, + "step": 11184 + }, + { + "epoch": 0.8861160625866509, + "grad_norm": 1.3141211503990882, + "learning_rate": 6.721731233952722e-07, + "loss": 0.1858, + "step": 11185 + }, + { + "epoch": 0.8861952861952862, + "grad_norm": 1.3968216571322727, + "learning_rate": 6.712485319309258e-07, + "loss": 0.2189, + "step": 11186 + }, + { + "epoch": 0.8862745098039215, + "grad_norm": 1.4851416177370635, + "learning_rate": 6.703245547199777e-07, + "loss": 0.2847, + "step": 11187 + }, + { + "epoch": 0.886353733412557, + "grad_norm": 1.1651476446106792, + "learning_rate": 6.694011918232635e-07, + "loss": 0.1892, + "step": 11188 + }, + { + "epoch": 0.8864329570211923, + "grad_norm": 1.7532545089733287, + "learning_rate": 6.684784433015867e-07, + "loss": 0.3073, + "step": 11189 + }, + { + "epoch": 0.8865121806298277, + "grad_norm": 1.4838859852101163, + "learning_rate": 6.675563092157044e-07, + "loss": 0.2549, + "step": 11190 + }, + { + "epoch": 0.886591404238463, + "grad_norm": 1.345930590631144, + "learning_rate": 6.666347896263326e-07, + "loss": 0.23, + "step": 11191 + }, + { + "epoch": 0.8866706278470984, + "grad_norm": 1.3008548402229794, + "learning_rate": 6.657138845941524e-07, + "loss": 0.215, + "step": 11192 + }, + { + "epoch": 0.8867498514557338, + "grad_norm": 1.302783979038374, + "learning_rate": 6.64793594179799e-07, + "loss": 0.2004, + "step": 11193 + }, + { + "epoch": 0.8868290750643691, + "grad_norm": 1.3305961263873558, + "learning_rate": 6.638739184438681e-07, + "loss": 0.2573, + "step": 11194 + }, + { + "epoch": 0.8869082986730046, + "grad_norm": 1.4755547664068145, + "learning_rate": 6.629548574469169e-07, + "loss": 0.2312, + "step": 11195 + }, + { + "epoch": 0.8869875222816399, + "grad_norm": 1.4096334066281144, + "learning_rate": 6.620364112494627e-07, + "loss": 0.2833, + "step": 11196 + }, + { + "epoch": 0.8870667458902753, + "grad_norm": 1.601073610284219, + "learning_rate": 6.611185799119791e-07, + "loss": 0.303, + "step": 11197 + }, + { + "epoch": 0.8871459694989107, + "grad_norm": 1.2344496729232577, + "learning_rate": 6.602013634949001e-07, + "loss": 0.1862, + "step": 11198 + }, + { + "epoch": 0.887225193107546, + "grad_norm": 1.12707585042057, + "learning_rate": 6.592847620586217e-07, + "loss": 0.173, + "step": 11199 + }, + { + "epoch": 0.8873044167161814, + "grad_norm": 1.4024972166229444, + "learning_rate": 6.583687756634982e-07, + "loss": 0.284, + "step": 11200 + }, + { + "epoch": 0.8873836403248168, + "grad_norm": 1.3061542272842148, + "learning_rate": 6.574534043698399e-07, + "loss": 0.1859, + "step": 11201 + }, + { + "epoch": 0.8874628639334522, + "grad_norm": 1.2102091399374055, + "learning_rate": 6.565386482379221e-07, + "loss": 0.1791, + "step": 11202 + }, + { + "epoch": 0.8875420875420875, + "grad_norm": 1.3560088952632925, + "learning_rate": 6.556245073279777e-07, + "loss": 0.2441, + "step": 11203 + }, + { + "epoch": 0.887621311150723, + "grad_norm": 1.3616742472740673, + "learning_rate": 6.547109817001951e-07, + "loss": 0.269, + "step": 11204 + }, + { + "epoch": 0.8877005347593583, + "grad_norm": 1.5537426699024823, + "learning_rate": 6.537980714147285e-07, + "loss": 0.1964, + "step": 11205 + }, + { + "epoch": 0.8877797583679936, + "grad_norm": 1.4469620018299898, + "learning_rate": 6.528857765316887e-07, + "loss": 0.1995, + "step": 11206 + }, + { + "epoch": 0.887858981976629, + "grad_norm": 1.3594143263762029, + "learning_rate": 6.519740971111432e-07, + "loss": 0.2197, + "step": 11207 + }, + { + "epoch": 0.8879382055852644, + "grad_norm": 1.330943572967816, + "learning_rate": 6.510630332131262e-07, + "loss": 0.2282, + "step": 11208 + }, + { + "epoch": 0.8880174291938998, + "grad_norm": 1.12589695284791, + "learning_rate": 6.501525848976231e-07, + "loss": 0.1755, + "step": 11209 + }, + { + "epoch": 0.8880966528025351, + "grad_norm": 1.2699153044146974, + "learning_rate": 6.492427522245836e-07, + "loss": 0.255, + "step": 11210 + }, + { + "epoch": 0.8881758764111706, + "grad_norm": 1.0919734239627077, + "learning_rate": 6.483335352539144e-07, + "loss": 0.1212, + "step": 11211 + }, + { + "epoch": 0.8882551000198059, + "grad_norm": 1.5067786466742912, + "learning_rate": 6.474249340454874e-07, + "loss": 0.1916, + "step": 11212 + }, + { + "epoch": 0.8883343236284412, + "grad_norm": 1.4560719171590666, + "learning_rate": 6.46516948659125e-07, + "loss": 0.2527, + "step": 11213 + }, + { + "epoch": 0.8884135472370767, + "grad_norm": 1.3816567045350425, + "learning_rate": 6.456095791546147e-07, + "loss": 0.1945, + "step": 11214 + }, + { + "epoch": 0.888492770845712, + "grad_norm": 1.283116813443388, + "learning_rate": 6.447028255917054e-07, + "loss": 0.1947, + "step": 11215 + }, + { + "epoch": 0.8885719944543474, + "grad_norm": 1.2115017691856218, + "learning_rate": 6.437966880300995e-07, + "loss": 0.163, + "step": 11216 + }, + { + "epoch": 0.8886512180629828, + "grad_norm": 1.1112203629223338, + "learning_rate": 6.428911665294601e-07, + "loss": 0.1313, + "step": 11217 + }, + { + "epoch": 0.8887304416716182, + "grad_norm": 1.201416995681213, + "learning_rate": 6.419862611494165e-07, + "loss": 0.1937, + "step": 11218 + }, + { + "epoch": 0.8888096652802535, + "grad_norm": 1.4623509539080328, + "learning_rate": 6.410819719495498e-07, + "loss": 0.2313, + "step": 11219 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 1.5902800834121844, + "learning_rate": 6.401782989894012e-07, + "loss": 0.2523, + "step": 11220 + }, + { + "epoch": 0.8889681124975243, + "grad_norm": 1.5290063797646887, + "learning_rate": 6.392752423284765e-07, + "loss": 0.2426, + "step": 11221 + }, + { + "epoch": 0.8890473361061596, + "grad_norm": 1.5289862780140833, + "learning_rate": 6.383728020262359e-07, + "loss": 0.2038, + "step": 11222 + }, + { + "epoch": 0.889126559714795, + "grad_norm": 1.1468066913938832, + "learning_rate": 6.374709781420995e-07, + "loss": 0.1976, + "step": 11223 + }, + { + "epoch": 0.8892057833234304, + "grad_norm": 1.226412591385611, + "learning_rate": 6.365697707354512e-07, + "loss": 0.2047, + "step": 11224 + }, + { + "epoch": 0.8892850069320658, + "grad_norm": 1.7639263508154128, + "learning_rate": 6.3566917986563e-07, + "loss": 0.2615, + "step": 11225 + }, + { + "epoch": 0.8893642305407011, + "grad_norm": 1.0820193125245585, + "learning_rate": 6.347692055919353e-07, + "loss": 0.1468, + "step": 11226 + }, + { + "epoch": 0.8894434541493365, + "grad_norm": 1.2999128873784225, + "learning_rate": 6.338698479736227e-07, + "loss": 0.2429, + "step": 11227 + }, + { + "epoch": 0.8895226777579719, + "grad_norm": 1.1729513853060665, + "learning_rate": 6.329711070699162e-07, + "loss": 0.178, + "step": 11228 + }, + { + "epoch": 0.8896019013666072, + "grad_norm": 1.4321886833420465, + "learning_rate": 6.320729829399918e-07, + "loss": 0.2703, + "step": 11229 + }, + { + "epoch": 0.8896811249752427, + "grad_norm": 1.384799836562096, + "learning_rate": 6.311754756429833e-07, + "loss": 0.2262, + "step": 11230 + }, + { + "epoch": 0.889760348583878, + "grad_norm": 1.5089776676328601, + "learning_rate": 6.302785852379911e-07, + "loss": 0.3245, + "step": 11231 + }, + { + "epoch": 0.8898395721925134, + "grad_norm": 1.440779782303088, + "learning_rate": 6.293823117840703e-07, + "loss": 0.2533, + "step": 11232 + }, + { + "epoch": 0.8899187958011487, + "grad_norm": 1.3279719044597766, + "learning_rate": 6.284866553402347e-07, + "loss": 0.2717, + "step": 11233 + }, + { + "epoch": 0.8899980194097841, + "grad_norm": 1.3136427436009237, + "learning_rate": 6.275916159654616e-07, + "loss": 0.1951, + "step": 11234 + }, + { + "epoch": 0.8900772430184195, + "grad_norm": 1.3931944722997267, + "learning_rate": 6.266971937186827e-07, + "loss": 0.2134, + "step": 11235 + }, + { + "epoch": 0.8901564666270548, + "grad_norm": 1.833767100352713, + "learning_rate": 6.258033886587911e-07, + "loss": 0.2147, + "step": 11236 + }, + { + "epoch": 0.8902356902356903, + "grad_norm": 1.1945451950074149, + "learning_rate": 6.249102008446418e-07, + "loss": 0.1916, + "step": 11237 + }, + { + "epoch": 0.8903149138443256, + "grad_norm": 1.4529406545023558, + "learning_rate": 6.240176303350453e-07, + "loss": 0.243, + "step": 11238 + }, + { + "epoch": 0.890394137452961, + "grad_norm": 1.105851681039205, + "learning_rate": 6.231256771887739e-07, + "loss": 0.1409, + "step": 11239 + }, + { + "epoch": 0.8904733610615964, + "grad_norm": 1.2816397853825638, + "learning_rate": 6.222343414645571e-07, + "loss": 0.2286, + "step": 11240 + }, + { + "epoch": 0.8905525846702317, + "grad_norm": 1.2509633005763492, + "learning_rate": 6.213436232210868e-07, + "loss": 0.2671, + "step": 11241 + }, + { + "epoch": 0.8906318082788671, + "grad_norm": 1.7968282501337458, + "learning_rate": 6.204535225170116e-07, + "loss": 0.2696, + "step": 11242 + }, + { + "epoch": 0.8907110318875024, + "grad_norm": 1.3941002108600529, + "learning_rate": 6.195640394109393e-07, + "loss": 0.229, + "step": 11243 + }, + { + "epoch": 0.8907902554961379, + "grad_norm": 1.2665362964850582, + "learning_rate": 6.186751739614405e-07, + "loss": 0.2198, + "step": 11244 + }, + { + "epoch": 0.8908694791047732, + "grad_norm": 1.4517396183242446, + "learning_rate": 6.177869262270419e-07, + "loss": 0.1801, + "step": 11245 + }, + { + "epoch": 0.8909487027134086, + "grad_norm": 1.30729734339586, + "learning_rate": 6.168992962662279e-07, + "loss": 0.2201, + "step": 11246 + }, + { + "epoch": 0.891027926322044, + "grad_norm": 1.6369681960276554, + "learning_rate": 6.160122841374482e-07, + "loss": 0.3008, + "step": 11247 + }, + { + "epoch": 0.8911071499306793, + "grad_norm": 1.6392414945503126, + "learning_rate": 6.151258898991064e-07, + "loss": 0.3393, + "step": 11248 + }, + { + "epoch": 0.8911863735393147, + "grad_norm": 1.3285111060496306, + "learning_rate": 6.142401136095666e-07, + "loss": 0.2212, + "step": 11249 + }, + { + "epoch": 0.8912655971479501, + "grad_norm": 1.0393734254481144, + "learning_rate": 6.133549553271556e-07, + "loss": 0.159, + "step": 11250 + }, + { + "epoch": 0.8913448207565855, + "grad_norm": 1.1275538965744762, + "learning_rate": 6.124704151101546e-07, + "loss": 0.1616, + "step": 11251 + }, + { + "epoch": 0.8914240443652208, + "grad_norm": 1.359945567692041, + "learning_rate": 6.115864930168058e-07, + "loss": 0.2762, + "step": 11252 + }, + { + "epoch": 0.8915032679738563, + "grad_norm": 1.4855710788796759, + "learning_rate": 6.107031891053139e-07, + "loss": 0.2547, + "step": 11253 + }, + { + "epoch": 0.8915824915824916, + "grad_norm": 1.1614740301086464, + "learning_rate": 6.098205034338378e-07, + "loss": 0.1446, + "step": 11254 + }, + { + "epoch": 0.8916617151911269, + "grad_norm": 1.186898767952395, + "learning_rate": 6.089384360605e-07, + "loss": 0.2581, + "step": 11255 + }, + { + "epoch": 0.8917409387997624, + "grad_norm": 1.618945819690544, + "learning_rate": 6.080569870433773e-07, + "loss": 0.2041, + "step": 11256 + }, + { + "epoch": 0.8918201624083977, + "grad_norm": 1.4846299746216791, + "learning_rate": 6.071761564405121e-07, + "loss": 0.2572, + "step": 11257 + }, + { + "epoch": 0.8918993860170331, + "grad_norm": 1.367366346891495, + "learning_rate": 6.062959443099014e-07, + "loss": 0.2785, + "step": 11258 + }, + { + "epoch": 0.8919786096256684, + "grad_norm": 1.653177036324328, + "learning_rate": 6.054163507095035e-07, + "loss": 0.3113, + "step": 11259 + }, + { + "epoch": 0.8920578332343039, + "grad_norm": 1.1624161213576714, + "learning_rate": 6.04537375697235e-07, + "loss": 0.1656, + "step": 11260 + }, + { + "epoch": 0.8921370568429392, + "grad_norm": 1.4649129596611634, + "learning_rate": 6.036590193309711e-07, + "loss": 0.2073, + "step": 11261 + }, + { + "epoch": 0.8922162804515745, + "grad_norm": 1.2371825682247837, + "learning_rate": 6.027812816685497e-07, + "loss": 0.2097, + "step": 11262 + }, + { + "epoch": 0.89229550406021, + "grad_norm": 1.3852611306041136, + "learning_rate": 6.019041627677635e-07, + "loss": 0.213, + "step": 11263 + }, + { + "epoch": 0.8923747276688453, + "grad_norm": 1.4409802160224254, + "learning_rate": 6.010276626863687e-07, + "loss": 0.2515, + "step": 11264 + }, + { + "epoch": 0.8924539512774807, + "grad_norm": 1.5774758971813887, + "learning_rate": 6.001517814820757e-07, + "loss": 0.2279, + "step": 11265 + }, + { + "epoch": 0.892533174886116, + "grad_norm": 1.454816758733758, + "learning_rate": 5.992765192125594e-07, + "loss": 0.2403, + "step": 11266 + }, + { + "epoch": 0.8926123984947515, + "grad_norm": 1.441668957544932, + "learning_rate": 5.984018759354515e-07, + "loss": 0.2557, + "step": 11267 + }, + { + "epoch": 0.8926916221033868, + "grad_norm": 1.3576792572336833, + "learning_rate": 5.975278517083405e-07, + "loss": 0.2057, + "step": 11268 + }, + { + "epoch": 0.8927708457120221, + "grad_norm": 1.1483281862788846, + "learning_rate": 5.966544465887803e-07, + "loss": 0.1932, + "step": 11269 + }, + { + "epoch": 0.8928500693206576, + "grad_norm": 1.0860636050183037, + "learning_rate": 5.957816606342792e-07, + "loss": 0.1744, + "step": 11270 + }, + { + "epoch": 0.8929292929292929, + "grad_norm": 1.4043056715639777, + "learning_rate": 5.949094939023037e-07, + "loss": 0.2335, + "step": 11271 + }, + { + "epoch": 0.8930085165379283, + "grad_norm": 1.3232268237447964, + "learning_rate": 5.940379464502854e-07, + "loss": 0.1651, + "step": 11272 + }, + { + "epoch": 0.8930877401465637, + "grad_norm": 1.6890526154789487, + "learning_rate": 5.931670183356097e-07, + "loss": 0.2741, + "step": 11273 + }, + { + "epoch": 0.893166963755199, + "grad_norm": 1.044260339267375, + "learning_rate": 5.922967096156218e-07, + "loss": 0.1823, + "step": 11274 + }, + { + "epoch": 0.8932461873638344, + "grad_norm": 1.540836548776842, + "learning_rate": 5.914270203476291e-07, + "loss": 0.2923, + "step": 11275 + }, + { + "epoch": 0.8933254109724698, + "grad_norm": 1.2671417892062502, + "learning_rate": 5.90557950588897e-07, + "loss": 0.1631, + "step": 11276 + }, + { + "epoch": 0.8934046345811052, + "grad_norm": 1.436723689610927, + "learning_rate": 5.896895003966463e-07, + "loss": 0.2105, + "step": 11277 + }, + { + "epoch": 0.8934838581897405, + "grad_norm": 1.1905789304282068, + "learning_rate": 5.888216698280646e-07, + "loss": 0.1683, + "step": 11278 + }, + { + "epoch": 0.893563081798376, + "grad_norm": 1.2743115790041264, + "learning_rate": 5.879544589402919e-07, + "loss": 0.1936, + "step": 11279 + }, + { + "epoch": 0.8936423054070113, + "grad_norm": 1.7323363350840604, + "learning_rate": 5.870878677904302e-07, + "loss": 0.3622, + "step": 11280 + }, + { + "epoch": 0.8937215290156466, + "grad_norm": 1.5065999345055028, + "learning_rate": 5.862218964355382e-07, + "loss": 0.2784, + "step": 11281 + }, + { + "epoch": 0.893800752624282, + "grad_norm": 1.5383097990301149, + "learning_rate": 5.853565449326404e-07, + "loss": 0.2466, + "step": 11282 + }, + { + "epoch": 0.8938799762329174, + "grad_norm": 1.2400261136630268, + "learning_rate": 5.844918133387134e-07, + "loss": 0.199, + "step": 11283 + }, + { + "epoch": 0.8939591998415528, + "grad_norm": 1.2155040059420341, + "learning_rate": 5.836277017106951e-07, + "loss": 0.1675, + "step": 11284 + }, + { + "epoch": 0.8940384234501881, + "grad_norm": 1.2856551693108929, + "learning_rate": 5.827642101054854e-07, + "loss": 0.2081, + "step": 11285 + }, + { + "epoch": 0.8941176470588236, + "grad_norm": 1.6931471362544004, + "learning_rate": 5.819013385799388e-07, + "loss": 0.2524, + "step": 11286 + }, + { + "epoch": 0.8941968706674589, + "grad_norm": 1.5484010143534122, + "learning_rate": 5.810390871908711e-07, + "loss": 0.2477, + "step": 11287 + }, + { + "epoch": 0.8942760942760942, + "grad_norm": 1.310045657882032, + "learning_rate": 5.801774559950591e-07, + "loss": 0.1863, + "step": 11288 + }, + { + "epoch": 0.8943553178847297, + "grad_norm": 1.1867356211394389, + "learning_rate": 5.793164450492372e-07, + "loss": 0.1453, + "step": 11289 + }, + { + "epoch": 0.894434541493365, + "grad_norm": 1.8848513460708247, + "learning_rate": 5.784560544100959e-07, + "loss": 0.3078, + "step": 11290 + }, + { + "epoch": 0.8945137651020004, + "grad_norm": 1.2974320595761797, + "learning_rate": 5.775962841342919e-07, + "loss": 0.2261, + "step": 11291 + }, + { + "epoch": 0.8945929887106358, + "grad_norm": 1.6114914646046061, + "learning_rate": 5.767371342784345e-07, + "loss": 0.2344, + "step": 11292 + }, + { + "epoch": 0.8946722123192712, + "grad_norm": 1.5415672994218592, + "learning_rate": 5.758786048990939e-07, + "loss": 0.2413, + "step": 11293 + }, + { + "epoch": 0.8947514359279065, + "grad_norm": 1.1072199847719513, + "learning_rate": 5.750206960528027e-07, + "loss": 0.1888, + "step": 11294 + }, + { + "epoch": 0.8948306595365418, + "grad_norm": 1.5734904174558717, + "learning_rate": 5.741634077960479e-07, + "loss": 0.2771, + "step": 11295 + }, + { + "epoch": 0.8949098831451773, + "grad_norm": 1.2777103962039311, + "learning_rate": 5.733067401852788e-07, + "loss": 0.2692, + "step": 11296 + }, + { + "epoch": 0.8949891067538126, + "grad_norm": 1.2976147376667901, + "learning_rate": 5.724506932769014e-07, + "loss": 0.2269, + "step": 11297 + }, + { + "epoch": 0.895068330362448, + "grad_norm": 1.1156311606426472, + "learning_rate": 5.71595267127284e-07, + "loss": 0.1258, + "step": 11298 + }, + { + "epoch": 0.8951475539710834, + "grad_norm": 1.3322417333742287, + "learning_rate": 5.707404617927526e-07, + "loss": 0.1839, + "step": 11299 + }, + { + "epoch": 0.8952267775797188, + "grad_norm": 1.3975783059514086, + "learning_rate": 5.698862773295888e-07, + "loss": 0.2276, + "step": 11300 + }, + { + "epoch": 0.8953060011883541, + "grad_norm": 1.426479682980726, + "learning_rate": 5.69032713794041e-07, + "loss": 0.226, + "step": 11301 + }, + { + "epoch": 0.8953852247969895, + "grad_norm": 1.1900460493453853, + "learning_rate": 5.681797712423099e-07, + "loss": 0.1541, + "step": 11302 + }, + { + "epoch": 0.8954644484056249, + "grad_norm": 1.1942963362216403, + "learning_rate": 5.673274497305559e-07, + "loss": 0.1883, + "step": 11303 + }, + { + "epoch": 0.8955436720142602, + "grad_norm": 1.4018705923639465, + "learning_rate": 5.664757493149042e-07, + "loss": 0.2837, + "step": 11304 + }, + { + "epoch": 0.8956228956228957, + "grad_norm": 1.7135132582349892, + "learning_rate": 5.656246700514323e-07, + "loss": 0.2488, + "step": 11305 + }, + { + "epoch": 0.895702119231531, + "grad_norm": 1.4224352111800682, + "learning_rate": 5.647742119961797e-07, + "loss": 0.2305, + "step": 11306 + }, + { + "epoch": 0.8957813428401664, + "grad_norm": 1.1836426972003882, + "learning_rate": 5.639243752051482e-07, + "loss": 0.1725, + "step": 11307 + }, + { + "epoch": 0.8958605664488017, + "grad_norm": 1.2796218846211957, + "learning_rate": 5.630751597342921e-07, + "loss": 0.1904, + "step": 11308 + }, + { + "epoch": 0.8959397900574371, + "grad_norm": 1.2689183194157867, + "learning_rate": 5.622265656395276e-07, + "loss": 0.1936, + "step": 11309 + }, + { + "epoch": 0.8960190136660725, + "grad_norm": 1.483024624501899, + "learning_rate": 5.613785929767335e-07, + "loss": 0.2514, + "step": 11310 + }, + { + "epoch": 0.8960982372747078, + "grad_norm": 1.6310744359522076, + "learning_rate": 5.605312418017439e-07, + "loss": 0.3105, + "step": 11311 + }, + { + "epoch": 0.8961774608833433, + "grad_norm": 1.2218020080492402, + "learning_rate": 5.59684512170352e-07, + "loss": 0.2179, + "step": 11312 + }, + { + "epoch": 0.8962566844919786, + "grad_norm": 1.2910209724626072, + "learning_rate": 5.588384041383089e-07, + "loss": 0.1773, + "step": 11313 + }, + { + "epoch": 0.896335908100614, + "grad_norm": 1.406795183977652, + "learning_rate": 5.579929177613308e-07, + "loss": 0.2849, + "step": 11314 + }, + { + "epoch": 0.8964151317092494, + "grad_norm": 1.235775171695432, + "learning_rate": 5.571480530950879e-07, + "loss": 0.1988, + "step": 11315 + }, + { + "epoch": 0.8964943553178847, + "grad_norm": 1.4740244839958896, + "learning_rate": 5.563038101952067e-07, + "loss": 0.2841, + "step": 11316 + }, + { + "epoch": 0.8965735789265201, + "grad_norm": 1.1070878368035737, + "learning_rate": 5.554601891172817e-07, + "loss": 0.1612, + "step": 11317 + }, + { + "epoch": 0.8966528025351554, + "grad_norm": 1.3406334628096848, + "learning_rate": 5.546171899168595e-07, + "loss": 0.2282, + "step": 11318 + }, + { + "epoch": 0.8967320261437909, + "grad_norm": 1.6442479065037026, + "learning_rate": 5.537748126494446e-07, + "loss": 0.2977, + "step": 11319 + }, + { + "epoch": 0.8968112497524262, + "grad_norm": 1.072812409772159, + "learning_rate": 5.529330573705083e-07, + "loss": 0.1704, + "step": 11320 + }, + { + "epoch": 0.8968904733610616, + "grad_norm": 1.3470759345771661, + "learning_rate": 5.520919241354728e-07, + "loss": 0.2299, + "step": 11321 + }, + { + "epoch": 0.896969696969697, + "grad_norm": 1.3190738239635866, + "learning_rate": 5.512514129997227e-07, + "loss": 0.2375, + "step": 11322 + }, + { + "epoch": 0.8970489205783323, + "grad_norm": 1.275169964574663, + "learning_rate": 5.504115240186048e-07, + "loss": 0.1591, + "step": 11323 + }, + { + "epoch": 0.8971281441869677, + "grad_norm": 1.3796100413778525, + "learning_rate": 5.495722572474183e-07, + "loss": 0.2421, + "step": 11324 + }, + { + "epoch": 0.8972073677956031, + "grad_norm": 1.3259854659581767, + "learning_rate": 5.487336127414267e-07, + "loss": 0.2102, + "step": 11325 + }, + { + "epoch": 0.8972865914042385, + "grad_norm": 1.3242590469525624, + "learning_rate": 5.478955905558491e-07, + "loss": 0.2455, + "step": 11326 + }, + { + "epoch": 0.8973658150128738, + "grad_norm": 1.106817363897461, + "learning_rate": 5.470581907458672e-07, + "loss": 0.156, + "step": 11327 + }, + { + "epoch": 0.8974450386215093, + "grad_norm": 1.2317653774955672, + "learning_rate": 5.462214133666189e-07, + "loss": 0.1671, + "step": 11328 + }, + { + "epoch": 0.8975242622301446, + "grad_norm": 1.4126933902094934, + "learning_rate": 5.453852584732e-07, + "loss": 0.2161, + "step": 11329 + }, + { + "epoch": 0.8976034858387799, + "grad_norm": 1.1733881551598588, + "learning_rate": 5.4454972612067e-07, + "loss": 0.1817, + "step": 11330 + }, + { + "epoch": 0.8976827094474154, + "grad_norm": 1.3312278010518537, + "learning_rate": 5.437148163640449e-07, + "loss": 0.2291, + "step": 11331 + }, + { + "epoch": 0.8977619330560507, + "grad_norm": 1.7906696721780908, + "learning_rate": 5.428805292582973e-07, + "loss": 0.2741, + "step": 11332 + }, + { + "epoch": 0.8978411566646861, + "grad_norm": 1.1303085992557462, + "learning_rate": 5.420468648583621e-07, + "loss": 0.1477, + "step": 11333 + }, + { + "epoch": 0.8979203802733214, + "grad_norm": 1.2503010189911672, + "learning_rate": 5.412138232191333e-07, + "loss": 0.1767, + "step": 11334 + }, + { + "epoch": 0.8979996038819569, + "grad_norm": 1.5633845706763296, + "learning_rate": 5.403814043954592e-07, + "loss": 0.2378, + "step": 11335 + }, + { + "epoch": 0.8980788274905922, + "grad_norm": 1.1167936422622222, + "learning_rate": 5.39549608442157e-07, + "loss": 0.1331, + "step": 11336 + }, + { + "epoch": 0.8981580510992275, + "grad_norm": 1.1267852080403595, + "learning_rate": 5.387184354139896e-07, + "loss": 0.136, + "step": 11337 + }, + { + "epoch": 0.898237274707863, + "grad_norm": 1.1560933617445144, + "learning_rate": 5.378878853656877e-07, + "loss": 0.1894, + "step": 11338 + }, + { + "epoch": 0.8983164983164983, + "grad_norm": 1.526294067325196, + "learning_rate": 5.370579583519409e-07, + "loss": 0.2311, + "step": 11339 + }, + { + "epoch": 0.8983957219251337, + "grad_norm": 1.3672258240134598, + "learning_rate": 5.362286544273942e-07, + "loss": 0.1984, + "step": 11340 + }, + { + "epoch": 0.898474945533769, + "grad_norm": 1.203141230594665, + "learning_rate": 5.353999736466531e-07, + "loss": 0.1692, + "step": 11341 + }, + { + "epoch": 0.8985541691424045, + "grad_norm": 0.9958655829035286, + "learning_rate": 5.345719160642848e-07, + "loss": 0.175, + "step": 11342 + }, + { + "epoch": 0.8986333927510398, + "grad_norm": 1.073971393542406, + "learning_rate": 5.337444817348103e-07, + "loss": 0.128, + "step": 11343 + }, + { + "epoch": 0.8987126163596751, + "grad_norm": 1.2378900609734436, + "learning_rate": 5.329176707127115e-07, + "loss": 0.2107, + "step": 11344 + }, + { + "epoch": 0.8987918399683106, + "grad_norm": 1.222133507604698, + "learning_rate": 5.320914830524337e-07, + "loss": 0.1871, + "step": 11345 + }, + { + "epoch": 0.8988710635769459, + "grad_norm": 1.098244225356646, + "learning_rate": 5.312659188083746e-07, + "loss": 0.1445, + "step": 11346 + }, + { + "epoch": 0.8989502871855813, + "grad_norm": 1.3675979063567736, + "learning_rate": 5.304409780348919e-07, + "loss": 0.2535, + "step": 11347 + }, + { + "epoch": 0.8990295107942167, + "grad_norm": 1.2724222370788936, + "learning_rate": 5.296166607863085e-07, + "loss": 0.2551, + "step": 11348 + }, + { + "epoch": 0.899108734402852, + "grad_norm": 1.203831052219527, + "learning_rate": 5.287929671168989e-07, + "loss": 0.1477, + "step": 11349 + }, + { + "epoch": 0.8991879580114874, + "grad_norm": 1.317295899865152, + "learning_rate": 5.279698970809011e-07, + "loss": 0.2385, + "step": 11350 + }, + { + "epoch": 0.8992671816201228, + "grad_norm": 1.326966316900384, + "learning_rate": 5.271474507325058e-07, + "loss": 0.21, + "step": 11351 + }, + { + "epoch": 0.8993464052287582, + "grad_norm": 1.2145670078554756, + "learning_rate": 5.263256281258733e-07, + "loss": 0.1637, + "step": 11352 + }, + { + "epoch": 0.8994256288373935, + "grad_norm": 1.4211902363494058, + "learning_rate": 5.255044293151135e-07, + "loss": 0.2873, + "step": 11353 + }, + { + "epoch": 0.899504852446029, + "grad_norm": 1.3854266328194298, + "learning_rate": 5.246838543542964e-07, + "loss": 0.2233, + "step": 11354 + }, + { + "epoch": 0.8995840760546643, + "grad_norm": 1.4085883118439881, + "learning_rate": 5.23863903297458e-07, + "loss": 0.1979, + "step": 11355 + }, + { + "epoch": 0.8996632996632996, + "grad_norm": 1.4769738598097448, + "learning_rate": 5.230445761985836e-07, + "loss": 0.2596, + "step": 11356 + }, + { + "epoch": 0.899742523271935, + "grad_norm": 1.6099820062460852, + "learning_rate": 5.222258731116237e-07, + "loss": 0.2623, + "step": 11357 + }, + { + "epoch": 0.8998217468805704, + "grad_norm": 1.3185848590239624, + "learning_rate": 5.214077940904872e-07, + "loss": 0.2258, + "step": 11358 + }, + { + "epoch": 0.8999009704892058, + "grad_norm": 1.6162709943845397, + "learning_rate": 5.205903391890387e-07, + "loss": 0.2308, + "step": 11359 + }, + { + "epoch": 0.8999801940978411, + "grad_norm": 1.070711772449832, + "learning_rate": 5.197735084611033e-07, + "loss": 0.1879, + "step": 11360 + }, + { + "epoch": 0.9000594177064766, + "grad_norm": 1.219055837136359, + "learning_rate": 5.189573019604676e-07, + "loss": 0.1758, + "step": 11361 + }, + { + "epoch": 0.9001386413151119, + "grad_norm": 1.537556658752876, + "learning_rate": 5.181417197408733e-07, + "loss": 0.2714, + "step": 11362 + }, + { + "epoch": 0.9002178649237472, + "grad_norm": 1.2285730873010796, + "learning_rate": 5.173267618560229e-07, + "loss": 0.2427, + "step": 11363 + }, + { + "epoch": 0.9002970885323827, + "grad_norm": 0.9861842337554035, + "learning_rate": 5.165124283595779e-07, + "loss": 0.1185, + "step": 11364 + }, + { + "epoch": 0.900376312141018, + "grad_norm": 1.5344212455510402, + "learning_rate": 5.156987193051577e-07, + "loss": 0.2746, + "step": 11365 + }, + { + "epoch": 0.9004555357496534, + "grad_norm": 1.408660037451225, + "learning_rate": 5.148856347463416e-07, + "loss": 0.1962, + "step": 11366 + }, + { + "epoch": 0.9005347593582887, + "grad_norm": 1.4697398999513414, + "learning_rate": 5.140731747366656e-07, + "loss": 0.2069, + "step": 11367 + }, + { + "epoch": 0.9006139829669242, + "grad_norm": 1.2461685686705504, + "learning_rate": 5.132613393296293e-07, + "loss": 0.1997, + "step": 11368 + }, + { + "epoch": 0.9006932065755595, + "grad_norm": 1.3460139831610238, + "learning_rate": 5.124501285786865e-07, + "loss": 0.1745, + "step": 11369 + }, + { + "epoch": 0.9007724301841948, + "grad_norm": 1.5533842665404258, + "learning_rate": 5.1163954253725e-07, + "loss": 0.2504, + "step": 11370 + }, + { + "epoch": 0.9008516537928303, + "grad_norm": 1.2402129098705812, + "learning_rate": 5.108295812586961e-07, + "loss": 0.2011, + "step": 11371 + }, + { + "epoch": 0.9009308774014656, + "grad_norm": 1.620876653689994, + "learning_rate": 5.100202447963553e-07, + "loss": 0.248, + "step": 11372 + }, + { + "epoch": 0.901010101010101, + "grad_norm": 1.5318757161205356, + "learning_rate": 5.092115332035163e-07, + "loss": 0.2377, + "step": 11373 + }, + { + "epoch": 0.9010893246187364, + "grad_norm": 1.387867466782114, + "learning_rate": 5.084034465334342e-07, + "loss": 0.1932, + "step": 11374 + }, + { + "epoch": 0.9011685482273718, + "grad_norm": 1.3972242720614534, + "learning_rate": 5.07595984839313e-07, + "loss": 0.2267, + "step": 11375 + }, + { + "epoch": 0.9012477718360071, + "grad_norm": 1.2386671868902543, + "learning_rate": 5.067891481743203e-07, + "loss": 0.1836, + "step": 11376 + }, + { + "epoch": 0.9013269954446425, + "grad_norm": 1.6107819433048094, + "learning_rate": 5.059829365915859e-07, + "loss": 0.3054, + "step": 11377 + }, + { + "epoch": 0.9014062190532779, + "grad_norm": 1.5661555145482802, + "learning_rate": 5.051773501441926e-07, + "loss": 0.2306, + "step": 11378 + }, + { + "epoch": 0.9014854426619132, + "grad_norm": 1.3683896645436369, + "learning_rate": 5.043723888851837e-07, + "loss": 0.2496, + "step": 11379 + }, + { + "epoch": 0.9015646662705487, + "grad_norm": 1.587397155778934, + "learning_rate": 5.035680528675635e-07, + "loss": 0.2659, + "step": 11380 + }, + { + "epoch": 0.901643889879184, + "grad_norm": 1.6449214395639375, + "learning_rate": 5.027643421442929e-07, + "loss": 0.2473, + "step": 11381 + }, + { + "epoch": 0.9017231134878194, + "grad_norm": 1.5590032711446369, + "learning_rate": 5.01961256768293e-07, + "loss": 0.1907, + "step": 11382 + }, + { + "epoch": 0.9018023370964547, + "grad_norm": 1.2802405210780758, + "learning_rate": 5.011587967924414e-07, + "loss": 0.1234, + "step": 11383 + }, + { + "epoch": 0.9018815607050901, + "grad_norm": 1.098312871652314, + "learning_rate": 5.003569622695792e-07, + "loss": 0.1402, + "step": 11384 + }, + { + "epoch": 0.9019607843137255, + "grad_norm": 1.2568472561837125, + "learning_rate": 4.99555753252502e-07, + "loss": 0.2283, + "step": 11385 + }, + { + "epoch": 0.9020400079223608, + "grad_norm": 1.33091756087352, + "learning_rate": 4.987551697939629e-07, + "loss": 0.2554, + "step": 11386 + }, + { + "epoch": 0.9021192315309963, + "grad_norm": 1.0895109755664834, + "learning_rate": 4.979552119466802e-07, + "loss": 0.1598, + "step": 11387 + }, + { + "epoch": 0.9021984551396316, + "grad_norm": 1.3668570458311966, + "learning_rate": 4.971558797633258e-07, + "loss": 0.2479, + "step": 11388 + }, + { + "epoch": 0.902277678748267, + "grad_norm": 1.2760477458937718, + "learning_rate": 4.963571732965311e-07, + "loss": 0.197, + "step": 11389 + }, + { + "epoch": 0.9023569023569024, + "grad_norm": 1.6020991949566248, + "learning_rate": 4.955590925988896e-07, + "loss": 0.3074, + "step": 11390 + }, + { + "epoch": 0.9024361259655377, + "grad_norm": 1.7250715360888407, + "learning_rate": 4.947616377229492e-07, + "loss": 0.2336, + "step": 11391 + }, + { + "epoch": 0.9025153495741731, + "grad_norm": 1.488642712588369, + "learning_rate": 4.939648087212168e-07, + "loss": 0.2536, + "step": 11392 + }, + { + "epoch": 0.9025945731828084, + "grad_norm": 1.5629289950185032, + "learning_rate": 4.931686056461626e-07, + "loss": 0.2336, + "step": 11393 + }, + { + "epoch": 0.9026737967914439, + "grad_norm": 1.583776097965649, + "learning_rate": 4.923730285502126e-07, + "loss": 0.2509, + "step": 11394 + }, + { + "epoch": 0.9027530204000792, + "grad_norm": 1.5154665756283534, + "learning_rate": 4.915780774857504e-07, + "loss": 0.2884, + "step": 11395 + }, + { + "epoch": 0.9028322440087146, + "grad_norm": 1.2517962124246766, + "learning_rate": 4.907837525051196e-07, + "loss": 0.1816, + "step": 11396 + }, + { + "epoch": 0.90291146761735, + "grad_norm": 1.5462027095360786, + "learning_rate": 4.89990053660624e-07, + "loss": 0.3189, + "step": 11397 + }, + { + "epoch": 0.9029906912259853, + "grad_norm": 1.4039947543600597, + "learning_rate": 4.891969810045239e-07, + "loss": 0.2237, + "step": 11398 + }, + { + "epoch": 0.9030699148346207, + "grad_norm": 1.3042630988143886, + "learning_rate": 4.884045345890387e-07, + "loss": 0.2005, + "step": 11399 + }, + { + "epoch": 0.9031491384432561, + "grad_norm": 1.479450632462305, + "learning_rate": 4.87612714466349e-07, + "loss": 0.3089, + "step": 11400 + }, + { + "epoch": 0.9032283620518915, + "grad_norm": 1.5594688532818757, + "learning_rate": 4.868215206885918e-07, + "loss": 0.2101, + "step": 11401 + }, + { + "epoch": 0.9033075856605268, + "grad_norm": 1.0299462844120528, + "learning_rate": 4.860309533078611e-07, + "loss": 0.1274, + "step": 11402 + }, + { + "epoch": 0.9033868092691623, + "grad_norm": 1.9166456177510156, + "learning_rate": 4.852410123762164e-07, + "loss": 0.3007, + "step": 11403 + }, + { + "epoch": 0.9034660328777976, + "grad_norm": 1.338086690352521, + "learning_rate": 4.844516979456671e-07, + "loss": 0.2041, + "step": 11404 + }, + { + "epoch": 0.9035452564864329, + "grad_norm": 1.2322799528405464, + "learning_rate": 4.836630100681872e-07, + "loss": 0.1688, + "step": 11405 + }, + { + "epoch": 0.9036244800950683, + "grad_norm": 1.6135528697994006, + "learning_rate": 4.828749487957097e-07, + "loss": 0.2882, + "step": 11406 + }, + { + "epoch": 0.9037037037037037, + "grad_norm": 1.1655779112500846, + "learning_rate": 4.82087514180124e-07, + "loss": 0.1856, + "step": 11407 + }, + { + "epoch": 0.9037829273123391, + "grad_norm": 1.4718796388116533, + "learning_rate": 4.813007062732756e-07, + "loss": 0.2251, + "step": 11408 + }, + { + "epoch": 0.9038621509209744, + "grad_norm": 1.0698800138197295, + "learning_rate": 4.805145251269772e-07, + "loss": 0.1676, + "step": 11409 + }, + { + "epoch": 0.9039413745296099, + "grad_norm": 1.1870476041713254, + "learning_rate": 4.797289707929919e-07, + "loss": 0.168, + "step": 11410 + }, + { + "epoch": 0.9040205981382452, + "grad_norm": 1.269900936575648, + "learning_rate": 4.789440433230452e-07, + "loss": 0.2004, + "step": 11411 + }, + { + "epoch": 0.9040998217468805, + "grad_norm": 1.315033862582558, + "learning_rate": 4.781597427688189e-07, + "loss": 0.1933, + "step": 11412 + }, + { + "epoch": 0.904179045355516, + "grad_norm": 1.3314371072151376, + "learning_rate": 4.773760691819596e-07, + "loss": 0.2066, + "step": 11413 + }, + { + "epoch": 0.9042582689641513, + "grad_norm": 1.6267462378265747, + "learning_rate": 4.765930226140658e-07, + "loss": 0.2276, + "step": 11414 + }, + { + "epoch": 0.9043374925727867, + "grad_norm": 1.5170199275185874, + "learning_rate": 4.7581060311669757e-07, + "loss": 0.1854, + "step": 11415 + }, + { + "epoch": 0.904416716181422, + "grad_norm": 1.2337378966475265, + "learning_rate": 4.7502881074137476e-07, + "loss": 0.2203, + "step": 11416 + }, + { + "epoch": 0.9044959397900575, + "grad_norm": 1.279520330318968, + "learning_rate": 4.742476455395706e-07, + "loss": 0.2192, + "step": 11417 + }, + { + "epoch": 0.9045751633986928, + "grad_norm": 1.2879903944159783, + "learning_rate": 4.734671075627262e-07, + "loss": 0.1888, + "step": 11418 + }, + { + "epoch": 0.9046543870073281, + "grad_norm": 1.349526012663776, + "learning_rate": 4.726871968622337e-07, + "loss": 0.2327, + "step": 11419 + }, + { + "epoch": 0.9047336106159636, + "grad_norm": 1.3792906672124408, + "learning_rate": 4.7190791348944777e-07, + "loss": 0.2125, + "step": 11420 + }, + { + "epoch": 0.9048128342245989, + "grad_norm": 0.9746645782948918, + "learning_rate": 4.711292574956772e-07, + "loss": 0.1673, + "step": 11421 + }, + { + "epoch": 0.9048920578332343, + "grad_norm": 1.625455595410993, + "learning_rate": 4.7035122893219653e-07, + "loss": 0.2276, + "step": 11422 + }, + { + "epoch": 0.9049712814418697, + "grad_norm": 1.0555706634774598, + "learning_rate": 4.695738278502338e-07, + "loss": 0.151, + "step": 11423 + }, + { + "epoch": 0.9050505050505051, + "grad_norm": 1.5723002557247647, + "learning_rate": 4.6879705430097566e-07, + "loss": 0.2218, + "step": 11424 + }, + { + "epoch": 0.9051297286591404, + "grad_norm": 1.3062239306314902, + "learning_rate": 4.6802090833557136e-07, + "loss": 0.2295, + "step": 11425 + }, + { + "epoch": 0.9052089522677758, + "grad_norm": 0.9010926330492873, + "learning_rate": 4.6724539000512546e-07, + "loss": 0.1112, + "step": 11426 + }, + { + "epoch": 0.9052881758764112, + "grad_norm": 1.2737771223156693, + "learning_rate": 4.6647049936070054e-07, + "loss": 0.183, + "step": 11427 + }, + { + "epoch": 0.9053673994850465, + "grad_norm": 1.142861898741502, + "learning_rate": 4.656962364533224e-07, + "loss": 0.1404, + "step": 11428 + }, + { + "epoch": 0.905446623093682, + "grad_norm": 1.1701301630882575, + "learning_rate": 4.649226013339703e-07, + "loss": 0.2177, + "step": 11429 + }, + { + "epoch": 0.9055258467023173, + "grad_norm": 1.1783358495937835, + "learning_rate": 4.641495940535845e-07, + "loss": 0.1904, + "step": 11430 + }, + { + "epoch": 0.9056050703109526, + "grad_norm": 1.3653305134647453, + "learning_rate": 4.633772146630655e-07, + "loss": 0.2385, + "step": 11431 + }, + { + "epoch": 0.905684293919588, + "grad_norm": 1.5685578557173798, + "learning_rate": 4.626054632132693e-07, + "loss": 0.2705, + "step": 11432 + }, + { + "epoch": 0.9057635175282234, + "grad_norm": 1.5574355920238032, + "learning_rate": 4.6183433975501067e-07, + "loss": 0.1969, + "step": 11433 + }, + { + "epoch": 0.9058427411368588, + "grad_norm": 1.2512610793744947, + "learning_rate": 4.61063844339068e-07, + "loss": 0.1829, + "step": 11434 + }, + { + "epoch": 0.9059219647454941, + "grad_norm": 1.5736830496131415, + "learning_rate": 4.6029397701617296e-07, + "loss": 0.2147, + "step": 11435 + }, + { + "epoch": 0.9060011883541296, + "grad_norm": 1.8558708006423443, + "learning_rate": 4.595247378370171e-07, + "loss": 0.3252, + "step": 11436 + }, + { + "epoch": 0.9060804119627649, + "grad_norm": 1.1974188610541265, + "learning_rate": 4.5875612685225e-07, + "loss": 0.1582, + "step": 11437 + }, + { + "epoch": 0.9061596355714002, + "grad_norm": 1.329125266802523, + "learning_rate": 4.5798814411248336e-07, + "loss": 0.1838, + "step": 11438 + }, + { + "epoch": 0.9062388591800357, + "grad_norm": 1.3581890926221736, + "learning_rate": 4.5722078966828455e-07, + "loss": 0.2199, + "step": 11439 + }, + { + "epoch": 0.906318082788671, + "grad_norm": 1.5281357014004326, + "learning_rate": 4.5645406357017865e-07, + "loss": 0.2612, + "step": 11440 + }, + { + "epoch": 0.9063973063973064, + "grad_norm": 1.3358689675729154, + "learning_rate": 4.5568796586865304e-07, + "loss": 0.1903, + "step": 11441 + }, + { + "epoch": 0.9064765300059417, + "grad_norm": 1.6621730605961096, + "learning_rate": 4.5492249661415077e-07, + "loss": 0.2606, + "step": 11442 + }, + { + "epoch": 0.9065557536145772, + "grad_norm": 1.4293273923108951, + "learning_rate": 4.541576558570726e-07, + "loss": 0.2354, + "step": 11443 + }, + { + "epoch": 0.9066349772232125, + "grad_norm": 1.2242375786311315, + "learning_rate": 4.533934436477827e-07, + "loss": 0.1627, + "step": 11444 + }, + { + "epoch": 0.9067142008318478, + "grad_norm": 1.3474590617236502, + "learning_rate": 4.526298600365997e-07, + "loss": 0.2429, + "step": 11445 + }, + { + "epoch": 0.9067934244404833, + "grad_norm": 1.2770921670065432, + "learning_rate": 4.5186690507379894e-07, + "loss": 0.1935, + "step": 11446 + }, + { + "epoch": 0.9068726480491186, + "grad_norm": 1.1095923965120869, + "learning_rate": 4.5110457880962246e-07, + "loss": 0.1302, + "step": 11447 + }, + { + "epoch": 0.906951871657754, + "grad_norm": 1.317481680057043, + "learning_rate": 4.503428812942623e-07, + "loss": 0.2187, + "step": 11448 + }, + { + "epoch": 0.9070310952663894, + "grad_norm": 1.8330943823497057, + "learning_rate": 4.495818125778717e-07, + "loss": 0.2661, + "step": 11449 + }, + { + "epoch": 0.9071103188750248, + "grad_norm": 1.2650905901969383, + "learning_rate": 4.488213727105672e-07, + "loss": 0.147, + "step": 11450 + }, + { + "epoch": 0.9071895424836601, + "grad_norm": 1.5642395395973636, + "learning_rate": 4.4806156174241776e-07, + "loss": 0.2679, + "step": 11451 + }, + { + "epoch": 0.9072687660922955, + "grad_norm": 1.1922316840883551, + "learning_rate": 4.4730237972345326e-07, + "loss": 0.1472, + "step": 11452 + }, + { + "epoch": 0.9073479897009309, + "grad_norm": 1.172952372995457, + "learning_rate": 4.465438267036604e-07, + "loss": 0.2279, + "step": 11453 + }, + { + "epoch": 0.9074272133095662, + "grad_norm": 1.264911390030226, + "learning_rate": 4.4578590273299027e-07, + "loss": 0.1922, + "step": 11454 + }, + { + "epoch": 0.9075064369182017, + "grad_norm": 1.5933968405366234, + "learning_rate": 4.4502860786134747e-07, + "loss": 0.195, + "step": 11455 + }, + { + "epoch": 0.907585660526837, + "grad_norm": 1.5812907823759608, + "learning_rate": 4.4427194213859216e-07, + "loss": 0.2335, + "step": 11456 + }, + { + "epoch": 0.9076648841354724, + "grad_norm": 1.4334218364671192, + "learning_rate": 4.435159056145533e-07, + "loss": 0.1957, + "step": 11457 + }, + { + "epoch": 0.9077441077441077, + "grad_norm": 1.2027290573049838, + "learning_rate": 4.427604983390077e-07, + "loss": 0.2466, + "step": 11458 + }, + { + "epoch": 0.9078233313527431, + "grad_norm": 1.6145628015650884, + "learning_rate": 4.420057203616956e-07, + "loss": 0.2571, + "step": 11459 + }, + { + "epoch": 0.9079025549613785, + "grad_norm": 0.9934166053794611, + "learning_rate": 4.4125157173231847e-07, + "loss": 0.1174, + "step": 11460 + }, + { + "epoch": 0.9079817785700138, + "grad_norm": 1.4222208978210948, + "learning_rate": 4.40498052500532e-07, + "loss": 0.2629, + "step": 11461 + }, + { + "epoch": 0.9080610021786493, + "grad_norm": 1.4192110266480258, + "learning_rate": 4.397451627159499e-07, + "loss": 0.186, + "step": 11462 + }, + { + "epoch": 0.9081402257872846, + "grad_norm": 1.746785174805083, + "learning_rate": 4.389929024281492e-07, + "loss": 0.2759, + "step": 11463 + }, + { + "epoch": 0.90821944939592, + "grad_norm": 1.3220487872127502, + "learning_rate": 4.382412716866602e-07, + "loss": 0.207, + "step": 11464 + }, + { + "epoch": 0.9082986730045554, + "grad_norm": 1.2338070256531364, + "learning_rate": 4.374902705409745e-07, + "loss": 0.2041, + "step": 11465 + }, + { + "epoch": 0.9083778966131907, + "grad_norm": 1.0736160557902221, + "learning_rate": 4.367398990405447e-07, + "loss": 0.157, + "step": 11466 + }, + { + "epoch": 0.9084571202218261, + "grad_norm": 1.2817395609640396, + "learning_rate": 4.359901572347758e-07, + "loss": 0.2443, + "step": 11467 + }, + { + "epoch": 0.9085363438304614, + "grad_norm": 1.1816610246249397, + "learning_rate": 4.3524104517303714e-07, + "loss": 0.1717, + "step": 11468 + }, + { + "epoch": 0.9086155674390969, + "grad_norm": 1.7560676930716415, + "learning_rate": 4.3449256290465035e-07, + "loss": 0.2375, + "step": 11469 + }, + { + "epoch": 0.9086947910477322, + "grad_norm": 1.2582724501020333, + "learning_rate": 4.3374471047890497e-07, + "loss": 0.2147, + "step": 11470 + }, + { + "epoch": 0.9087740146563676, + "grad_norm": 1.418905410575443, + "learning_rate": 4.329974879450394e-07, + "loss": 0.2459, + "step": 11471 + }, + { + "epoch": 0.908853238265003, + "grad_norm": 1.342316449061536, + "learning_rate": 4.3225089535225415e-07, + "loss": 0.2317, + "step": 11472 + }, + { + "epoch": 0.9089324618736383, + "grad_norm": 1.1916797329317954, + "learning_rate": 4.3150493274971227e-07, + "loss": 0.2177, + "step": 11473 + }, + { + "epoch": 0.9090116854822737, + "grad_norm": 1.0863981432213492, + "learning_rate": 4.3075960018652995e-07, + "loss": 0.1758, + "step": 11474 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 1.3191141858103075, + "learning_rate": 4.300148977117824e-07, + "loss": 0.2255, + "step": 11475 + }, + { + "epoch": 0.9091701326995445, + "grad_norm": 1.2096794936794457, + "learning_rate": 4.2927082537450705e-07, + "loss": 0.1697, + "step": 11476 + }, + { + "epoch": 0.9092493563081798, + "grad_norm": 1.2901020677158936, + "learning_rate": 4.285273832236969e-07, + "loss": 0.1951, + "step": 11477 + }, + { + "epoch": 0.9093285799168153, + "grad_norm": 1.5552055235717805, + "learning_rate": 4.277845713083018e-07, + "loss": 0.2683, + "step": 11478 + }, + { + "epoch": 0.9094078035254506, + "grad_norm": 1.485532598975266, + "learning_rate": 4.2704238967723574e-07, + "loss": 0.2339, + "step": 11479 + }, + { + "epoch": 0.9094870271340859, + "grad_norm": 1.2712454608590502, + "learning_rate": 4.2630083837936654e-07, + "loss": 0.1791, + "step": 11480 + }, + { + "epoch": 0.9095662507427213, + "grad_norm": 1.352239512518404, + "learning_rate": 4.2555991746352054e-07, + "loss": 0.2581, + "step": 11481 + }, + { + "epoch": 0.9096454743513567, + "grad_norm": 1.6863382835731167, + "learning_rate": 4.2481962697848323e-07, + "loss": 0.2527, + "step": 11482 + }, + { + "epoch": 0.9097246979599921, + "grad_norm": 1.5150735796303765, + "learning_rate": 4.240799669730034e-07, + "loss": 0.2929, + "step": 11483 + }, + { + "epoch": 0.9098039215686274, + "grad_norm": 1.4252381334508766, + "learning_rate": 4.2334093749577975e-07, + "loss": 0.2147, + "step": 11484 + }, + { + "epoch": 0.9098831451772629, + "grad_norm": 1.2594869124074355, + "learning_rate": 4.226025385954746e-07, + "loss": 0.1932, + "step": 11485 + }, + { + "epoch": 0.9099623687858982, + "grad_norm": 1.092966590490661, + "learning_rate": 4.218647703207113e-07, + "loss": 0.1283, + "step": 11486 + }, + { + "epoch": 0.9100415923945335, + "grad_norm": 1.3090466555396847, + "learning_rate": 4.211276327200642e-07, + "loss": 0.1911, + "step": 11487 + }, + { + "epoch": 0.910120816003169, + "grad_norm": 1.2063203457894478, + "learning_rate": 4.203911258420712e-07, + "loss": 0.2237, + "step": 11488 + }, + { + "epoch": 0.9102000396118043, + "grad_norm": 1.3012628403675948, + "learning_rate": 4.196552497352302e-07, + "loss": 0.1464, + "step": 11489 + }, + { + "epoch": 0.9102792632204397, + "grad_norm": 1.1912771895504253, + "learning_rate": 4.189200044479924e-07, + "loss": 0.2109, + "step": 11490 + }, + { + "epoch": 0.910358486829075, + "grad_norm": 1.5954214973115788, + "learning_rate": 4.1818539002877024e-07, + "loss": 0.299, + "step": 11491 + }, + { + "epoch": 0.9104377104377105, + "grad_norm": 1.271666131692804, + "learning_rate": 4.174514065259383e-07, + "loss": 0.1936, + "step": 11492 + }, + { + "epoch": 0.9105169340463458, + "grad_norm": 1.2435706009038738, + "learning_rate": 4.167180539878213e-07, + "loss": 0.1538, + "step": 11493 + }, + { + "epoch": 0.9105961576549811, + "grad_norm": 1.5786516255402805, + "learning_rate": 4.1598533246270833e-07, + "loss": 0.2379, + "step": 11494 + }, + { + "epoch": 0.9106753812636166, + "grad_norm": 1.6056421538436658, + "learning_rate": 4.152532419988453e-07, + "loss": 0.3461, + "step": 11495 + }, + { + "epoch": 0.9107546048722519, + "grad_norm": 1.3375083548694011, + "learning_rate": 4.145217826444392e-07, + "loss": 0.2484, + "step": 11496 + }, + { + "epoch": 0.9108338284808873, + "grad_norm": 1.42747690083719, + "learning_rate": 4.1379095444764926e-07, + "loss": 0.2001, + "step": 11497 + }, + { + "epoch": 0.9109130520895227, + "grad_norm": 1.29900205376338, + "learning_rate": 4.130607574566003e-07, + "loss": 0.2205, + "step": 11498 + }, + { + "epoch": 0.9109922756981581, + "grad_norm": 1.337424357474979, + "learning_rate": 4.1233119171937065e-07, + "loss": 0.2248, + "step": 11499 + }, + { + "epoch": 0.9110714993067934, + "grad_norm": 1.2766738544700231, + "learning_rate": 4.116022572839984e-07, + "loss": 0.1813, + "step": 11500 + }, + { + "epoch": 0.9111507229154288, + "grad_norm": 1.438992602713393, + "learning_rate": 4.1087395419848186e-07, + "loss": 0.2116, + "step": 11501 + }, + { + "epoch": 0.9112299465240642, + "grad_norm": 1.1742965260043927, + "learning_rate": 4.10146282510776e-07, + "loss": 0.19, + "step": 11502 + }, + { + "epoch": 0.9113091701326995, + "grad_norm": 1.1030349158739612, + "learning_rate": 4.094192422687926e-07, + "loss": 0.1208, + "step": 11503 + }, + { + "epoch": 0.911388393741335, + "grad_norm": 1.326589982163388, + "learning_rate": 4.0869283352040656e-07, + "loss": 0.1956, + "step": 11504 + }, + { + "epoch": 0.9114676173499703, + "grad_norm": 1.2188409136395084, + "learning_rate": 4.079670563134475e-07, + "loss": 0.2337, + "step": 11505 + }, + { + "epoch": 0.9115468409586057, + "grad_norm": 1.3075058874056278, + "learning_rate": 4.072419106957026e-07, + "loss": 0.1806, + "step": 11506 + }, + { + "epoch": 0.911626064567241, + "grad_norm": 1.3822003492298027, + "learning_rate": 4.065173967149205e-07, + "loss": 0.2074, + "step": 11507 + }, + { + "epoch": 0.9117052881758764, + "grad_norm": 0.9843325340828633, + "learning_rate": 4.057935144188074e-07, + "loss": 0.1264, + "step": 11508 + }, + { + "epoch": 0.9117845117845118, + "grad_norm": 1.3264894111292864, + "learning_rate": 4.0507026385502747e-07, + "loss": 0.184, + "step": 11509 + }, + { + "epoch": 0.9118637353931471, + "grad_norm": 1.0693847780856147, + "learning_rate": 4.043476450712014e-07, + "loss": 0.1918, + "step": 11510 + }, + { + "epoch": 0.9119429590017826, + "grad_norm": 1.4229257269074096, + "learning_rate": 4.036256581149123e-07, + "loss": 0.2436, + "step": 11511 + }, + { + "epoch": 0.9120221826104179, + "grad_norm": 1.488066558518975, + "learning_rate": 4.0290430303369876e-07, + "loss": 0.2477, + "step": 11512 + }, + { + "epoch": 0.9121014062190532, + "grad_norm": 1.5473374802435573, + "learning_rate": 4.021835798750584e-07, + "loss": 0.2782, + "step": 11513 + }, + { + "epoch": 0.9121806298276887, + "grad_norm": 1.3482564116305311, + "learning_rate": 4.0146348868644767e-07, + "loss": 0.2488, + "step": 11514 + }, + { + "epoch": 0.912259853436324, + "grad_norm": 1.235627441540292, + "learning_rate": 4.0074402951528204e-07, + "loss": 0.1719, + "step": 11515 + }, + { + "epoch": 0.9123390770449594, + "grad_norm": 1.3888607368526977, + "learning_rate": 4.000252024089313e-07, + "loss": 0.1853, + "step": 11516 + }, + { + "epoch": 0.9124183006535947, + "grad_norm": 1.4064493236762259, + "learning_rate": 3.9930700741473093e-07, + "loss": 0.3289, + "step": 11517 + }, + { + "epoch": 0.9124975242622302, + "grad_norm": 1.3067992716682535, + "learning_rate": 3.985894445799676e-07, + "loss": 0.2175, + "step": 11518 + }, + { + "epoch": 0.9125767478708655, + "grad_norm": 1.265268119282097, + "learning_rate": 3.978725139518891e-07, + "loss": 0.1967, + "step": 11519 + }, + { + "epoch": 0.9126559714795008, + "grad_norm": 1.5062649490315358, + "learning_rate": 3.9715621557770535e-07, + "loss": 0.2556, + "step": 11520 + }, + { + "epoch": 0.9127351950881363, + "grad_norm": 1.4729512504434277, + "learning_rate": 3.9644054950457753e-07, + "loss": 0.2359, + "step": 11521 + }, + { + "epoch": 0.9128144186967716, + "grad_norm": 1.5356793515122895, + "learning_rate": 3.9572551577963135e-07, + "loss": 0.2523, + "step": 11522 + }, + { + "epoch": 0.912893642305407, + "grad_norm": 1.2987969898787666, + "learning_rate": 3.9501111444994576e-07, + "loss": 0.2169, + "step": 11523 + }, + { + "epoch": 0.9129728659140424, + "grad_norm": 1.658436110892631, + "learning_rate": 3.9429734556256205e-07, + "loss": 0.3422, + "step": 11524 + }, + { + "epoch": 0.9130520895226778, + "grad_norm": 1.5970255325394085, + "learning_rate": 3.9358420916447927e-07, + "loss": 0.2978, + "step": 11525 + }, + { + "epoch": 0.9131313131313131, + "grad_norm": 1.0917172516092473, + "learning_rate": 3.9287170530265206e-07, + "loss": 0.1808, + "step": 11526 + }, + { + "epoch": 0.9132105367399485, + "grad_norm": 1.0599886427716367, + "learning_rate": 3.9215983402399736e-07, + "loss": 0.1689, + "step": 11527 + }, + { + "epoch": 0.9132897603485839, + "grad_norm": 0.9021125677081997, + "learning_rate": 3.914485953753888e-07, + "loss": 0.1323, + "step": 11528 + }, + { + "epoch": 0.9133689839572192, + "grad_norm": 1.2952944025341806, + "learning_rate": 3.907379894036545e-07, + "loss": 0.1833, + "step": 11529 + }, + { + "epoch": 0.9134482075658547, + "grad_norm": 1.831734041119413, + "learning_rate": 3.9002801615558805e-07, + "loss": 0.3419, + "step": 11530 + }, + { + "epoch": 0.91352743117449, + "grad_norm": 1.5053007261768667, + "learning_rate": 3.893186756779366e-07, + "loss": 0.2432, + "step": 11531 + }, + { + "epoch": 0.9136066547831254, + "grad_norm": 1.2037043793227422, + "learning_rate": 3.886099680174049e-07, + "loss": 0.2157, + "step": 11532 + }, + { + "epoch": 0.9136858783917607, + "grad_norm": 1.1053531206404543, + "learning_rate": 3.879018932206624e-07, + "loss": 0.2351, + "step": 11533 + }, + { + "epoch": 0.9137651020003961, + "grad_norm": 1.5164012115154661, + "learning_rate": 3.871944513343284e-07, + "loss": 0.2526, + "step": 11534 + }, + { + "epoch": 0.9138443256090315, + "grad_norm": 1.2831309728208964, + "learning_rate": 3.864876424049857e-07, + "loss": 0.2123, + "step": 11535 + }, + { + "epoch": 0.9139235492176668, + "grad_norm": 1.0502753441279835, + "learning_rate": 3.857814664791748e-07, + "loss": 0.1217, + "step": 11536 + }, + { + "epoch": 0.9140027728263023, + "grad_norm": 1.3249857518200205, + "learning_rate": 3.8507592360339407e-07, + "loss": 0.2294, + "step": 11537 + }, + { + "epoch": 0.9140819964349376, + "grad_norm": 1.3539375859531648, + "learning_rate": 3.843710138240997e-07, + "loss": 0.196, + "step": 11538 + }, + { + "epoch": 0.914161220043573, + "grad_norm": 1.2569589096592837, + "learning_rate": 3.8366673718770564e-07, + "loss": 0.1603, + "step": 11539 + }, + { + "epoch": 0.9142404436522084, + "grad_norm": 1.2484832368530978, + "learning_rate": 3.8296309374058704e-07, + "loss": 0.2062, + "step": 11540 + }, + { + "epoch": 0.9143196672608437, + "grad_norm": 1.3439230322929494, + "learning_rate": 3.8226008352907464e-07, + "loss": 0.1846, + "step": 11541 + }, + { + "epoch": 0.9143988908694791, + "grad_norm": 1.1118837846291858, + "learning_rate": 3.815577065994569e-07, + "loss": 0.1274, + "step": 11542 + }, + { + "epoch": 0.9144781144781144, + "grad_norm": 1.8464745876317965, + "learning_rate": 3.8085596299798465e-07, + "loss": 0.3058, + "step": 11543 + }, + { + "epoch": 0.9145573380867499, + "grad_norm": 1.4617184678386403, + "learning_rate": 3.801548527708621e-07, + "loss": 0.2403, + "step": 11544 + }, + { + "epoch": 0.9146365616953852, + "grad_norm": 1.4169994515179263, + "learning_rate": 3.794543759642544e-07, + "loss": 0.2094, + "step": 11545 + }, + { + "epoch": 0.9147157853040206, + "grad_norm": 1.1984561231244761, + "learning_rate": 3.7875453262428584e-07, + "loss": 0.2073, + "step": 11546 + }, + { + "epoch": 0.914795008912656, + "grad_norm": 1.251204102318907, + "learning_rate": 3.7805532279703625e-07, + "loss": 0.164, + "step": 11547 + }, + { + "epoch": 0.9148742325212913, + "grad_norm": 1.2280814821146404, + "learning_rate": 3.773567465285455e-07, + "loss": 0.2151, + "step": 11548 + }, + { + "epoch": 0.9149534561299267, + "grad_norm": 1.6956879370774005, + "learning_rate": 3.7665880386481226e-07, + "loss": 0.286, + "step": 11549 + }, + { + "epoch": 0.9150326797385621, + "grad_norm": 1.2065091343224346, + "learning_rate": 3.759614948517931e-07, + "loss": 0.1587, + "step": 11550 + }, + { + "epoch": 0.9151119033471975, + "grad_norm": 1.4489850524812282, + "learning_rate": 3.7526481953539915e-07, + "loss": 0.2455, + "step": 11551 + }, + { + "epoch": 0.9151911269558328, + "grad_norm": 1.6145481975224287, + "learning_rate": 3.74568777961507e-07, + "loss": 0.2454, + "step": 11552 + }, + { + "epoch": 0.9152703505644683, + "grad_norm": 1.2488806129259216, + "learning_rate": 3.7387337017594674e-07, + "loss": 0.2035, + "step": 11553 + }, + { + "epoch": 0.9153495741731036, + "grad_norm": 1.2325590316906303, + "learning_rate": 3.7317859622450714e-07, + "loss": 0.183, + "step": 11554 + }, + { + "epoch": 0.9154287977817389, + "grad_norm": 1.4137943114641929, + "learning_rate": 3.7248445615293506e-07, + "loss": 0.2378, + "step": 11555 + }, + { + "epoch": 0.9155080213903743, + "grad_norm": 1.5916797291463736, + "learning_rate": 3.7179095000693723e-07, + "loss": 0.2517, + "step": 11556 + }, + { + "epoch": 0.9155872449990097, + "grad_norm": 1.2816295579161405, + "learning_rate": 3.710980778321771e-07, + "loss": 0.2073, + "step": 11557 + }, + { + "epoch": 0.9156664686076451, + "grad_norm": 1.2927875508823483, + "learning_rate": 3.70405839674276e-07, + "loss": 0.1847, + "step": 11558 + }, + { + "epoch": 0.9157456922162804, + "grad_norm": 1.5335384834886192, + "learning_rate": 3.697142355788175e-07, + "loss": 0.2399, + "step": 11559 + }, + { + "epoch": 0.9158249158249159, + "grad_norm": 1.5337601451572123, + "learning_rate": 3.6902326559133836e-07, + "loss": 0.2786, + "step": 11560 + }, + { + "epoch": 0.9159041394335512, + "grad_norm": 1.3051444378305266, + "learning_rate": 3.683329297573346e-07, + "loss": 0.2275, + "step": 11561 + }, + { + "epoch": 0.9159833630421865, + "grad_norm": 1.349157211582938, + "learning_rate": 3.6764322812226416e-07, + "loss": 0.2044, + "step": 11562 + }, + { + "epoch": 0.916062586650822, + "grad_norm": 1.3388097820910023, + "learning_rate": 3.669541607315397e-07, + "loss": 0.1757, + "step": 11563 + }, + { + "epoch": 0.9161418102594573, + "grad_norm": 1.353712757381436, + "learning_rate": 3.6626572763053034e-07, + "loss": 0.2119, + "step": 11564 + }, + { + "epoch": 0.9162210338680927, + "grad_norm": 1.6305765603502218, + "learning_rate": 3.6557792886457e-07, + "loss": 0.1964, + "step": 11565 + }, + { + "epoch": 0.916300257476728, + "grad_norm": 1.3653558641364365, + "learning_rate": 3.6489076447894456e-07, + "loss": 0.2429, + "step": 11566 + }, + { + "epoch": 0.9163794810853635, + "grad_norm": 1.2884789791330342, + "learning_rate": 3.642042345189023e-07, + "loss": 0.2238, + "step": 11567 + }, + { + "epoch": 0.9164587046939988, + "grad_norm": 1.4047165662094148, + "learning_rate": 3.6351833902964485e-07, + "loss": 0.2012, + "step": 11568 + }, + { + "epoch": 0.9165379283026341, + "grad_norm": 1.3995022979289622, + "learning_rate": 3.6283307805633714e-07, + "loss": 0.2704, + "step": 11569 + }, + { + "epoch": 0.9166171519112696, + "grad_norm": 1.2249043838575662, + "learning_rate": 3.6214845164410205e-07, + "loss": 0.1589, + "step": 11570 + }, + { + "epoch": 0.9166963755199049, + "grad_norm": 1.4284337286404245, + "learning_rate": 3.614644598380157e-07, + "loss": 0.3002, + "step": 11571 + }, + { + "epoch": 0.9167755991285403, + "grad_norm": 1.571096364990784, + "learning_rate": 3.607811026831176e-07, + "loss": 0.3304, + "step": 11572 + }, + { + "epoch": 0.9168548227371757, + "grad_norm": 1.3252266398167374, + "learning_rate": 3.600983802244007e-07, + "loss": 0.2241, + "step": 11573 + }, + { + "epoch": 0.9169340463458111, + "grad_norm": 1.2526626682659778, + "learning_rate": 3.594162925068234e-07, + "loss": 0.1782, + "step": 11574 + }, + { + "epoch": 0.9170132699544464, + "grad_norm": 1.4905217388324608, + "learning_rate": 3.587348395752954e-07, + "loss": 0.2308, + "step": 11575 + }, + { + "epoch": 0.9170924935630818, + "grad_norm": 1.5160617197509325, + "learning_rate": 3.5805402147468746e-07, + "loss": 0.3075, + "step": 11576 + }, + { + "epoch": 0.9171717171717172, + "grad_norm": 1.4184027858690453, + "learning_rate": 3.573738382498271e-07, + "loss": 0.2455, + "step": 11577 + }, + { + "epoch": 0.9172509407803525, + "grad_norm": 1.426169947430461, + "learning_rate": 3.566942899455039e-07, + "loss": 0.2211, + "step": 11578 + }, + { + "epoch": 0.917330164388988, + "grad_norm": 1.1709391058722793, + "learning_rate": 3.5601537660646e-07, + "loss": 0.1648, + "step": 11579 + }, + { + "epoch": 0.9174093879976233, + "grad_norm": 1.3088221228553496, + "learning_rate": 3.553370982773985e-07, + "loss": 0.1954, + "step": 11580 + }, + { + "epoch": 0.9174886116062587, + "grad_norm": 1.318887886039781, + "learning_rate": 3.546594550029836e-07, + "loss": 0.2274, + "step": 11581 + }, + { + "epoch": 0.917567835214894, + "grad_norm": 1.1032652733466042, + "learning_rate": 3.53982446827833e-07, + "loss": 0.1927, + "step": 11582 + }, + { + "epoch": 0.9176470588235294, + "grad_norm": 1.0856985427116774, + "learning_rate": 3.533060737965244e-07, + "loss": 0.1727, + "step": 11583 + }, + { + "epoch": 0.9177262824321648, + "grad_norm": 1.3215648081236284, + "learning_rate": 3.526303359535932e-07, + "loss": 0.1879, + "step": 11584 + }, + { + "epoch": 0.9178055060408001, + "grad_norm": 1.274311315833374, + "learning_rate": 3.519552333435361e-07, + "loss": 0.2261, + "step": 11585 + }, + { + "epoch": 0.9178847296494356, + "grad_norm": 1.341233000779832, + "learning_rate": 3.5128076601080087e-07, + "loss": 0.1673, + "step": 11586 + }, + { + "epoch": 0.9179639532580709, + "grad_norm": 1.532310091893599, + "learning_rate": 3.5060693399980194e-07, + "loss": 0.277, + "step": 11587 + }, + { + "epoch": 0.9180431768667062, + "grad_norm": 1.3132039142085052, + "learning_rate": 3.499337373549072e-07, + "loss": 0.1724, + "step": 11588 + }, + { + "epoch": 0.9181224004753417, + "grad_norm": 1.4648295216107914, + "learning_rate": 3.4926117612044117e-07, + "loss": 0.2561, + "step": 11589 + }, + { + "epoch": 0.918201624083977, + "grad_norm": 1.3670028036780484, + "learning_rate": 3.485892503406907e-07, + "loss": 0.2308, + "step": 11590 + }, + { + "epoch": 0.9182808476926124, + "grad_norm": 1.413317174962045, + "learning_rate": 3.4791796005989917e-07, + "loss": 0.2048, + "step": 11591 + }, + { + "epoch": 0.9183600713012477, + "grad_norm": 1.3156485684355186, + "learning_rate": 3.4724730532226693e-07, + "loss": 0.2426, + "step": 11592 + }, + { + "epoch": 0.9184392949098832, + "grad_norm": 1.2839334148701198, + "learning_rate": 3.4657728617195295e-07, + "loss": 0.2208, + "step": 11593 + }, + { + "epoch": 0.9185185185185185, + "grad_norm": 1.0337204612195527, + "learning_rate": 3.459079026530754e-07, + "loss": 0.1628, + "step": 11594 + }, + { + "epoch": 0.9185977421271538, + "grad_norm": 1.0214973701723287, + "learning_rate": 3.4523915480971113e-07, + "loss": 0.1495, + "step": 11595 + }, + { + "epoch": 0.9186769657357893, + "grad_norm": 1.3430927803644317, + "learning_rate": 3.445710426858906e-07, + "loss": 0.2427, + "step": 11596 + }, + { + "epoch": 0.9187561893444246, + "grad_norm": 0.9737141241836886, + "learning_rate": 3.439035663256096e-07, + "loss": 0.1646, + "step": 11597 + }, + { + "epoch": 0.91883541295306, + "grad_norm": 1.2833807636694672, + "learning_rate": 3.4323672577281754e-07, + "loss": 0.2414, + "step": 11598 + }, + { + "epoch": 0.9189146365616954, + "grad_norm": 1.3823219654489336, + "learning_rate": 3.425705210714192e-07, + "loss": 0.1954, + "step": 11599 + }, + { + "epoch": 0.9189938601703308, + "grad_norm": 1.6658482083365656, + "learning_rate": 3.419049522652851e-07, + "loss": 0.2866, + "step": 11600 + }, + { + "epoch": 0.9190730837789661, + "grad_norm": 1.2244437966818023, + "learning_rate": 3.412400193982379e-07, + "loss": 0.1693, + "step": 11601 + }, + { + "epoch": 0.9191523073876015, + "grad_norm": 1.3093001524476615, + "learning_rate": 3.4057572251405936e-07, + "loss": 0.2053, + "step": 11602 + }, + { + "epoch": 0.9192315309962369, + "grad_norm": 1.4559454081304735, + "learning_rate": 3.3991206165649213e-07, + "loss": 0.2062, + "step": 11603 + }, + { + "epoch": 0.9193107546048722, + "grad_norm": 1.3119344660267651, + "learning_rate": 3.392490368692347e-07, + "loss": 0.1806, + "step": 11604 + }, + { + "epoch": 0.9193899782135077, + "grad_norm": 1.53034192515243, + "learning_rate": 3.385866481959432e-07, + "loss": 0.256, + "step": 11605 + }, + { + "epoch": 0.919469201822143, + "grad_norm": 1.128099510379244, + "learning_rate": 3.379248956802328e-07, + "loss": 0.1875, + "step": 11606 + }, + { + "epoch": 0.9195484254307784, + "grad_norm": 1.260348781554864, + "learning_rate": 3.3726377936567856e-07, + "loss": 0.1794, + "step": 11607 + }, + { + "epoch": 0.9196276490394137, + "grad_norm": 1.903357490299525, + "learning_rate": 3.3660329929580904e-07, + "loss": 0.2116, + "step": 11608 + }, + { + "epoch": 0.9197068726480491, + "grad_norm": 1.1099084181269925, + "learning_rate": 3.3594345551411503e-07, + "loss": 0.194, + "step": 11609 + }, + { + "epoch": 0.9197860962566845, + "grad_norm": 1.2170520120294799, + "learning_rate": 3.352842480640439e-07, + "loss": 0.1524, + "step": 11610 + }, + { + "epoch": 0.9198653198653198, + "grad_norm": 1.1582542632832862, + "learning_rate": 3.346256769890022e-07, + "loss": 0.1702, + "step": 11611 + }, + { + "epoch": 0.9199445434739553, + "grad_norm": 1.0699894486719395, + "learning_rate": 3.3396774233235173e-07, + "loss": 0.1468, + "step": 11612 + }, + { + "epoch": 0.9200237670825906, + "grad_norm": 1.35999773543236, + "learning_rate": 3.333104441374158e-07, + "loss": 0.2873, + "step": 11613 + }, + { + "epoch": 0.920102990691226, + "grad_norm": 1.6271186321335922, + "learning_rate": 3.32653782447474e-07, + "loss": 0.2807, + "step": 11614 + }, + { + "epoch": 0.9201822142998614, + "grad_norm": 1.0585003754224247, + "learning_rate": 3.319977573057642e-07, + "loss": 0.1852, + "step": 11615 + }, + { + "epoch": 0.9202614379084967, + "grad_norm": 1.4875737306188574, + "learning_rate": 3.313423687554829e-07, + "loss": 0.2728, + "step": 11616 + }, + { + "epoch": 0.9203406615171321, + "grad_norm": 1.2213897286790911, + "learning_rate": 3.3068761683978434e-07, + "loss": 0.154, + "step": 11617 + }, + { + "epoch": 0.9204198851257674, + "grad_norm": 1.2515154194584592, + "learning_rate": 3.3003350160177974e-07, + "loss": 0.2217, + "step": 11618 + }, + { + "epoch": 0.9204991087344029, + "grad_norm": 1.1100112385452725, + "learning_rate": 3.293800230845412e-07, + "loss": 0.1606, + "step": 11619 + }, + { + "epoch": 0.9205783323430382, + "grad_norm": 1.1502153782740052, + "learning_rate": 3.287271813310955e-07, + "loss": 0.19, + "step": 11620 + }, + { + "epoch": 0.9206575559516736, + "grad_norm": 1.129163978637922, + "learning_rate": 3.280749763844293e-07, + "loss": 0.1483, + "step": 11621 + }, + { + "epoch": 0.920736779560309, + "grad_norm": 1.2468665673009325, + "learning_rate": 3.274234082874872e-07, + "loss": 0.2402, + "step": 11622 + }, + { + "epoch": 0.9208160031689443, + "grad_norm": 1.4164191628158815, + "learning_rate": 3.267724770831737e-07, + "loss": 0.1996, + "step": 11623 + }, + { + "epoch": 0.9208952267775797, + "grad_norm": 1.1816855471896477, + "learning_rate": 3.2612218281434794e-07, + "loss": 0.2219, + "step": 11624 + }, + { + "epoch": 0.9209744503862151, + "grad_norm": 1.52934294030688, + "learning_rate": 3.254725255238267e-07, + "loss": 0.2989, + "step": 11625 + }, + { + "epoch": 0.9210536739948505, + "grad_norm": 1.3934782321165347, + "learning_rate": 3.2482350525439023e-07, + "loss": 0.2248, + "step": 11626 + }, + { + "epoch": 0.9211328976034858, + "grad_norm": 1.3166476886642682, + "learning_rate": 3.241751220487721e-07, + "loss": 0.2082, + "step": 11627 + }, + { + "epoch": 0.9212121212121213, + "grad_norm": 1.2818403386536417, + "learning_rate": 3.235273759496638e-07, + "loss": 0.1888, + "step": 11628 + }, + { + "epoch": 0.9212913448207566, + "grad_norm": 0.903189518569157, + "learning_rate": 3.2288026699971884e-07, + "loss": 0.1058, + "step": 11629 + }, + { + "epoch": 0.9213705684293919, + "grad_norm": 1.2610339211557926, + "learning_rate": 3.222337952415455e-07, + "loss": 0.1703, + "step": 11630 + }, + { + "epoch": 0.9214497920380273, + "grad_norm": 1.3696459012125872, + "learning_rate": 3.215879607177086e-07, + "loss": 0.1951, + "step": 11631 + }, + { + "epoch": 0.9215290156466627, + "grad_norm": 1.593612213382193, + "learning_rate": 3.2094276347073626e-07, + "loss": 0.2844, + "step": 11632 + }, + { + "epoch": 0.9216082392552981, + "grad_norm": 1.1646523413033234, + "learning_rate": 3.2029820354311014e-07, + "loss": 0.1878, + "step": 11633 + }, + { + "epoch": 0.9216874628639334, + "grad_norm": 1.3376368005026216, + "learning_rate": 3.196542809772707e-07, + "loss": 0.212, + "step": 11634 + }, + { + "epoch": 0.9217666864725689, + "grad_norm": 1.2901264582424004, + "learning_rate": 3.1901099581561846e-07, + "loss": 0.2094, + "step": 11635 + }, + { + "epoch": 0.9218459100812042, + "grad_norm": 1.0463639860805685, + "learning_rate": 3.183683481005106e-07, + "loss": 0.1692, + "step": 11636 + }, + { + "epoch": 0.9219251336898395, + "grad_norm": 1.5067208914503585, + "learning_rate": 3.1772633787426233e-07, + "loss": 0.1934, + "step": 11637 + }, + { + "epoch": 0.922004357298475, + "grad_norm": 1.578674209466328, + "learning_rate": 3.1708496517914523e-07, + "loss": 0.294, + "step": 11638 + }, + { + "epoch": 0.9220835809071103, + "grad_norm": 1.3687116099502004, + "learning_rate": 3.1644423005739335e-07, + "loss": 0.2224, + "step": 11639 + }, + { + "epoch": 0.9221628045157457, + "grad_norm": 1.2486326512391224, + "learning_rate": 3.15804132551194e-07, + "loss": 0.2072, + "step": 11640 + }, + { + "epoch": 0.922242028124381, + "grad_norm": 1.3632927631783953, + "learning_rate": 3.151646727026947e-07, + "loss": 0.2052, + "step": 11641 + }, + { + "epoch": 0.9223212517330165, + "grad_norm": 1.3330007786657825, + "learning_rate": 3.1452585055400167e-07, + "loss": 0.2881, + "step": 11642 + }, + { + "epoch": 0.9224004753416518, + "grad_norm": 1.3946377233132328, + "learning_rate": 3.138876661471779e-07, + "loss": 0.2007, + "step": 11643 + }, + { + "epoch": 0.9224796989502871, + "grad_norm": 1.1238048063461563, + "learning_rate": 3.1325011952424435e-07, + "loss": 0.1847, + "step": 11644 + }, + { + "epoch": 0.9225589225589226, + "grad_norm": 1.4675888098580654, + "learning_rate": 3.1261321072718063e-07, + "loss": 0.2639, + "step": 11645 + }, + { + "epoch": 0.9226381461675579, + "grad_norm": 1.3228553078447403, + "learning_rate": 3.1197693979792556e-07, + "loss": 0.2151, + "step": 11646 + }, + { + "epoch": 0.9227173697761933, + "grad_norm": 1.3301417266819258, + "learning_rate": 3.1134130677837103e-07, + "loss": 0.2039, + "step": 11647 + }, + { + "epoch": 0.9227965933848287, + "grad_norm": 1.281197228734371, + "learning_rate": 3.107063117103759e-07, + "loss": 0.1446, + "step": 11648 + }, + { + "epoch": 0.9228758169934641, + "grad_norm": 1.28368766585353, + "learning_rate": 3.100719546357467e-07, + "loss": 0.1773, + "step": 11649 + }, + { + "epoch": 0.9229550406020994, + "grad_norm": 1.3951118455185139, + "learning_rate": 3.0943823559625217e-07, + "loss": 0.266, + "step": 11650 + }, + { + "epoch": 0.9230342642107348, + "grad_norm": 1.5610755211539782, + "learning_rate": 3.088051546336246e-07, + "loss": 0.3311, + "step": 11651 + }, + { + "epoch": 0.9231134878193702, + "grad_norm": 1.554292572928912, + "learning_rate": 3.08172711789545e-07, + "loss": 0.2494, + "step": 11652 + }, + { + "epoch": 0.9231927114280055, + "grad_norm": 1.4840477482654226, + "learning_rate": 3.0754090710565785e-07, + "loss": 0.3146, + "step": 11653 + }, + { + "epoch": 0.923271935036641, + "grad_norm": 1.5159598526576488, + "learning_rate": 3.069097406235666e-07, + "loss": 0.2676, + "step": 11654 + }, + { + "epoch": 0.9233511586452763, + "grad_norm": 1.0977865496127512, + "learning_rate": 3.0627921238482794e-07, + "loss": 0.2278, + "step": 11655 + }, + { + "epoch": 0.9234303822539117, + "grad_norm": 1.4228183867391526, + "learning_rate": 3.056493224309587e-07, + "loss": 0.2674, + "step": 11656 + }, + { + "epoch": 0.923509605862547, + "grad_norm": 1.5689402868935982, + "learning_rate": 3.0502007080343675e-07, + "loss": 0.2302, + "step": 11657 + }, + { + "epoch": 0.9235888294711824, + "grad_norm": 1.4263459428708165, + "learning_rate": 3.043914575436946e-07, + "loss": 0.1644, + "step": 11658 + }, + { + "epoch": 0.9236680530798178, + "grad_norm": 1.2053126750803078, + "learning_rate": 3.0376348269312017e-07, + "loss": 0.175, + "step": 11659 + }, + { + "epoch": 0.9237472766884531, + "grad_norm": 1.1665812498854211, + "learning_rate": 3.031361462930671e-07, + "loss": 0.1886, + "step": 11660 + }, + { + "epoch": 0.9238265002970886, + "grad_norm": 1.7057341045544592, + "learning_rate": 3.025094483848401e-07, + "loss": 0.2302, + "step": 11661 + }, + { + "epoch": 0.9239057239057239, + "grad_norm": 1.4555151962626849, + "learning_rate": 3.0188338900970505e-07, + "loss": 0.3122, + "step": 11662 + }, + { + "epoch": 0.9239849475143593, + "grad_norm": 1.2178807449347981, + "learning_rate": 3.0125796820888343e-07, + "loss": 0.1907, + "step": 11663 + }, + { + "epoch": 0.9240641711229947, + "grad_norm": 1.229881971219856, + "learning_rate": 3.0063318602355787e-07, + "loss": 0.1871, + "step": 11664 + }, + { + "epoch": 0.92414339473163, + "grad_norm": 1.498836699257237, + "learning_rate": 3.000090424948665e-07, + "loss": 0.2405, + "step": 11665 + }, + { + "epoch": 0.9242226183402654, + "grad_norm": 1.8367582583925424, + "learning_rate": 2.993855376639054e-07, + "loss": 0.3306, + "step": 11666 + }, + { + "epoch": 0.9243018419489007, + "grad_norm": 1.6411745186639422, + "learning_rate": 2.987626715717318e-07, + "loss": 0.3118, + "step": 11667 + }, + { + "epoch": 0.9243810655575362, + "grad_norm": 1.2635489572839607, + "learning_rate": 2.9814044425935605e-07, + "loss": 0.2014, + "step": 11668 + }, + { + "epoch": 0.9244602891661715, + "grad_norm": 1.124021977600041, + "learning_rate": 2.9751885576774887e-07, + "loss": 0.2154, + "step": 11669 + }, + { + "epoch": 0.9245395127748068, + "grad_norm": 1.2599962752123208, + "learning_rate": 2.9689790613784073e-07, + "loss": 0.195, + "step": 11670 + }, + { + "epoch": 0.9246187363834423, + "grad_norm": 1.2991942394353995, + "learning_rate": 2.962775954105179e-07, + "loss": 0.1925, + "step": 11671 + }, + { + "epoch": 0.9246979599920776, + "grad_norm": 1.0306786580531144, + "learning_rate": 2.9565792362662213e-07, + "loss": 0.1287, + "step": 11672 + }, + { + "epoch": 0.924777183600713, + "grad_norm": 1.4610051297952606, + "learning_rate": 2.9503889082695967e-07, + "loss": 0.2558, + "step": 11673 + }, + { + "epoch": 0.9248564072093484, + "grad_norm": 1.3780365414436075, + "learning_rate": 2.9442049705228794e-07, + "loss": 0.1729, + "step": 11674 + }, + { + "epoch": 0.9249356308179838, + "grad_norm": 1.2355285899968955, + "learning_rate": 2.938027423433254e-07, + "loss": 0.1671, + "step": 11675 + }, + { + "epoch": 0.9250148544266191, + "grad_norm": 1.336103346660533, + "learning_rate": 2.931856267407507e-07, + "loss": 0.1895, + "step": 11676 + }, + { + "epoch": 0.9250940780352545, + "grad_norm": 1.4343131387109438, + "learning_rate": 2.9256915028519575e-07, + "loss": 0.216, + "step": 11677 + }, + { + "epoch": 0.9251733016438899, + "grad_norm": 1.5446231381017235, + "learning_rate": 2.919533130172536e-07, + "loss": 0.2473, + "step": 11678 + }, + { + "epoch": 0.9252525252525252, + "grad_norm": 1.5536035127580397, + "learning_rate": 2.913381149774719e-07, + "loss": 0.1562, + "step": 11679 + }, + { + "epoch": 0.9253317488611607, + "grad_norm": 1.1999161943769499, + "learning_rate": 2.907235562063615e-07, + "loss": 0.177, + "step": 11680 + }, + { + "epoch": 0.925410972469796, + "grad_norm": 1.622409687988493, + "learning_rate": 2.9010963674438674e-07, + "loss": 0.3556, + "step": 11681 + }, + { + "epoch": 0.9254901960784314, + "grad_norm": 1.1075209765826464, + "learning_rate": 2.8949635663197087e-07, + "loss": 0.1625, + "step": 11682 + }, + { + "epoch": 0.9255694196870667, + "grad_norm": 1.1862006430947292, + "learning_rate": 2.8888371590949703e-07, + "loss": 0.1844, + "step": 11683 + }, + { + "epoch": 0.9256486432957021, + "grad_norm": 1.3955708626069496, + "learning_rate": 2.882717146173031e-07, + "loss": 0.2439, + "step": 11684 + }, + { + "epoch": 0.9257278669043375, + "grad_norm": 1.3698786986885638, + "learning_rate": 2.8766035279568563e-07, + "loss": 0.2144, + "step": 11685 + }, + { + "epoch": 0.9258070905129728, + "grad_norm": 1.3196321968617435, + "learning_rate": 2.8704963048490243e-07, + "loss": 0.1931, + "step": 11686 + }, + { + "epoch": 0.9258863141216083, + "grad_norm": 1.1230422783815341, + "learning_rate": 2.864395477251658e-07, + "loss": 0.1855, + "step": 11687 + }, + { + "epoch": 0.9259655377302436, + "grad_norm": 1.3814388039259153, + "learning_rate": 2.858301045566447e-07, + "loss": 0.2268, + "step": 11688 + }, + { + "epoch": 0.926044761338879, + "grad_norm": 1.3684293879363767, + "learning_rate": 2.8522130101947045e-07, + "loss": 0.2, + "step": 11689 + }, + { + "epoch": 0.9261239849475144, + "grad_norm": 1.5205480670126952, + "learning_rate": 2.8461313715372976e-07, + "loss": 0.1734, + "step": 11690 + }, + { + "epoch": 0.9262032085561497, + "grad_norm": 1.679026175862578, + "learning_rate": 2.8400561299946503e-07, + "loss": 0.2363, + "step": 11691 + }, + { + "epoch": 0.9262824321647851, + "grad_norm": 1.4086595284165107, + "learning_rate": 2.8339872859668103e-07, + "loss": 0.2298, + "step": 11692 + }, + { + "epoch": 0.9263616557734204, + "grad_norm": 1.4842261889138826, + "learning_rate": 2.82792483985338e-07, + "loss": 0.2081, + "step": 11693 + }, + { + "epoch": 0.9264408793820559, + "grad_norm": 1.4308514433780373, + "learning_rate": 2.8218687920535395e-07, + "loss": 0.183, + "step": 11694 + }, + { + "epoch": 0.9265201029906912, + "grad_norm": 1.2687038438754916, + "learning_rate": 2.8158191429660364e-07, + "loss": 0.1659, + "step": 11695 + }, + { + "epoch": 0.9265993265993266, + "grad_norm": 1.149224754780243, + "learning_rate": 2.8097758929892196e-07, + "loss": 0.1634, + "step": 11696 + }, + { + "epoch": 0.926678550207962, + "grad_norm": 1.319276523406566, + "learning_rate": 2.803739042521025e-07, + "loss": 0.2519, + "step": 11697 + }, + { + "epoch": 0.9267577738165973, + "grad_norm": 1.4928639946480549, + "learning_rate": 2.7977085919589253e-07, + "loss": 0.2254, + "step": 11698 + }, + { + "epoch": 0.9268369974252327, + "grad_norm": 1.527422502656232, + "learning_rate": 2.791684541700013e-07, + "loss": 0.2377, + "step": 11699 + }, + { + "epoch": 0.9269162210338681, + "grad_norm": 1.3227684548630245, + "learning_rate": 2.785666892140937e-07, + "loss": 0.1947, + "step": 11700 + }, + { + "epoch": 0.9269954446425035, + "grad_norm": 1.0447110915081543, + "learning_rate": 2.7796556436779144e-07, + "loss": 0.1611, + "step": 11701 + }, + { + "epoch": 0.9270746682511388, + "grad_norm": 1.1712675378076414, + "learning_rate": 2.773650796706795e-07, + "loss": 0.1813, + "step": 11702 + }, + { + "epoch": 0.9271538918597743, + "grad_norm": 1.613198350624746, + "learning_rate": 2.7676523516229404e-07, + "loss": 0.2333, + "step": 11703 + }, + { + "epoch": 0.9272331154684096, + "grad_norm": 1.0544246153595418, + "learning_rate": 2.7616603088213126e-07, + "loss": 0.1411, + "step": 11704 + }, + { + "epoch": 0.9273123390770449, + "grad_norm": 1.606025795651259, + "learning_rate": 2.755674668696495e-07, + "loss": 0.213, + "step": 11705 + }, + { + "epoch": 0.9273915626856803, + "grad_norm": 1.3550933607013556, + "learning_rate": 2.749695431642574e-07, + "loss": 0.224, + "step": 11706 + }, + { + "epoch": 0.9274707862943157, + "grad_norm": 1.265046808553349, + "learning_rate": 2.743722598053278e-07, + "loss": 0.2412, + "step": 11707 + }, + { + "epoch": 0.9275500099029511, + "grad_norm": 1.2838966147096786, + "learning_rate": 2.737756168321881e-07, + "loss": 0.2409, + "step": 11708 + }, + { + "epoch": 0.9276292335115864, + "grad_norm": 1.3318208352489131, + "learning_rate": 2.7317961428412475e-07, + "loss": 0.1744, + "step": 11709 + }, + { + "epoch": 0.9277084571202219, + "grad_norm": 1.558522848240427, + "learning_rate": 2.7258425220038077e-07, + "loss": 0.282, + "step": 11710 + }, + { + "epoch": 0.9277876807288572, + "grad_norm": 1.4114619915282693, + "learning_rate": 2.719895306201581e-07, + "loss": 0.215, + "step": 11711 + }, + { + "epoch": 0.9278669043374925, + "grad_norm": 1.7033313063993536, + "learning_rate": 2.7139544958261765e-07, + "loss": 0.2641, + "step": 11712 + }, + { + "epoch": 0.927946127946128, + "grad_norm": 1.345063763021351, + "learning_rate": 2.7080200912687484e-07, + "loss": 0.2015, + "step": 11713 + }, + { + "epoch": 0.9280253515547633, + "grad_norm": 1.3396521024860375, + "learning_rate": 2.702092092920061e-07, + "loss": 0.2231, + "step": 11714 + }, + { + "epoch": 0.9281045751633987, + "grad_norm": 1.7315056954293184, + "learning_rate": 2.6961705011704475e-07, + "loss": 0.3107, + "step": 11715 + }, + { + "epoch": 0.928183798772034, + "grad_norm": 1.4026607903232158, + "learning_rate": 2.6902553164098065e-07, + "loss": 0.2113, + "step": 11716 + }, + { + "epoch": 0.9282630223806695, + "grad_norm": 1.609693795305664, + "learning_rate": 2.684346539027616e-07, + "loss": 0.2437, + "step": 11717 + }, + { + "epoch": 0.9283422459893048, + "grad_norm": 1.2518693612663423, + "learning_rate": 2.6784441694129747e-07, + "loss": 0.2092, + "step": 11718 + }, + { + "epoch": 0.9284214695979401, + "grad_norm": 1.36093872256006, + "learning_rate": 2.672548207954495e-07, + "loss": 0.1746, + "step": 11719 + }, + { + "epoch": 0.9285006932065756, + "grad_norm": 1.6777580367695801, + "learning_rate": 2.6666586550403884e-07, + "loss": 0.3009, + "step": 11720 + }, + { + "epoch": 0.9285799168152109, + "grad_norm": 1.3154426416127023, + "learning_rate": 2.6607755110584886e-07, + "loss": 0.2764, + "step": 11721 + }, + { + "epoch": 0.9286591404238463, + "grad_norm": 1.4638309939909384, + "learning_rate": 2.654898776396164e-07, + "loss": 0.2446, + "step": 11722 + }, + { + "epoch": 0.9287383640324817, + "grad_norm": 1.131494028007227, + "learning_rate": 2.64902845144035e-07, + "loss": 0.1637, + "step": 11723 + }, + { + "epoch": 0.9288175876411171, + "grad_norm": 1.2881899757114494, + "learning_rate": 2.6431645365775806e-07, + "loss": 0.2406, + "step": 11724 + }, + { + "epoch": 0.9288968112497524, + "grad_norm": 1.386066966201406, + "learning_rate": 2.637307032193992e-07, + "loss": 0.2477, + "step": 11725 + }, + { + "epoch": 0.9289760348583878, + "grad_norm": 1.5188150558481415, + "learning_rate": 2.6314559386752423e-07, + "loss": 0.2473, + "step": 11726 + }, + { + "epoch": 0.9290552584670232, + "grad_norm": 1.4528315736885065, + "learning_rate": 2.6256112564066236e-07, + "loss": 0.2333, + "step": 11727 + }, + { + "epoch": 0.9291344820756585, + "grad_norm": 0.9592547249325707, + "learning_rate": 2.6197729857729617e-07, + "loss": 0.1532, + "step": 11728 + }, + { + "epoch": 0.929213705684294, + "grad_norm": 1.6710092286615459, + "learning_rate": 2.613941127158681e-07, + "loss": 0.2725, + "step": 11729 + }, + { + "epoch": 0.9292929292929293, + "grad_norm": 1.3420527437980327, + "learning_rate": 2.608115680947787e-07, + "loss": 0.1488, + "step": 11730 + }, + { + "epoch": 0.9293721529015647, + "grad_norm": 1.2712597518432591, + "learning_rate": 2.602296647523861e-07, + "loss": 0.2262, + "step": 11731 + }, + { + "epoch": 0.9294513765102, + "grad_norm": 1.3733931666446226, + "learning_rate": 2.596484027270041e-07, + "loss": 0.1986, + "step": 11732 + }, + { + "epoch": 0.9295306001188354, + "grad_norm": 1.4692234321747957, + "learning_rate": 2.5906778205690876e-07, + "loss": 0.2243, + "step": 11733 + }, + { + "epoch": 0.9296098237274708, + "grad_norm": 1.2491528105606493, + "learning_rate": 2.5848780278032836e-07, + "loss": 0.1973, + "step": 11734 + }, + { + "epoch": 0.9296890473361061, + "grad_norm": 1.30864716497283, + "learning_rate": 2.579084649354546e-07, + "loss": 0.1998, + "step": 11735 + }, + { + "epoch": 0.9297682709447416, + "grad_norm": 1.0601137595023613, + "learning_rate": 2.5732976856043034e-07, + "loss": 0.1891, + "step": 11736 + }, + { + "epoch": 0.9298474945533769, + "grad_norm": 1.2006555755373662, + "learning_rate": 2.5675171369336284e-07, + "loss": 0.1529, + "step": 11737 + }, + { + "epoch": 0.9299267181620123, + "grad_norm": 1.5486600087431033, + "learning_rate": 2.5617430037231495e-07, + "loss": 0.3102, + "step": 11738 + }, + { + "epoch": 0.9300059417706477, + "grad_norm": 1.197424746422277, + "learning_rate": 2.5559752863530295e-07, + "loss": 0.1999, + "step": 11739 + }, + { + "epoch": 0.930085165379283, + "grad_norm": 1.307429626907592, + "learning_rate": 2.550213985203076e-07, + "loss": 0.1876, + "step": 11740 + }, + { + "epoch": 0.9301643889879184, + "grad_norm": 1.5232249416873884, + "learning_rate": 2.54445910065263e-07, + "loss": 0.3259, + "step": 11741 + }, + { + "epoch": 0.9302436125965537, + "grad_norm": 1.2806496961754996, + "learning_rate": 2.538710633080621e-07, + "loss": 0.1925, + "step": 11742 + }, + { + "epoch": 0.9303228362051892, + "grad_norm": 1.2686683746155034, + "learning_rate": 2.5329685828655803e-07, + "loss": 0.1877, + "step": 11743 + }, + { + "epoch": 0.9304020598138245, + "grad_norm": 1.3152483961455235, + "learning_rate": 2.527232950385572e-07, + "loss": 0.2267, + "step": 11744 + }, + { + "epoch": 0.93048128342246, + "grad_norm": 1.1581671923904353, + "learning_rate": 2.521503736018249e-07, + "loss": 0.1611, + "step": 11745 + }, + { + "epoch": 0.9305605070310953, + "grad_norm": 1.2851059149252215, + "learning_rate": 2.5157809401408775e-07, + "loss": 0.1614, + "step": 11746 + }, + { + "epoch": 0.9306397306397306, + "grad_norm": 1.642225014022437, + "learning_rate": 2.510064563130277e-07, + "loss": 0.298, + "step": 11747 + }, + { + "epoch": 0.930718954248366, + "grad_norm": 1.1884245041748536, + "learning_rate": 2.5043546053628245e-07, + "loss": 0.1749, + "step": 11748 + }, + { + "epoch": 0.9307981778570014, + "grad_norm": 1.360637026091205, + "learning_rate": 2.498651067214497e-07, + "loss": 0.22, + "step": 11749 + }, + { + "epoch": 0.9308774014656368, + "grad_norm": 1.4865996775409334, + "learning_rate": 2.4929539490608614e-07, + "loss": 0.2048, + "step": 11750 + }, + { + "epoch": 0.9309566250742721, + "grad_norm": 1.4758000955801802, + "learning_rate": 2.487263251277028e-07, + "loss": 0.256, + "step": 11751 + }, + { + "epoch": 0.9310358486829075, + "grad_norm": 1.1446928284980966, + "learning_rate": 2.481578974237697e-07, + "loss": 0.1872, + "step": 11752 + }, + { + "epoch": 0.9311150722915429, + "grad_norm": 1.3106486960672494, + "learning_rate": 2.475901118317181e-07, + "loss": 0.1878, + "step": 11753 + }, + { + "epoch": 0.9311942959001782, + "grad_norm": 1.7531819922993446, + "learning_rate": 2.4702296838893134e-07, + "loss": 0.2608, + "step": 11754 + }, + { + "epoch": 0.9312735195088137, + "grad_norm": 1.2512840045916154, + "learning_rate": 2.464564671327529e-07, + "loss": 0.2088, + "step": 11755 + }, + { + "epoch": 0.931352743117449, + "grad_norm": 1.9232297755269545, + "learning_rate": 2.4589060810048635e-07, + "loss": 0.3096, + "step": 11756 + }, + { + "epoch": 0.9314319667260844, + "grad_norm": 1.3321777398321328, + "learning_rate": 2.453253913293896e-07, + "loss": 0.2394, + "step": 11757 + }, + { + "epoch": 0.9315111903347197, + "grad_norm": 1.397992454416382, + "learning_rate": 2.447608168566784e-07, + "loss": 0.1881, + "step": 11758 + }, + { + "epoch": 0.9315904139433551, + "grad_norm": 1.5472119994517586, + "learning_rate": 2.441968847195286e-07, + "loss": 0.2444, + "step": 11759 + }, + { + "epoch": 0.9316696375519905, + "grad_norm": 1.5574448752611958, + "learning_rate": 2.4363359495507166e-07, + "loss": 0.2581, + "step": 11760 + }, + { + "epoch": 0.9317488611606258, + "grad_norm": 1.245997876680775, + "learning_rate": 2.430709476003978e-07, + "loss": 0.1879, + "step": 11761 + }, + { + "epoch": 0.9318280847692613, + "grad_norm": 1.3332456244341295, + "learning_rate": 2.425089426925553e-07, + "loss": 0.1686, + "step": 11762 + }, + { + "epoch": 0.9319073083778966, + "grad_norm": 1.0194501328885548, + "learning_rate": 2.419475802685489e-07, + "loss": 0.1845, + "step": 11763 + }, + { + "epoch": 0.931986531986532, + "grad_norm": 2.2879964073497865, + "learning_rate": 2.413868603653413e-07, + "loss": 0.202, + "step": 11764 + }, + { + "epoch": 0.9320657555951674, + "grad_norm": 1.4269892370423047, + "learning_rate": 2.4082678301985297e-07, + "loss": 0.2323, + "step": 11765 + }, + { + "epoch": 0.9321449792038027, + "grad_norm": 1.5580562973218195, + "learning_rate": 2.402673482689633e-07, + "loss": 0.2113, + "step": 11766 + }, + { + "epoch": 0.9322242028124381, + "grad_norm": 1.3423216614956248, + "learning_rate": 2.3970855614950827e-07, + "loss": 0.2551, + "step": 11767 + }, + { + "epoch": 0.9323034264210734, + "grad_norm": 1.106902026684924, + "learning_rate": 2.3915040669828084e-07, + "loss": 0.175, + "step": 11768 + }, + { + "epoch": 0.9323826500297089, + "grad_norm": 1.5574270599329445, + "learning_rate": 2.385928999520326e-07, + "loss": 0.2487, + "step": 11769 + }, + { + "epoch": 0.9324618736383442, + "grad_norm": 1.7952622775714446, + "learning_rate": 2.3803603594747427e-07, + "loss": 0.2541, + "step": 11770 + }, + { + "epoch": 0.9325410972469796, + "grad_norm": 0.9879754028872343, + "learning_rate": 2.374798147212698e-07, + "loss": 0.1352, + "step": 11771 + }, + { + "epoch": 0.932620320855615, + "grad_norm": 1.454845325079651, + "learning_rate": 2.3692423631004658e-07, + "loss": 0.2332, + "step": 11772 + }, + { + "epoch": 0.9326995444642503, + "grad_norm": 1.411908066399433, + "learning_rate": 2.3636930075038534e-07, + "loss": 0.2538, + "step": 11773 + }, + { + "epoch": 0.9327787680728857, + "grad_norm": 1.0538057797970626, + "learning_rate": 2.3581500807882462e-07, + "loss": 0.1435, + "step": 11774 + }, + { + "epoch": 0.9328579916815211, + "grad_norm": 1.0969157894523553, + "learning_rate": 2.3526135833186527e-07, + "loss": 0.163, + "step": 11775 + }, + { + "epoch": 0.9329372152901565, + "grad_norm": 1.3326822905880578, + "learning_rate": 2.3470835154595918e-07, + "loss": 0.2275, + "step": 11776 + }, + { + "epoch": 0.9330164388987918, + "grad_norm": 1.5433416754030453, + "learning_rate": 2.3415598775752057e-07, + "loss": 0.2288, + "step": 11777 + }, + { + "epoch": 0.9330956625074273, + "grad_norm": 1.341940863702087, + "learning_rate": 2.3360426700292038e-07, + "loss": 0.2157, + "step": 11778 + }, + { + "epoch": 0.9331748861160626, + "grad_norm": 1.166842390274979, + "learning_rate": 2.330531893184873e-07, + "loss": 0.1794, + "step": 11779 + }, + { + "epoch": 0.9332541097246979, + "grad_norm": 1.4192506291092686, + "learning_rate": 2.3250275474050565e-07, + "loss": 0.2002, + "step": 11780 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 1.424225249593969, + "learning_rate": 2.3195296330521756e-07, + "loss": 0.2512, + "step": 11781 + }, + { + "epoch": 0.9334125569419687, + "grad_norm": 0.9888248994080405, + "learning_rate": 2.3140381504882736e-07, + "loss": 0.1083, + "step": 11782 + }, + { + "epoch": 0.9334917805506041, + "grad_norm": 1.1909576708458314, + "learning_rate": 2.3085531000749285e-07, + "loss": 0.1856, + "step": 11783 + }, + { + "epoch": 0.9335710041592394, + "grad_norm": 1.1589324636910048, + "learning_rate": 2.3030744821732953e-07, + "loss": 0.1711, + "step": 11784 + }, + { + "epoch": 0.9336502277678749, + "grad_norm": 1.568533870473859, + "learning_rate": 2.297602297144119e-07, + "loss": 0.2569, + "step": 11785 + }, + { + "epoch": 0.9337294513765102, + "grad_norm": 1.8304215803466244, + "learning_rate": 2.2921365453477229e-07, + "loss": 0.2194, + "step": 11786 + }, + { + "epoch": 0.9338086749851455, + "grad_norm": 1.5067639696282564, + "learning_rate": 2.286677227143985e-07, + "loss": 0.2173, + "step": 11787 + }, + { + "epoch": 0.933887898593781, + "grad_norm": 1.4508389071037184, + "learning_rate": 2.2812243428923964e-07, + "loss": 0.252, + "step": 11788 + }, + { + "epoch": 0.9339671222024163, + "grad_norm": 1.2177733074988366, + "learning_rate": 2.2757778929519914e-07, + "loss": 0.1851, + "step": 11789 + }, + { + "epoch": 0.9340463458110517, + "grad_norm": 1.3164968723123085, + "learning_rate": 2.2703378776813833e-07, + "loss": 0.2263, + "step": 11790 + }, + { + "epoch": 0.934125569419687, + "grad_norm": 1.2687385192977794, + "learning_rate": 2.2649042974387858e-07, + "loss": 0.1925, + "step": 11791 + }, + { + "epoch": 0.9342047930283225, + "grad_norm": 1.2555332335414413, + "learning_rate": 2.259477152581979e-07, + "loss": 0.234, + "step": 11792 + }, + { + "epoch": 0.9342840166369578, + "grad_norm": 1.3827845142854447, + "learning_rate": 2.2540564434682998e-07, + "loss": 0.1958, + "step": 11793 + }, + { + "epoch": 0.9343632402455931, + "grad_norm": 1.1396192031512402, + "learning_rate": 2.2486421704546623e-07, + "loss": 0.2036, + "step": 11794 + }, + { + "epoch": 0.9344424638542286, + "grad_norm": 1.46414449126329, + "learning_rate": 2.2432343338976038e-07, + "loss": 0.2115, + "step": 11795 + }, + { + "epoch": 0.9345216874628639, + "grad_norm": 1.6883736874608692, + "learning_rate": 2.2378329341531946e-07, + "loss": 0.2014, + "step": 11796 + }, + { + "epoch": 0.9346009110714993, + "grad_norm": 1.4931527294248408, + "learning_rate": 2.2324379715770728e-07, + "loss": 0.2402, + "step": 11797 + }, + { + "epoch": 0.9346801346801347, + "grad_norm": 1.7027946803534397, + "learning_rate": 2.2270494465244874e-07, + "loss": 0.2599, + "step": 11798 + }, + { + "epoch": 0.9347593582887701, + "grad_norm": 1.1970302088718698, + "learning_rate": 2.2216673593502437e-07, + "loss": 0.1852, + "step": 11799 + }, + { + "epoch": 0.9348385818974054, + "grad_norm": 1.1463715935250338, + "learning_rate": 2.2162917104087245e-07, + "loss": 0.1987, + "step": 11800 + }, + { + "epoch": 0.9349178055060408, + "grad_norm": 1.45743511857333, + "learning_rate": 2.2109225000538915e-07, + "loss": 0.2497, + "step": 11801 + }, + { + "epoch": 0.9349970291146762, + "grad_norm": 1.3384941021037486, + "learning_rate": 2.2055597286392838e-07, + "loss": 0.1948, + "step": 11802 + }, + { + "epoch": 0.9350762527233115, + "grad_norm": 1.1027871857955747, + "learning_rate": 2.200203396517997e-07, + "loss": 0.1413, + "step": 11803 + }, + { + "epoch": 0.935155476331947, + "grad_norm": 1.4002335100430188, + "learning_rate": 2.19485350404276e-07, + "loss": 0.2083, + "step": 11804 + }, + { + "epoch": 0.9352346999405823, + "grad_norm": 1.5358531501775807, + "learning_rate": 2.1895100515658019e-07, + "loss": 0.2254, + "step": 11805 + }, + { + "epoch": 0.9353139235492177, + "grad_norm": 1.417322382088549, + "learning_rate": 2.1841730394389527e-07, + "loss": 0.2401, + "step": 11806 + }, + { + "epoch": 0.935393147157853, + "grad_norm": 1.2639871788483603, + "learning_rate": 2.1788424680136756e-07, + "loss": 0.1979, + "step": 11807 + }, + { + "epoch": 0.9354723707664884, + "grad_norm": 1.3536508211154703, + "learning_rate": 2.173518337640923e-07, + "loss": 0.1954, + "step": 11808 + }, + { + "epoch": 0.9355515943751238, + "grad_norm": 2.1250739121836664, + "learning_rate": 2.1682006486712703e-07, + "loss": 0.2318, + "step": 11809 + }, + { + "epoch": 0.9356308179837591, + "grad_norm": 1.543122021361864, + "learning_rate": 2.1628894014548819e-07, + "loss": 0.2467, + "step": 11810 + }, + { + "epoch": 0.9357100415923946, + "grad_norm": 1.3945230761445655, + "learning_rate": 2.1575845963414555e-07, + "loss": 0.2167, + "step": 11811 + }, + { + "epoch": 0.9357892652010299, + "grad_norm": 1.4518405474756406, + "learning_rate": 2.1522862336803008e-07, + "loss": 0.2723, + "step": 11812 + }, + { + "epoch": 0.9358684888096653, + "grad_norm": 1.1833232743575877, + "learning_rate": 2.146994313820283e-07, + "loss": 0.2198, + "step": 11813 + }, + { + "epoch": 0.9359477124183007, + "grad_norm": 1.192544749969018, + "learning_rate": 2.141708837109846e-07, + "loss": 0.1932, + "step": 11814 + }, + { + "epoch": 0.936026936026936, + "grad_norm": 1.2390482454080411, + "learning_rate": 2.136429803897022e-07, + "loss": 0.2383, + "step": 11815 + }, + { + "epoch": 0.9361061596355714, + "grad_norm": 1.5052267080951667, + "learning_rate": 2.1311572145294114e-07, + "loss": 0.2986, + "step": 11816 + }, + { + "epoch": 0.9361853832442067, + "grad_norm": 1.0716443373011708, + "learning_rate": 2.1258910693541802e-07, + "loss": 0.1591, + "step": 11817 + }, + { + "epoch": 0.9362646068528422, + "grad_norm": 1.2531648900834316, + "learning_rate": 2.1206313687180845e-07, + "loss": 0.2283, + "step": 11818 + }, + { + "epoch": 0.9363438304614775, + "grad_norm": 1.0294217551869718, + "learning_rate": 2.1153781129674367e-07, + "loss": 0.1538, + "step": 11819 + }, + { + "epoch": 0.936423054070113, + "grad_norm": 1.825043959452312, + "learning_rate": 2.1101313024481595e-07, + "loss": 0.2709, + "step": 11820 + }, + { + "epoch": 0.9365022776787483, + "grad_norm": 1.4082009893038205, + "learning_rate": 2.1048909375057103e-07, + "loss": 0.2025, + "step": 11821 + }, + { + "epoch": 0.9365815012873836, + "grad_norm": 1.0714974453063149, + "learning_rate": 2.0996570184851572e-07, + "loss": 0.1522, + "step": 11822 + }, + { + "epoch": 0.936660724896019, + "grad_norm": 1.3841663975626486, + "learning_rate": 2.0944295457311247e-07, + "loss": 0.2994, + "step": 11823 + }, + { + "epoch": 0.9367399485046544, + "grad_norm": 1.4817367386534586, + "learning_rate": 2.0892085195878154e-07, + "loss": 0.3112, + "step": 11824 + }, + { + "epoch": 0.9368191721132898, + "grad_norm": 0.9474804587485585, + "learning_rate": 2.0839939403989984e-07, + "loss": 0.1766, + "step": 11825 + }, + { + "epoch": 0.9368983957219251, + "grad_norm": 1.392698330739467, + "learning_rate": 2.078785808508055e-07, + "loss": 0.1892, + "step": 11826 + }, + { + "epoch": 0.9369776193305605, + "grad_norm": 1.3232401112575685, + "learning_rate": 2.0735841242578992e-07, + "loss": 0.249, + "step": 11827 + }, + { + "epoch": 0.9370568429391959, + "grad_norm": 1.337518976621295, + "learning_rate": 2.068388887991013e-07, + "loss": 0.248, + "step": 11828 + }, + { + "epoch": 0.9371360665478312, + "grad_norm": 1.5996375940632537, + "learning_rate": 2.0632001000495228e-07, + "loss": 0.2656, + "step": 11829 + }, + { + "epoch": 0.9372152901564667, + "grad_norm": 1.1307932127574574, + "learning_rate": 2.0580177607750663e-07, + "loss": 0.1834, + "step": 11830 + }, + { + "epoch": 0.937294513765102, + "grad_norm": 1.2999057101886673, + "learning_rate": 2.0528418705088592e-07, + "loss": 0.2439, + "step": 11831 + }, + { + "epoch": 0.9373737373737374, + "grad_norm": 1.4831531820373913, + "learning_rate": 2.0476724295917294e-07, + "loss": 0.2556, + "step": 11832 + }, + { + "epoch": 0.9374529609823727, + "grad_norm": 1.173939555744418, + "learning_rate": 2.04250943836406e-07, + "loss": 0.1857, + "step": 11833 + }, + { + "epoch": 0.9375321845910081, + "grad_norm": 1.2739064473636263, + "learning_rate": 2.0373528971658009e-07, + "loss": 0.2136, + "step": 11834 + }, + { + "epoch": 0.9376114081996435, + "grad_norm": 1.272453047747186, + "learning_rate": 2.0322028063364806e-07, + "loss": 0.2141, + "step": 11835 + }, + { + "epoch": 0.9376906318082788, + "grad_norm": 1.4034895278096364, + "learning_rate": 2.0270591662152173e-07, + "loss": 0.2449, + "step": 11836 + }, + { + "epoch": 0.9377698554169143, + "grad_norm": 1.8051136931278835, + "learning_rate": 2.0219219771406952e-07, + "loss": 0.269, + "step": 11837 + }, + { + "epoch": 0.9378490790255496, + "grad_norm": 1.4023868618819069, + "learning_rate": 2.0167912394511657e-07, + "loss": 0.2285, + "step": 11838 + }, + { + "epoch": 0.937928302634185, + "grad_norm": 1.1885430928745575, + "learning_rate": 2.01166695348447e-07, + "loss": 0.2219, + "step": 11839 + }, + { + "epoch": 0.9380075262428204, + "grad_norm": 1.334351575441163, + "learning_rate": 2.0065491195780163e-07, + "loss": 0.2218, + "step": 11840 + }, + { + "epoch": 0.9380867498514557, + "grad_norm": 1.4406217702623154, + "learning_rate": 2.00143773806879e-07, + "loss": 0.2303, + "step": 11841 + }, + { + "epoch": 0.9381659734600911, + "grad_norm": 1.329706289025817, + "learning_rate": 1.9963328092933444e-07, + "loss": 0.234, + "step": 11842 + }, + { + "epoch": 0.9382451970687264, + "grad_norm": 1.662954207655982, + "learning_rate": 1.9912343335878326e-07, + "loss": 0.2913, + "step": 11843 + }, + { + "epoch": 0.9383244206773619, + "grad_norm": 1.2153611402017082, + "learning_rate": 1.9861423112879308e-07, + "loss": 0.1916, + "step": 11844 + }, + { + "epoch": 0.9384036442859972, + "grad_norm": 1.5296047956071133, + "learning_rate": 1.9810567427289596e-07, + "loss": 0.2323, + "step": 11845 + }, + { + "epoch": 0.9384828678946326, + "grad_norm": 1.156034604355825, + "learning_rate": 1.9759776282457731e-07, + "loss": 0.1302, + "step": 11846 + }, + { + "epoch": 0.938562091503268, + "grad_norm": 1.0828072375206133, + "learning_rate": 1.970904968172771e-07, + "loss": 0.1434, + "step": 11847 + }, + { + "epoch": 0.9386413151119033, + "grad_norm": 1.2738723994992955, + "learning_rate": 1.965838762844019e-07, + "loss": 0.2556, + "step": 11848 + }, + { + "epoch": 0.9387205387205387, + "grad_norm": 1.4287699753356409, + "learning_rate": 1.9607790125930614e-07, + "loss": 0.2085, + "step": 11849 + }, + { + "epoch": 0.9387997623291741, + "grad_norm": 1.6615611135323411, + "learning_rate": 1.9557257177530763e-07, + "loss": 0.3128, + "step": 11850 + }, + { + "epoch": 0.9388789859378095, + "grad_norm": 1.0842100642350225, + "learning_rate": 1.9506788786567865e-07, + "loss": 0.1545, + "step": 11851 + }, + { + "epoch": 0.9389582095464448, + "grad_norm": 1.461056663648525, + "learning_rate": 1.9456384956365149e-07, + "loss": 0.2749, + "step": 11852 + }, + { + "epoch": 0.9390374331550803, + "grad_norm": 1.5240099852363538, + "learning_rate": 1.9406045690241404e-07, + "loss": 0.221, + "step": 11853 + }, + { + "epoch": 0.9391166567637156, + "grad_norm": 1.1995326697564606, + "learning_rate": 1.935577099151109e-07, + "loss": 0.1955, + "step": 11854 + }, + { + "epoch": 0.9391958803723509, + "grad_norm": 1.4228484989316121, + "learning_rate": 1.9305560863484896e-07, + "loss": 0.2337, + "step": 11855 + }, + { + "epoch": 0.9392751039809863, + "grad_norm": 1.3293574139390814, + "learning_rate": 1.9255415309468618e-07, + "loss": 0.1741, + "step": 11856 + }, + { + "epoch": 0.9393543275896217, + "grad_norm": 1.653499143709913, + "learning_rate": 1.920533433276417e-07, + "loss": 0.2914, + "step": 11857 + }, + { + "epoch": 0.9394335511982571, + "grad_norm": 1.2517050235985543, + "learning_rate": 1.9155317936669248e-07, + "loss": 0.198, + "step": 11858 + }, + { + "epoch": 0.9395127748068924, + "grad_norm": 1.5424045510061735, + "learning_rate": 1.910536612447711e-07, + "loss": 0.2703, + "step": 11859 + }, + { + "epoch": 0.9395919984155279, + "grad_norm": 1.1542399376246517, + "learning_rate": 1.9055478899476788e-07, + "loss": 0.1434, + "step": 11860 + }, + { + "epoch": 0.9396712220241632, + "grad_norm": 1.624431413875575, + "learning_rate": 1.900565626495332e-07, + "loss": 0.2699, + "step": 11861 + }, + { + "epoch": 0.9397504456327985, + "grad_norm": 1.2519920728417075, + "learning_rate": 1.8955898224187086e-07, + "loss": 0.2199, + "step": 11862 + }, + { + "epoch": 0.939829669241434, + "grad_norm": 1.556699824477557, + "learning_rate": 1.890620478045435e-07, + "loss": 0.293, + "step": 11863 + }, + { + "epoch": 0.9399088928500693, + "grad_norm": 1.062172475506258, + "learning_rate": 1.8856575937027388e-07, + "loss": 0.1908, + "step": 11864 + }, + { + "epoch": 0.9399881164587047, + "grad_norm": 1.3599914032952658, + "learning_rate": 1.8807011697174027e-07, + "loss": 0.2473, + "step": 11865 + }, + { + "epoch": 0.94006734006734, + "grad_norm": 1.470812827272954, + "learning_rate": 1.8757512064157658e-07, + "loss": 0.3064, + "step": 11866 + }, + { + "epoch": 0.9401465636759755, + "grad_norm": 1.6214258861285569, + "learning_rate": 1.870807704123756e-07, + "loss": 0.275, + "step": 11867 + }, + { + "epoch": 0.9402257872846108, + "grad_norm": 1.3242406876468993, + "learning_rate": 1.8658706631669133e-07, + "loss": 0.1892, + "step": 11868 + }, + { + "epoch": 0.9403050108932461, + "grad_norm": 1.4675579235927214, + "learning_rate": 1.8609400838702884e-07, + "loss": 0.1647, + "step": 11869 + }, + { + "epoch": 0.9403842345018816, + "grad_norm": 0.9899325417985122, + "learning_rate": 1.856015966558533e-07, + "loss": 0.1272, + "step": 11870 + }, + { + "epoch": 0.9404634581105169, + "grad_norm": 1.2658252014544822, + "learning_rate": 1.8510983115558988e-07, + "loss": 0.1425, + "step": 11871 + }, + { + "epoch": 0.9405426817191523, + "grad_norm": 1.0409987464124033, + "learning_rate": 1.8461871191861825e-07, + "loss": 0.1782, + "step": 11872 + }, + { + "epoch": 0.9406219053277877, + "grad_norm": 1.3545409745235588, + "learning_rate": 1.8412823897727473e-07, + "loss": 0.2687, + "step": 11873 + }, + { + "epoch": 0.9407011289364231, + "grad_norm": 1.2156091717253168, + "learning_rate": 1.8363841236385571e-07, + "loss": 0.1493, + "step": 11874 + }, + { + "epoch": 0.9407803525450584, + "grad_norm": 1.231647848831258, + "learning_rate": 1.8314923211061542e-07, + "loss": 0.1705, + "step": 11875 + }, + { + "epoch": 0.9408595761536938, + "grad_norm": 1.1761801458719676, + "learning_rate": 1.826606982497603e-07, + "loss": 0.1718, + "step": 11876 + }, + { + "epoch": 0.9409387997623292, + "grad_norm": 1.5123812859875192, + "learning_rate": 1.8217281081346238e-07, + "loss": 0.2675, + "step": 11877 + }, + { + "epoch": 0.9410180233709645, + "grad_norm": 1.1870596934246855, + "learning_rate": 1.8168556983384377e-07, + "loss": 0.1693, + "step": 11878 + }, + { + "epoch": 0.9410972469796, + "grad_norm": 1.1783524117068334, + "learning_rate": 1.811989753429877e-07, + "loss": 0.2107, + "step": 11879 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 1.436980441136393, + "learning_rate": 1.8071302737293294e-07, + "loss": 0.2459, + "step": 11880 + }, + { + "epoch": 0.9412556941968707, + "grad_norm": 1.1959506585518205, + "learning_rate": 1.802277259556784e-07, + "loss": 0.1734, + "step": 11881 + }, + { + "epoch": 0.941334917805506, + "grad_norm": 1.3158445148595443, + "learning_rate": 1.7974307112317957e-07, + "loss": 0.1918, + "step": 11882 + }, + { + "epoch": 0.9414141414141414, + "grad_norm": 1.1075428525038007, + "learning_rate": 1.7925906290734653e-07, + "loss": 0.1592, + "step": 11883 + }, + { + "epoch": 0.9414933650227768, + "grad_norm": 1.0829546841492328, + "learning_rate": 1.787757013400504e-07, + "loss": 0.1682, + "step": 11884 + }, + { + "epoch": 0.9415725886314121, + "grad_norm": 1.3924413530700357, + "learning_rate": 1.7829298645311688e-07, + "loss": 0.2251, + "step": 11885 + }, + { + "epoch": 0.9416518122400476, + "grad_norm": 1.4827696162012518, + "learning_rate": 1.7781091827833164e-07, + "loss": 0.2148, + "step": 11886 + }, + { + "epoch": 0.9417310358486829, + "grad_norm": 1.2280209201682328, + "learning_rate": 1.7732949684743593e-07, + "loss": 0.2367, + "step": 11887 + }, + { + "epoch": 0.9418102594573183, + "grad_norm": 1.101994998358428, + "learning_rate": 1.768487221921278e-07, + "loss": 0.147, + "step": 11888 + }, + { + "epoch": 0.9418894830659537, + "grad_norm": 1.5072352873412898, + "learning_rate": 1.763685943440674e-07, + "loss": 0.2646, + "step": 11889 + }, + { + "epoch": 0.941968706674589, + "grad_norm": 1.4695441262736455, + "learning_rate": 1.7588911333486614e-07, + "loss": 0.2227, + "step": 11890 + }, + { + "epoch": 0.9420479302832244, + "grad_norm": 1.4941616695678268, + "learning_rate": 1.7541027919609545e-07, + "loss": 0.21, + "step": 11891 + }, + { + "epoch": 0.9421271538918597, + "grad_norm": 1.3070088914423765, + "learning_rate": 1.7493209195928562e-07, + "loss": 0.1709, + "step": 11892 + }, + { + "epoch": 0.9422063775004952, + "grad_norm": 1.0303539809339672, + "learning_rate": 1.7445455165592262e-07, + "loss": 0.1516, + "step": 11893 + }, + { + "epoch": 0.9422856011091305, + "grad_norm": 1.5832898035687808, + "learning_rate": 1.7397765831744905e-07, + "loss": 0.2752, + "step": 11894 + }, + { + "epoch": 0.942364824717766, + "grad_norm": 1.6027844603903292, + "learning_rate": 1.7350141197526648e-07, + "loss": 0.2743, + "step": 11895 + }, + { + "epoch": 0.9424440483264013, + "grad_norm": 1.2047544621482034, + "learning_rate": 1.7302581266073537e-07, + "loss": 0.1545, + "step": 11896 + }, + { + "epoch": 0.9425232719350366, + "grad_norm": 0.9941809397859586, + "learning_rate": 1.7255086040516954e-07, + "loss": 0.1159, + "step": 11897 + }, + { + "epoch": 0.942602495543672, + "grad_norm": 1.3059711228896056, + "learning_rate": 1.7207655523984179e-07, + "loss": 0.1886, + "step": 11898 + }, + { + "epoch": 0.9426817191523074, + "grad_norm": 1.7801572217739896, + "learning_rate": 1.71602897195986e-07, + "loss": 0.2573, + "step": 11899 + }, + { + "epoch": 0.9427609427609428, + "grad_norm": 1.356885211025602, + "learning_rate": 1.711298863047872e-07, + "loss": 0.1654, + "step": 11900 + }, + { + "epoch": 0.9428401663695781, + "grad_norm": 1.8828074228791027, + "learning_rate": 1.7065752259739056e-07, + "loss": 0.3406, + "step": 11901 + }, + { + "epoch": 0.9429193899782136, + "grad_norm": 1.378281485369158, + "learning_rate": 1.701858061049022e-07, + "loss": 0.2399, + "step": 11902 + }, + { + "epoch": 0.9429986135868489, + "grad_norm": 1.7622914263581961, + "learning_rate": 1.697147368583796e-07, + "loss": 0.3128, + "step": 11903 + }, + { + "epoch": 0.9430778371954842, + "grad_norm": 1.1779012537425344, + "learning_rate": 1.692443148888412e-07, + "loss": 0.2598, + "step": 11904 + }, + { + "epoch": 0.9431570608041197, + "grad_norm": 1.2626440688762763, + "learning_rate": 1.6877454022726225e-07, + "loss": 0.1856, + "step": 11905 + }, + { + "epoch": 0.943236284412755, + "grad_norm": 1.021109736272606, + "learning_rate": 1.6830541290457468e-07, + "loss": 0.1653, + "step": 11906 + }, + { + "epoch": 0.9433155080213904, + "grad_norm": 1.2120086583644574, + "learning_rate": 1.6783693295166935e-07, + "loss": 0.2067, + "step": 11907 + }, + { + "epoch": 0.9433947316300257, + "grad_norm": 1.3630737181410553, + "learning_rate": 1.6736910039939159e-07, + "loss": 0.2496, + "step": 11908 + }, + { + "epoch": 0.9434739552386611, + "grad_norm": 1.5236820255868397, + "learning_rate": 1.6690191527854782e-07, + "loss": 0.1782, + "step": 11909 + }, + { + "epoch": 0.9435531788472965, + "grad_norm": 1.2197213354954548, + "learning_rate": 1.6643537761989904e-07, + "loss": 0.2251, + "step": 11910 + }, + { + "epoch": 0.9436324024559318, + "grad_norm": 1.3848311345328754, + "learning_rate": 1.6596948745416397e-07, + "loss": 0.2144, + "step": 11911 + }, + { + "epoch": 0.9437116260645673, + "grad_norm": 1.1850209270462837, + "learning_rate": 1.6550424481202032e-07, + "loss": 0.2018, + "step": 11912 + }, + { + "epoch": 0.9437908496732026, + "grad_norm": 2.039681188470805, + "learning_rate": 1.6503964972410136e-07, + "loss": 0.3152, + "step": 11913 + }, + { + "epoch": 0.943870073281838, + "grad_norm": 1.289540182328846, + "learning_rate": 1.6457570222099816e-07, + "loss": 0.1948, + "step": 11914 + }, + { + "epoch": 0.9439492968904734, + "grad_norm": 1.3152030920513889, + "learning_rate": 1.6411240233326076e-07, + "loss": 0.2046, + "step": 11915 + }, + { + "epoch": 0.9440285204991087, + "grad_norm": 1.4460436261278127, + "learning_rate": 1.6364975009139473e-07, + "loss": 0.2182, + "step": 11916 + }, + { + "epoch": 0.9441077441077441, + "grad_norm": 1.1534976684157143, + "learning_rate": 1.6318774552586237e-07, + "loss": 0.1397, + "step": 11917 + }, + { + "epoch": 0.9441869677163794, + "grad_norm": 1.2984169973201547, + "learning_rate": 1.627263886670849e-07, + "loss": 0.2758, + "step": 11918 + }, + { + "epoch": 0.9442661913250149, + "grad_norm": 1.4242951790041694, + "learning_rate": 1.6226567954544248e-07, + "loss": 0.2169, + "step": 11919 + }, + { + "epoch": 0.9443454149336502, + "grad_norm": 1.2893162730102112, + "learning_rate": 1.618056181912675e-07, + "loss": 0.2592, + "step": 11920 + }, + { + "epoch": 0.9444246385422856, + "grad_norm": 0.99595852530852, + "learning_rate": 1.6134620463485352e-07, + "loss": 0.1254, + "step": 11921 + }, + { + "epoch": 0.944503862150921, + "grad_norm": 1.468256810202704, + "learning_rate": 1.6088743890645297e-07, + "loss": 0.2421, + "step": 11922 + }, + { + "epoch": 0.9445830857595563, + "grad_norm": 1.4257736491814927, + "learning_rate": 1.6042932103627174e-07, + "loss": 0.2402, + "step": 11923 + }, + { + "epoch": 0.9446623093681917, + "grad_norm": 1.190154964733546, + "learning_rate": 1.5997185105447344e-07, + "loss": 0.2205, + "step": 11924 + }, + { + "epoch": 0.9447415329768271, + "grad_norm": 1.3351385482760902, + "learning_rate": 1.5951502899118176e-07, + "loss": 0.1855, + "step": 11925 + }, + { + "epoch": 0.9448207565854625, + "grad_norm": 1.1923291436262913, + "learning_rate": 1.590588548764771e-07, + "loss": 0.1585, + "step": 11926 + }, + { + "epoch": 0.9448999801940978, + "grad_norm": 1.225062720953491, + "learning_rate": 1.586033287403943e-07, + "loss": 0.1951, + "step": 11927 + }, + { + "epoch": 0.9449792038027333, + "grad_norm": 1.0672946209416674, + "learning_rate": 1.5814845061292938e-07, + "loss": 0.1606, + "step": 11928 + }, + { + "epoch": 0.9450584274113686, + "grad_norm": 1.1799627462184714, + "learning_rate": 1.5769422052403172e-07, + "loss": 0.1642, + "step": 11929 + }, + { + "epoch": 0.9451376510200039, + "grad_norm": 1.323578162600058, + "learning_rate": 1.572406385036118e-07, + "loss": 0.2022, + "step": 11930 + }, + { + "epoch": 0.9452168746286393, + "grad_norm": 1.6741497543502415, + "learning_rate": 1.5678770458153693e-07, + "loss": 0.2528, + "step": 11931 + }, + { + "epoch": 0.9452960982372747, + "grad_norm": 1.2580078193246054, + "learning_rate": 1.563354187876287e-07, + "loss": 0.2059, + "step": 11932 + }, + { + "epoch": 0.9453753218459101, + "grad_norm": 1.2188005409246228, + "learning_rate": 1.558837811516667e-07, + "loss": 0.1993, + "step": 11933 + }, + { + "epoch": 0.9454545454545454, + "grad_norm": 1.1386068903986977, + "learning_rate": 1.5543279170339265e-07, + "loss": 0.1405, + "step": 11934 + }, + { + "epoch": 0.9455337690631809, + "grad_norm": 1.4518773886646272, + "learning_rate": 1.5498245047249948e-07, + "loss": 0.2792, + "step": 11935 + }, + { + "epoch": 0.9456129926718162, + "grad_norm": 1.0379932828164715, + "learning_rate": 1.5453275748864128e-07, + "loss": 0.181, + "step": 11936 + }, + { + "epoch": 0.9456922162804515, + "grad_norm": 1.2527407608144574, + "learning_rate": 1.5408371278142652e-07, + "loss": 0.2136, + "step": 11937 + }, + { + "epoch": 0.945771439889087, + "grad_norm": 1.133051970395139, + "learning_rate": 1.5363531638042494e-07, + "loss": 0.1694, + "step": 11938 + }, + { + "epoch": 0.9458506634977223, + "grad_norm": 0.9453520298459362, + "learning_rate": 1.5318756831516069e-07, + "loss": 0.1444, + "step": 11939 + }, + { + "epoch": 0.9459298871063577, + "grad_norm": 1.1661787678538582, + "learning_rate": 1.5274046861511348e-07, + "loss": 0.1334, + "step": 11940 + }, + { + "epoch": 0.946009110714993, + "grad_norm": 1.7486606422793853, + "learning_rate": 1.5229401730972536e-07, + "loss": 0.2351, + "step": 11941 + }, + { + "epoch": 0.9460883343236285, + "grad_norm": 1.596072679826886, + "learning_rate": 1.518482144283917e-07, + "loss": 0.318, + "step": 11942 + }, + { + "epoch": 0.9461675579322638, + "grad_norm": 1.4173459579703596, + "learning_rate": 1.514030600004668e-07, + "loss": 0.2463, + "step": 11943 + }, + { + "epoch": 0.9462467815408991, + "grad_norm": 1.3730255622837788, + "learning_rate": 1.5095855405526272e-07, + "loss": 0.2797, + "step": 11944 + }, + { + "epoch": 0.9463260051495346, + "grad_norm": 1.3236403827608467, + "learning_rate": 1.505146966220461e-07, + "loss": 0.1998, + "step": 11945 + }, + { + "epoch": 0.9464052287581699, + "grad_norm": 1.389233998474259, + "learning_rate": 1.5007148773004466e-07, + "loss": 0.2002, + "step": 11946 + }, + { + "epoch": 0.9464844523668053, + "grad_norm": 1.7784165772092895, + "learning_rate": 1.496289274084417e-07, + "loss": 0.2197, + "step": 11947 + }, + { + "epoch": 0.9465636759754407, + "grad_norm": 1.5493614019702404, + "learning_rate": 1.4918701568637618e-07, + "loss": 0.2967, + "step": 11948 + }, + { + "epoch": 0.9466428995840761, + "grad_norm": 1.2418625160201957, + "learning_rate": 1.4874575259294588e-07, + "loss": 0.2176, + "step": 11949 + }, + { + "epoch": 0.9467221231927114, + "grad_norm": 1.4411002547260734, + "learning_rate": 1.483051381572076e-07, + "loss": 0.1918, + "step": 11950 + }, + { + "epoch": 0.9468013468013468, + "grad_norm": 1.1050156657550234, + "learning_rate": 1.4786517240817255e-07, + "loss": 0.1983, + "step": 11951 + }, + { + "epoch": 0.9468805704099822, + "grad_norm": 1.3573581722178696, + "learning_rate": 1.474258553748098e-07, + "loss": 0.2327, + "step": 11952 + }, + { + "epoch": 0.9469597940186175, + "grad_norm": 2.149163762593121, + "learning_rate": 1.469871870860473e-07, + "loss": 0.2086, + "step": 11953 + }, + { + "epoch": 0.947039017627253, + "grad_norm": 1.2791287067992043, + "learning_rate": 1.4654916757076865e-07, + "loss": 0.2405, + "step": 11954 + }, + { + "epoch": 0.9471182412358883, + "grad_norm": 1.6083277190772298, + "learning_rate": 1.461117968578163e-07, + "loss": 0.204, + "step": 11955 + }, + { + "epoch": 0.9471974648445237, + "grad_norm": 1.3285162745799173, + "learning_rate": 1.4567507497598722e-07, + "loss": 0.2018, + "step": 11956 + }, + { + "epoch": 0.947276688453159, + "grad_norm": 1.7184821307080291, + "learning_rate": 1.452390019540384e-07, + "loss": 0.2823, + "step": 11957 + }, + { + "epoch": 0.9473559120617944, + "grad_norm": 1.6791100265502252, + "learning_rate": 1.4480357782068467e-07, + "loss": 0.1741, + "step": 11958 + }, + { + "epoch": 0.9474351356704298, + "grad_norm": 1.293959205928631, + "learning_rate": 1.4436880260459307e-07, + "loss": 0.2105, + "step": 11959 + }, + { + "epoch": 0.9475143592790651, + "grad_norm": 1.5912953725969694, + "learning_rate": 1.4393467633439629e-07, + "loss": 0.2497, + "step": 11960 + }, + { + "epoch": 0.9475935828877006, + "grad_norm": 1.2873370480244775, + "learning_rate": 1.4350119903867477e-07, + "loss": 0.2066, + "step": 11961 + }, + { + "epoch": 0.9476728064963359, + "grad_norm": 1.3278285073873823, + "learning_rate": 1.4306837074597235e-07, + "loss": 0.1883, + "step": 11962 + }, + { + "epoch": 0.9477520301049713, + "grad_norm": 1.4386089560988997, + "learning_rate": 1.426361914847907e-07, + "loss": 0.2089, + "step": 11963 + }, + { + "epoch": 0.9478312537136067, + "grad_norm": 1.2522204529530243, + "learning_rate": 1.422046612835848e-07, + "loss": 0.2395, + "step": 11964 + }, + { + "epoch": 0.947910477322242, + "grad_norm": 1.4331094810636582, + "learning_rate": 1.417737801707686e-07, + "loss": 0.2941, + "step": 11965 + }, + { + "epoch": 0.9479897009308774, + "grad_norm": 1.2559555286142803, + "learning_rate": 1.4134354817471497e-07, + "loss": 0.1773, + "step": 11966 + }, + { + "epoch": 0.9480689245395127, + "grad_norm": 1.5234539943660057, + "learning_rate": 1.4091396532375123e-07, + "loss": 0.2936, + "step": 11967 + }, + { + "epoch": 0.9481481481481482, + "grad_norm": 1.4017798717037955, + "learning_rate": 1.4048503164616367e-07, + "loss": 0.1725, + "step": 11968 + }, + { + "epoch": 0.9482273717567835, + "grad_norm": 1.7403147759700393, + "learning_rate": 1.4005674717019746e-07, + "loss": 0.3533, + "step": 11969 + }, + { + "epoch": 0.948306595365419, + "grad_norm": 1.3541541850831256, + "learning_rate": 1.3962911192405004e-07, + "loss": 0.2474, + "step": 11970 + }, + { + "epoch": 0.9483858189740543, + "grad_norm": 1.475853449203787, + "learning_rate": 1.3920212593588113e-07, + "loss": 0.2532, + "step": 11971 + }, + { + "epoch": 0.9484650425826896, + "grad_norm": 1.5552424109613492, + "learning_rate": 1.3877578923380486e-07, + "loss": 0.1956, + "step": 11972 + }, + { + "epoch": 0.948544266191325, + "grad_norm": 1.80668159592404, + "learning_rate": 1.3835010184589325e-07, + "loss": 0.2312, + "step": 11973 + }, + { + "epoch": 0.9486234897999604, + "grad_norm": 1.411971935682486, + "learning_rate": 1.3792506380017612e-07, + "loss": 0.2772, + "step": 11974 + }, + { + "epoch": 0.9487027134085958, + "grad_norm": 1.3883863741424434, + "learning_rate": 1.3750067512464105e-07, + "loss": 0.1856, + "step": 11975 + }, + { + "epoch": 0.9487819370172311, + "grad_norm": 1.235841226080657, + "learning_rate": 1.3707693584723124e-07, + "loss": 0.2193, + "step": 11976 + }, + { + "epoch": 0.9488611606258666, + "grad_norm": 1.2413515833972935, + "learning_rate": 1.3665384599584774e-07, + "loss": 0.2418, + "step": 11977 + }, + { + "epoch": 0.9489403842345019, + "grad_norm": 1.4092986017668696, + "learning_rate": 1.3623140559834824e-07, + "loss": 0.2984, + "step": 11978 + }, + { + "epoch": 0.9490196078431372, + "grad_norm": 1.2776247619251861, + "learning_rate": 1.358096146825505e-07, + "loss": 0.2324, + "step": 11979 + }, + { + "epoch": 0.9490988314517727, + "grad_norm": 0.8980753348335349, + "learning_rate": 1.353884732762256e-07, + "loss": 0.0778, + "step": 11980 + }, + { + "epoch": 0.949178055060408, + "grad_norm": 1.1800663763844146, + "learning_rate": 1.3496798140710365e-07, + "loss": 0.1331, + "step": 11981 + }, + { + "epoch": 0.9492572786690434, + "grad_norm": 1.3320287231195085, + "learning_rate": 1.3454813910287358e-07, + "loss": 0.243, + "step": 11982 + }, + { + "epoch": 0.9493365022776787, + "grad_norm": 1.5349659838867427, + "learning_rate": 1.341289463911788e-07, + "loss": 0.2524, + "step": 11983 + }, + { + "epoch": 0.9494157258863142, + "grad_norm": 0.9779419419521583, + "learning_rate": 1.337104032996206e-07, + "loss": 0.1249, + "step": 11984 + }, + { + "epoch": 0.9494949494949495, + "grad_norm": 1.3130984466198592, + "learning_rate": 1.3329250985575915e-07, + "loss": 0.182, + "step": 11985 + }, + { + "epoch": 0.9495741731035848, + "grad_norm": 1.1407139271416284, + "learning_rate": 1.3287526608711132e-07, + "loss": 0.1682, + "step": 11986 + }, + { + "epoch": 0.9496533967122203, + "grad_norm": 1.4095186487183038, + "learning_rate": 1.324586720211485e-07, + "loss": 0.268, + "step": 11987 + }, + { + "epoch": 0.9497326203208556, + "grad_norm": 1.269723617351791, + "learning_rate": 1.3204272768530313e-07, + "loss": 0.2292, + "step": 11988 + }, + { + "epoch": 0.949811843929491, + "grad_norm": 1.2776761685822047, + "learning_rate": 1.3162743310696224e-07, + "loss": 0.2249, + "step": 11989 + }, + { + "epoch": 0.9498910675381264, + "grad_norm": 1.2221732045848455, + "learning_rate": 1.3121278831347172e-07, + "loss": 0.225, + "step": 11990 + }, + { + "epoch": 0.9499702911467617, + "grad_norm": 1.3028100527904278, + "learning_rate": 1.3079879333213308e-07, + "loss": 0.2261, + "step": 11991 + }, + { + "epoch": 0.9500495147553971, + "grad_norm": 1.2897115282589904, + "learning_rate": 1.303854481902067e-07, + "loss": 0.2044, + "step": 11992 + }, + { + "epoch": 0.9501287383640324, + "grad_norm": 1.3020085071512875, + "learning_rate": 1.2997275291490863e-07, + "loss": 0.2259, + "step": 11993 + }, + { + "epoch": 0.9502079619726679, + "grad_norm": 1.1185168777319725, + "learning_rate": 1.2956070753341265e-07, + "loss": 0.2063, + "step": 11994 + }, + { + "epoch": 0.9502871855813032, + "grad_norm": 1.0354552482974677, + "learning_rate": 1.2914931207285154e-07, + "loss": 0.1454, + "step": 11995 + }, + { + "epoch": 0.9503664091899386, + "grad_norm": 1.5246919721748133, + "learning_rate": 1.2873856656031358e-07, + "loss": 0.2671, + "step": 11996 + }, + { + "epoch": 0.950445632798574, + "grad_norm": 1.245796593054884, + "learning_rate": 1.2832847102284162e-07, + "loss": 0.2281, + "step": 11997 + }, + { + "epoch": 0.9505248564072093, + "grad_norm": 1.527279786575422, + "learning_rate": 1.2791902548744185e-07, + "loss": 0.2932, + "step": 11998 + }, + { + "epoch": 0.9506040800158447, + "grad_norm": 1.4930144641370797, + "learning_rate": 1.2751022998107154e-07, + "loss": 0.3583, + "step": 11999 + }, + { + "epoch": 0.9506833036244801, + "grad_norm": 1.3516748208563374, + "learning_rate": 1.271020845306492e-07, + "loss": 0.2185, + "step": 12000 + }, + { + "epoch": 0.9507625272331155, + "grad_norm": 1.2888948217148029, + "learning_rate": 1.2669458916305112e-07, + "loss": 0.2254, + "step": 12001 + }, + { + "epoch": 0.9508417508417508, + "grad_norm": 0.9919838545110675, + "learning_rate": 1.2628774390510578e-07, + "loss": 0.1362, + "step": 12002 + }, + { + "epoch": 0.9509209744503863, + "grad_norm": 0.8817573968500436, + "learning_rate": 1.2588154878360293e-07, + "loss": 0.1129, + "step": 12003 + }, + { + "epoch": 0.9510001980590216, + "grad_norm": 1.5010132874448798, + "learning_rate": 1.254760038252889e-07, + "loss": 0.2369, + "step": 12004 + }, + { + "epoch": 0.9510794216676569, + "grad_norm": 1.686453830077245, + "learning_rate": 1.2507110905686793e-07, + "loss": 0.2635, + "step": 12005 + }, + { + "epoch": 0.9511586452762923, + "grad_norm": 1.2310034150257587, + "learning_rate": 1.2466686450499866e-07, + "loss": 0.194, + "step": 12006 + }, + { + "epoch": 0.9512378688849277, + "grad_norm": 1.2440299999995539, + "learning_rate": 1.242632701962987e-07, + "loss": 0.1745, + "step": 12007 + }, + { + "epoch": 0.9513170924935631, + "grad_norm": 1.5254919100564839, + "learning_rate": 1.2386032615734345e-07, + "loss": 0.2648, + "step": 12008 + }, + { + "epoch": 0.9513963161021984, + "grad_norm": 1.2466469382638743, + "learning_rate": 1.2345803241466504e-07, + "loss": 0.1748, + "step": 12009 + }, + { + "epoch": 0.9514755397108339, + "grad_norm": 1.4911780575393123, + "learning_rate": 1.2305638899475226e-07, + "loss": 0.2403, + "step": 12010 + }, + { + "epoch": 0.9515547633194692, + "grad_norm": 1.1133615498707368, + "learning_rate": 1.2265539592405173e-07, + "loss": 0.21, + "step": 12011 + }, + { + "epoch": 0.9516339869281045, + "grad_norm": 1.438323333057415, + "learning_rate": 1.222550532289668e-07, + "loss": 0.2717, + "step": 12012 + }, + { + "epoch": 0.95171321053674, + "grad_norm": 1.338172658249403, + "learning_rate": 1.218553609358575e-07, + "loss": 0.2196, + "step": 12013 + }, + { + "epoch": 0.9517924341453753, + "grad_norm": 1.6368377244901853, + "learning_rate": 1.214563190710416e-07, + "loss": 0.2596, + "step": 12014 + }, + { + "epoch": 0.9518716577540107, + "grad_norm": 1.4715574813175563, + "learning_rate": 1.2105792766079594e-07, + "loss": 0.2666, + "step": 12015 + }, + { + "epoch": 0.951950881362646, + "grad_norm": 1.2140343872631034, + "learning_rate": 1.2066018673134948e-07, + "loss": 0.16, + "step": 12016 + }, + { + "epoch": 0.9520301049712815, + "grad_norm": 1.518032396287896, + "learning_rate": 1.2026309630889465e-07, + "loss": 0.21, + "step": 12017 + }, + { + "epoch": 0.9521093285799168, + "grad_norm": 1.4641034407477171, + "learning_rate": 1.1986665641957718e-07, + "loss": 0.252, + "step": 12018 + }, + { + "epoch": 0.9521885521885521, + "grad_norm": 1.581106214000562, + "learning_rate": 1.194708670894984e-07, + "loss": 0.2676, + "step": 12019 + }, + { + "epoch": 0.9522677757971876, + "grad_norm": 1.3851883197961747, + "learning_rate": 1.1907572834472303e-07, + "loss": 0.2207, + "step": 12020 + }, + { + "epoch": 0.9523469994058229, + "grad_norm": 1.412009989454947, + "learning_rate": 1.1868124021126582e-07, + "loss": 0.2378, + "step": 12021 + }, + { + "epoch": 0.9524262230144583, + "grad_norm": 1.5077565675306255, + "learning_rate": 1.1828740271510375e-07, + "loss": 0.2936, + "step": 12022 + }, + { + "epoch": 0.9525054466230937, + "grad_norm": 1.2407322179599092, + "learning_rate": 1.1789421588216721e-07, + "loss": 0.16, + "step": 12023 + }, + { + "epoch": 0.9525846702317291, + "grad_norm": 1.121054097699786, + "learning_rate": 1.1750167973834769e-07, + "loss": 0.1802, + "step": 12024 + }, + { + "epoch": 0.9526638938403644, + "grad_norm": 1.2123476573175656, + "learning_rate": 1.171097943094912e-07, + "loss": 0.1826, + "step": 12025 + }, + { + "epoch": 0.9527431174489998, + "grad_norm": 1.695234520379659, + "learning_rate": 1.1671855962140045e-07, + "loss": 0.2148, + "step": 12026 + }, + { + "epoch": 0.9528223410576352, + "grad_norm": 1.3540368347767922, + "learning_rate": 1.1632797569983811e-07, + "loss": 0.2015, + "step": 12027 + }, + { + "epoch": 0.9529015646662705, + "grad_norm": 1.3755359478661886, + "learning_rate": 1.1593804257052143e-07, + "loss": 0.22, + "step": 12028 + }, + { + "epoch": 0.952980788274906, + "grad_norm": 1.2045428756173873, + "learning_rate": 1.1554876025912432e-07, + "loss": 0.183, + "step": 12029 + }, + { + "epoch": 0.9530600118835413, + "grad_norm": 1.191458811297058, + "learning_rate": 1.151601287912818e-07, + "loss": 0.1591, + "step": 12030 + }, + { + "epoch": 0.9531392354921767, + "grad_norm": 1.51003280604779, + "learning_rate": 1.147721481925812e-07, + "loss": 0.2629, + "step": 12031 + }, + { + "epoch": 0.953218459100812, + "grad_norm": 1.5766495149396162, + "learning_rate": 1.1438481848856986e-07, + "loss": 0.2871, + "step": 12032 + }, + { + "epoch": 0.9532976827094474, + "grad_norm": 1.624416702400341, + "learning_rate": 1.1399813970475293e-07, + "loss": 0.2511, + "step": 12033 + }, + { + "epoch": 0.9533769063180828, + "grad_norm": 1.2759760450675603, + "learning_rate": 1.1361211186658893e-07, + "loss": 0.2226, + "step": 12034 + }, + { + "epoch": 0.9534561299267181, + "grad_norm": 1.3488130915437462, + "learning_rate": 1.1322673499949754e-07, + "loss": 0.1872, + "step": 12035 + }, + { + "epoch": 0.9535353535353536, + "grad_norm": 1.2469218248306462, + "learning_rate": 1.1284200912885291e-07, + "loss": 0.224, + "step": 12036 + }, + { + "epoch": 0.9536145771439889, + "grad_norm": 1.1826302128327042, + "learning_rate": 1.1245793427998919e-07, + "loss": 0.2229, + "step": 12037 + }, + { + "epoch": 0.9536938007526243, + "grad_norm": 1.2200646355315476, + "learning_rate": 1.1207451047819396e-07, + "loss": 0.209, + "step": 12038 + }, + { + "epoch": 0.9537730243612597, + "grad_norm": 1.2437426914285388, + "learning_rate": 1.1169173774871478e-07, + "loss": 0.1563, + "step": 12039 + }, + { + "epoch": 0.953852247969895, + "grad_norm": 1.4813359499281296, + "learning_rate": 1.1130961611675484e-07, + "loss": 0.2738, + "step": 12040 + }, + { + "epoch": 0.9539314715785304, + "grad_norm": 1.4717511548655786, + "learning_rate": 1.1092814560747511e-07, + "loss": 0.2404, + "step": 12041 + }, + { + "epoch": 0.9540106951871657, + "grad_norm": 1.2797721495979215, + "learning_rate": 1.105473262459944e-07, + "loss": 0.2002, + "step": 12042 + }, + { + "epoch": 0.9540899187958012, + "grad_norm": 1.5027525510214748, + "learning_rate": 1.1016715805738709e-07, + "loss": 0.2407, + "step": 12043 + }, + { + "epoch": 0.9541691424044365, + "grad_norm": 1.5141907228535005, + "learning_rate": 1.0978764106668538e-07, + "loss": 0.2723, + "step": 12044 + }, + { + "epoch": 0.954248366013072, + "grad_norm": 1.7346619579244726, + "learning_rate": 1.0940877529887928e-07, + "loss": 0.3685, + "step": 12045 + }, + { + "epoch": 0.9543275896217073, + "grad_norm": 1.9299077359652532, + "learning_rate": 1.0903056077891438e-07, + "loss": 0.2515, + "step": 12046 + }, + { + "epoch": 0.9544068132303426, + "grad_norm": 1.4152862560784865, + "learning_rate": 1.0865299753169522e-07, + "loss": 0.3484, + "step": 12047 + }, + { + "epoch": 0.954486036838978, + "grad_norm": 1.0916035270612172, + "learning_rate": 1.0827608558208192e-07, + "loss": 0.1754, + "step": 12048 + }, + { + "epoch": 0.9545652604476134, + "grad_norm": 1.65197444466612, + "learning_rate": 1.0789982495489238e-07, + "loss": 0.2704, + "step": 12049 + }, + { + "epoch": 0.9546444840562488, + "grad_norm": 1.2835648265580597, + "learning_rate": 1.0752421567490123e-07, + "loss": 0.1797, + "step": 12050 + }, + { + "epoch": 0.9547237076648841, + "grad_norm": 1.497835429754647, + "learning_rate": 1.0714925776684093e-07, + "loss": 0.1938, + "step": 12051 + }, + { + "epoch": 0.9548029312735196, + "grad_norm": 1.6822844337763083, + "learning_rate": 1.067749512554006e-07, + "loss": 0.2433, + "step": 12052 + }, + { + "epoch": 0.9548821548821549, + "grad_norm": 1.009573782527316, + "learning_rate": 1.0640129616522721e-07, + "loss": 0.1241, + "step": 12053 + }, + { + "epoch": 0.9549613784907902, + "grad_norm": 1.2860634601832368, + "learning_rate": 1.0602829252092328e-07, + "loss": 0.1797, + "step": 12054 + }, + { + "epoch": 0.9550406020994257, + "grad_norm": 1.6118354973779467, + "learning_rate": 1.0565594034704918e-07, + "loss": 0.3022, + "step": 12055 + }, + { + "epoch": 0.955119825708061, + "grad_norm": 1.2625732874743605, + "learning_rate": 1.0528423966812307e-07, + "loss": 0.1993, + "step": 12056 + }, + { + "epoch": 0.9551990493166964, + "grad_norm": 1.4915002915896953, + "learning_rate": 1.0491319050861981e-07, + "loss": 0.3321, + "step": 12057 + }, + { + "epoch": 0.9552782729253317, + "grad_norm": 1.002769121406365, + "learning_rate": 1.0454279289296987e-07, + "loss": 0.1489, + "step": 12058 + }, + { + "epoch": 0.9553574965339672, + "grad_norm": 1.4980894120193144, + "learning_rate": 1.0417304684556373e-07, + "loss": 0.2009, + "step": 12059 + }, + { + "epoch": 0.9554367201426025, + "grad_norm": 1.6452384749232496, + "learning_rate": 1.0380395239074747e-07, + "loss": 0.2951, + "step": 12060 + }, + { + "epoch": 0.9555159437512378, + "grad_norm": 1.4340916612071166, + "learning_rate": 1.0343550955282278e-07, + "loss": 0.2014, + "step": 12061 + }, + { + "epoch": 0.9555951673598733, + "grad_norm": 1.498196402153841, + "learning_rate": 1.0306771835605022e-07, + "loss": 0.2857, + "step": 12062 + }, + { + "epoch": 0.9556743909685086, + "grad_norm": 1.2314404875989995, + "learning_rate": 1.0270057882464823e-07, + "loss": 0.2106, + "step": 12063 + }, + { + "epoch": 0.955753614577144, + "grad_norm": 1.3877809760831388, + "learning_rate": 1.0233409098278967e-07, + "loss": 0.2483, + "step": 12064 + }, + { + "epoch": 0.9558328381857794, + "grad_norm": 1.1610400820542772, + "learning_rate": 1.0196825485460637e-07, + "loss": 0.1796, + "step": 12065 + }, + { + "epoch": 0.9559120617944147, + "grad_norm": 1.1650929796542495, + "learning_rate": 1.0160307046418794e-07, + "loss": 0.1475, + "step": 12066 + }, + { + "epoch": 0.9559912854030501, + "grad_norm": 1.0850260526726183, + "learning_rate": 1.0123853783557847e-07, + "loss": 0.1349, + "step": 12067 + }, + { + "epoch": 0.9560705090116854, + "grad_norm": 1.6217811538467826, + "learning_rate": 1.0087465699278321e-07, + "loss": 0.2582, + "step": 12068 + }, + { + "epoch": 0.9561497326203209, + "grad_norm": 1.384486882112382, + "learning_rate": 1.0051142795975855e-07, + "loss": 0.2294, + "step": 12069 + }, + { + "epoch": 0.9562289562289562, + "grad_norm": 1.3157176927898737, + "learning_rate": 1.0014885076042313e-07, + "loss": 0.1801, + "step": 12070 + }, + { + "epoch": 0.9563081798375916, + "grad_norm": 1.5019837687230437, + "learning_rate": 9.978692541865121e-08, + "loss": 0.2447, + "step": 12071 + }, + { + "epoch": 0.956387403446227, + "grad_norm": 1.2018344616119367, + "learning_rate": 9.94256519582748e-08, + "loss": 0.2104, + "step": 12072 + }, + { + "epoch": 0.9564666270548623, + "grad_norm": 1.382753501926394, + "learning_rate": 9.906503040307824e-08, + "loss": 0.2516, + "step": 12073 + }, + { + "epoch": 0.9565458506634977, + "grad_norm": 1.4339537821265524, + "learning_rate": 9.87050607768103e-08, + "loss": 0.2741, + "step": 12074 + }, + { + "epoch": 0.9566250742721331, + "grad_norm": 1.3794841903610584, + "learning_rate": 9.834574310317313e-08, + "loss": 0.1599, + "step": 12075 + }, + { + "epoch": 0.9567042978807685, + "grad_norm": 1.1467058783384767, + "learning_rate": 9.798707740582447e-08, + "loss": 0.1667, + "step": 12076 + }, + { + "epoch": 0.9567835214894038, + "grad_norm": 1.2536122321258512, + "learning_rate": 9.762906370837988e-08, + "loss": 0.1844, + "step": 12077 + }, + { + "epoch": 0.9568627450980393, + "grad_norm": 1.124703618109494, + "learning_rate": 9.727170203441605e-08, + "loss": 0.1499, + "step": 12078 + }, + { + "epoch": 0.9569419687066746, + "grad_norm": 1.2182006055880223, + "learning_rate": 9.691499240746083e-08, + "loss": 0.1705, + "step": 12079 + }, + { + "epoch": 0.9570211923153099, + "grad_norm": 1.2238522846079825, + "learning_rate": 9.65589348510032e-08, + "loss": 0.2093, + "step": 12080 + }, + { + "epoch": 0.9571004159239453, + "grad_norm": 1.513681469823502, + "learning_rate": 9.620352938848665e-08, + "loss": 0.2353, + "step": 12081 + }, + { + "epoch": 0.9571796395325807, + "grad_norm": 1.1708921392011702, + "learning_rate": 9.584877604331467e-08, + "loss": 0.1225, + "step": 12082 + }, + { + "epoch": 0.9572588631412161, + "grad_norm": 1.49444931593593, + "learning_rate": 9.549467483884412e-08, + "loss": 0.2378, + "step": 12083 + }, + { + "epoch": 0.9573380867498514, + "grad_norm": 1.3839567343048198, + "learning_rate": 9.514122579839302e-08, + "loss": 0.1519, + "step": 12084 + }, + { + "epoch": 0.9574173103584869, + "grad_norm": 1.1023302408782591, + "learning_rate": 9.478842894523165e-08, + "loss": 0.1929, + "step": 12085 + }, + { + "epoch": 0.9574965339671222, + "grad_norm": 1.730421586430579, + "learning_rate": 9.443628430259144e-08, + "loss": 0.3391, + "step": 12086 + }, + { + "epoch": 0.9575757575757575, + "grad_norm": 1.3556957041593796, + "learning_rate": 9.408479189366049e-08, + "loss": 0.2192, + "step": 12087 + }, + { + "epoch": 0.957654981184393, + "grad_norm": 1.3446272689870877, + "learning_rate": 9.37339517415814e-08, + "loss": 0.1944, + "step": 12088 + }, + { + "epoch": 0.9577342047930283, + "grad_norm": 1.3150633615907967, + "learning_rate": 9.33837638694557e-08, + "loss": 0.1487, + "step": 12089 + }, + { + "epoch": 0.9578134284016637, + "grad_norm": 1.1507819122392342, + "learning_rate": 9.30342283003416e-08, + "loss": 0.1915, + "step": 12090 + }, + { + "epoch": 0.957892652010299, + "grad_norm": 1.5636916708326578, + "learning_rate": 9.268534505725402e-08, + "loss": 0.2226, + "step": 12091 + }, + { + "epoch": 0.9579718756189345, + "grad_norm": 0.9988760320364604, + "learning_rate": 9.233711416316571e-08, + "loss": 0.1333, + "step": 12092 + }, + { + "epoch": 0.9580510992275698, + "grad_norm": 1.4073140713551204, + "learning_rate": 9.1989535641005e-08, + "loss": 0.2329, + "step": 12093 + }, + { + "epoch": 0.9581303228362051, + "grad_norm": 1.7331972313718855, + "learning_rate": 9.164260951366021e-08, + "loss": 0.2993, + "step": 12094 + }, + { + "epoch": 0.9582095464448406, + "grad_norm": 1.4223058469021153, + "learning_rate": 9.129633580397312e-08, + "loss": 0.2253, + "step": 12095 + }, + { + "epoch": 0.9582887700534759, + "grad_norm": 1.3186301104674516, + "learning_rate": 9.095071453474435e-08, + "loss": 0.2388, + "step": 12096 + }, + { + "epoch": 0.9583679936621113, + "grad_norm": 1.3543933527280094, + "learning_rate": 9.060574572873238e-08, + "loss": 0.2262, + "step": 12097 + }, + { + "epoch": 0.9584472172707467, + "grad_norm": 1.8175150985146755, + "learning_rate": 9.026142940865013e-08, + "loss": 0.1552, + "step": 12098 + }, + { + "epoch": 0.9585264408793821, + "grad_norm": 1.2335238287866004, + "learning_rate": 8.991776559717058e-08, + "loss": 0.2188, + "step": 12099 + }, + { + "epoch": 0.9586056644880174, + "grad_norm": 1.6544648377718518, + "learning_rate": 8.95747543169223e-08, + "loss": 0.3067, + "step": 12100 + }, + { + "epoch": 0.9586848880966528, + "grad_norm": 1.1345685436292745, + "learning_rate": 8.923239559049057e-08, + "loss": 0.1721, + "step": 12101 + }, + { + "epoch": 0.9587641117052882, + "grad_norm": 1.3399825959972054, + "learning_rate": 8.889068944041734e-08, + "loss": 0.2523, + "step": 12102 + }, + { + "epoch": 0.9588433353139235, + "grad_norm": 1.6606470916426763, + "learning_rate": 8.854963588920351e-08, + "loss": 0.2674, + "step": 12103 + }, + { + "epoch": 0.958922558922559, + "grad_norm": 1.2664170017616017, + "learning_rate": 8.820923495930556e-08, + "loss": 0.2405, + "step": 12104 + }, + { + "epoch": 0.9590017825311943, + "grad_norm": 1.4517399427834006, + "learning_rate": 8.786948667313667e-08, + "loss": 0.2839, + "step": 12105 + }, + { + "epoch": 0.9590810061398297, + "grad_norm": 1.436688799028156, + "learning_rate": 8.753039105306782e-08, + "loss": 0.2045, + "step": 12106 + }, + { + "epoch": 0.959160229748465, + "grad_norm": 1.3075966837930841, + "learning_rate": 8.719194812142673e-08, + "loss": 0.2106, + "step": 12107 + }, + { + "epoch": 0.9592394533571004, + "grad_norm": 1.2739832681844057, + "learning_rate": 8.685415790049889e-08, + "loss": 0.1766, + "step": 12108 + }, + { + "epoch": 0.9593186769657358, + "grad_norm": 1.381004367940717, + "learning_rate": 8.651702041252541e-08, + "loss": 0.219, + "step": 12109 + }, + { + "epoch": 0.9593979005743711, + "grad_norm": 1.5719520999738097, + "learning_rate": 8.61805356797063e-08, + "loss": 0.2708, + "step": 12110 + }, + { + "epoch": 0.9594771241830066, + "grad_norm": 1.1898290102794342, + "learning_rate": 8.584470372419606e-08, + "loss": 0.2438, + "step": 12111 + }, + { + "epoch": 0.9595563477916419, + "grad_norm": 1.7353954479277225, + "learning_rate": 8.550952456810813e-08, + "loss": 0.2483, + "step": 12112 + }, + { + "epoch": 0.9596355714002773, + "grad_norm": 1.22742945273216, + "learning_rate": 8.517499823351261e-08, + "loss": 0.2291, + "step": 12113 + }, + { + "epoch": 0.9597147950089127, + "grad_norm": 1.3177081652211198, + "learning_rate": 8.484112474243633e-08, + "loss": 0.2294, + "step": 12114 + }, + { + "epoch": 0.959794018617548, + "grad_norm": 1.3204271554155063, + "learning_rate": 8.450790411686282e-08, + "loss": 0.2163, + "step": 12115 + }, + { + "epoch": 0.9598732422261834, + "grad_norm": 1.1210764762060492, + "learning_rate": 8.417533637873454e-08, + "loss": 0.189, + "step": 12116 + }, + { + "epoch": 0.9599524658348187, + "grad_norm": 1.4746511371134137, + "learning_rate": 8.384342154994841e-08, + "loss": 0.2361, + "step": 12117 + }, + { + "epoch": 0.9600316894434542, + "grad_norm": 1.4551871061446462, + "learning_rate": 8.351215965235915e-08, + "loss": 0.1973, + "step": 12118 + }, + { + "epoch": 0.9601109130520895, + "grad_norm": 1.343528999564082, + "learning_rate": 8.318155070777822e-08, + "loss": 0.2135, + "step": 12119 + }, + { + "epoch": 0.960190136660725, + "grad_norm": 1.1672042063154022, + "learning_rate": 8.28515947379771e-08, + "loss": 0.1513, + "step": 12120 + }, + { + "epoch": 0.9602693602693603, + "grad_norm": 1.2958760868955908, + "learning_rate": 8.252229176467841e-08, + "loss": 0.2067, + "step": 12121 + }, + { + "epoch": 0.9603485838779956, + "grad_norm": 1.1238609629028475, + "learning_rate": 8.219364180956812e-08, + "loss": 0.1562, + "step": 12122 + }, + { + "epoch": 0.960427807486631, + "grad_norm": 1.5411071517664339, + "learning_rate": 8.186564489428561e-08, + "loss": 0.2308, + "step": 12123 + }, + { + "epoch": 0.9605070310952664, + "grad_norm": 1.2910708605082721, + "learning_rate": 8.153830104042582e-08, + "loss": 0.1994, + "step": 12124 + }, + { + "epoch": 0.9605862547039018, + "grad_norm": 1.2090214632643919, + "learning_rate": 8.121161026954482e-08, + "loss": 0.1796, + "step": 12125 + }, + { + "epoch": 0.9606654783125371, + "grad_norm": 1.2293102610274222, + "learning_rate": 8.088557260315322e-08, + "loss": 0.1869, + "step": 12126 + }, + { + "epoch": 0.9607447019211726, + "grad_norm": 1.5758302338841237, + "learning_rate": 8.056018806271937e-08, + "loss": 0.3025, + "step": 12127 + }, + { + "epoch": 0.9608239255298079, + "grad_norm": 1.2662352546161477, + "learning_rate": 8.023545666966726e-08, + "loss": 0.1979, + "step": 12128 + }, + { + "epoch": 0.9609031491384432, + "grad_norm": 1.5212885895466786, + "learning_rate": 7.991137844537977e-08, + "loss": 0.2857, + "step": 12129 + }, + { + "epoch": 0.9609823727470787, + "grad_norm": 1.0824801357269576, + "learning_rate": 7.958795341119541e-08, + "loss": 0.1143, + "step": 12130 + }, + { + "epoch": 0.961061596355714, + "grad_norm": 1.3017967915763433, + "learning_rate": 7.926518158841045e-08, + "loss": 0.2157, + "step": 12131 + }, + { + "epoch": 0.9611408199643494, + "grad_norm": 1.3201811404147326, + "learning_rate": 7.894306299827791e-08, + "loss": 0.2127, + "step": 12132 + }, + { + "epoch": 0.9612200435729847, + "grad_norm": 1.4514946484897493, + "learning_rate": 7.86215976620075e-08, + "loss": 0.1764, + "step": 12133 + }, + { + "epoch": 0.9612992671816202, + "grad_norm": 1.4849218175548835, + "learning_rate": 7.83007856007667e-08, + "loss": 0.2878, + "step": 12134 + }, + { + "epoch": 0.9613784907902555, + "grad_norm": 1.5690597622736902, + "learning_rate": 7.798062683567864e-08, + "loss": 0.1912, + "step": 12135 + }, + { + "epoch": 0.9614577143988908, + "grad_norm": 1.3691484049840832, + "learning_rate": 7.766112138782422e-08, + "loss": 0.1742, + "step": 12136 + }, + { + "epoch": 0.9615369380075263, + "grad_norm": 1.6543121613121445, + "learning_rate": 7.734226927824106e-08, + "loss": 0.2167, + "step": 12137 + }, + { + "epoch": 0.9616161616161616, + "grad_norm": 1.4726366517948368, + "learning_rate": 7.70240705279257e-08, + "loss": 0.2216, + "step": 12138 + }, + { + "epoch": 0.961695385224797, + "grad_norm": 1.4127443340806638, + "learning_rate": 7.670652515782917e-08, + "loss": 0.2281, + "step": 12139 + }, + { + "epoch": 0.9617746088334324, + "grad_norm": 1.0391781337030863, + "learning_rate": 7.638963318886028e-08, + "loss": 0.1488, + "step": 12140 + }, + { + "epoch": 0.9618538324420678, + "grad_norm": 1.3354249076797793, + "learning_rate": 7.607339464188346e-08, + "loss": 0.2121, + "step": 12141 + }, + { + "epoch": 0.9619330560507031, + "grad_norm": 1.4241452585947816, + "learning_rate": 7.575780953772427e-08, + "loss": 0.2593, + "step": 12142 + }, + { + "epoch": 0.9620122796593384, + "grad_norm": 1.1991295839562486, + "learning_rate": 7.544287789715943e-08, + "loss": 0.2001, + "step": 12143 + }, + { + "epoch": 0.9620915032679739, + "grad_norm": 1.5383734362394947, + "learning_rate": 7.51285997409279e-08, + "loss": 0.2076, + "step": 12144 + }, + { + "epoch": 0.9621707268766092, + "grad_norm": 1.704234911213951, + "learning_rate": 7.481497508972313e-08, + "loss": 0.2625, + "step": 12145 + }, + { + "epoch": 0.9622499504852446, + "grad_norm": 1.4030967323240011, + "learning_rate": 7.450200396419416e-08, + "loss": 0.2675, + "step": 12146 + }, + { + "epoch": 0.96232917409388, + "grad_norm": 1.9239969177192455, + "learning_rate": 7.418968638495006e-08, + "loss": 0.3094, + "step": 12147 + }, + { + "epoch": 0.9624083977025153, + "grad_norm": 1.0788767502670733, + "learning_rate": 7.387802237255658e-08, + "loss": 0.1421, + "step": 12148 + }, + { + "epoch": 0.9624876213111507, + "grad_norm": 1.6326189032316254, + "learning_rate": 7.35670119475329e-08, + "loss": 0.2816, + "step": 12149 + }, + { + "epoch": 0.9625668449197861, + "grad_norm": 1.2659298189789456, + "learning_rate": 7.325665513035707e-08, + "loss": 0.1891, + "step": 12150 + }, + { + "epoch": 0.9626460685284215, + "grad_norm": 1.3321865926123506, + "learning_rate": 7.294695194146829e-08, + "loss": 0.1996, + "step": 12151 + }, + { + "epoch": 0.9627252921370568, + "grad_norm": 1.4562287523024298, + "learning_rate": 7.263790240125579e-08, + "loss": 0.2582, + "step": 12152 + }, + { + "epoch": 0.9628045157456923, + "grad_norm": 2.0075950733529773, + "learning_rate": 7.232950653006998e-08, + "loss": 0.3089, + "step": 12153 + }, + { + "epoch": 0.9628837393543276, + "grad_norm": 1.4084725111007126, + "learning_rate": 7.202176434821683e-08, + "loss": 0.2484, + "step": 12154 + }, + { + "epoch": 0.9629629629629629, + "grad_norm": 1.5399977729776648, + "learning_rate": 7.171467587596126e-08, + "loss": 0.2149, + "step": 12155 + }, + { + "epoch": 0.9630421865715983, + "grad_norm": 1.5275181949076877, + "learning_rate": 7.140824113352151e-08, + "loss": 0.2532, + "step": 12156 + }, + { + "epoch": 0.9631214101802337, + "grad_norm": 1.287590968924547, + "learning_rate": 7.110246014107592e-08, + "loss": 0.255, + "step": 12157 + }, + { + "epoch": 0.9632006337888691, + "grad_norm": 1.3973797913588788, + "learning_rate": 7.079733291875945e-08, + "loss": 0.2133, + "step": 12158 + }, + { + "epoch": 0.9632798573975044, + "grad_norm": 1.2654956713772518, + "learning_rate": 7.049285948666052e-08, + "loss": 0.2054, + "step": 12159 + }, + { + "epoch": 0.9633590810061399, + "grad_norm": 1.5181264535255565, + "learning_rate": 7.018903986483083e-08, + "loss": 0.2483, + "step": 12160 + }, + { + "epoch": 0.9634383046147752, + "grad_norm": 1.4993303828743456, + "learning_rate": 6.988587407327219e-08, + "loss": 0.2477, + "step": 12161 + }, + { + "epoch": 0.9635175282234105, + "grad_norm": 1.168916372065898, + "learning_rate": 6.958336213194972e-08, + "loss": 0.1949, + "step": 12162 + }, + { + "epoch": 0.963596751832046, + "grad_norm": 1.3396899518409298, + "learning_rate": 6.928150406077861e-08, + "loss": 0.1886, + "step": 12163 + }, + { + "epoch": 0.9636759754406813, + "grad_norm": 1.4996940315474447, + "learning_rate": 6.89802998796385e-08, + "loss": 0.2348, + "step": 12164 + }, + { + "epoch": 0.9637551990493167, + "grad_norm": 1.8643842749256838, + "learning_rate": 6.867974960836022e-08, + "loss": 0.2822, + "step": 12165 + }, + { + "epoch": 0.963834422657952, + "grad_norm": 1.2574672922472474, + "learning_rate": 6.837985326673457e-08, + "loss": 0.2161, + "step": 12166 + }, + { + "epoch": 0.9639136462665875, + "grad_norm": 1.6438673262321906, + "learning_rate": 6.80806108745069e-08, + "loss": 0.2964, + "step": 12167 + }, + { + "epoch": 0.9639928698752228, + "grad_norm": 1.176001171388162, + "learning_rate": 6.778202245138144e-08, + "loss": 0.2096, + "step": 12168 + }, + { + "epoch": 0.9640720934838581, + "grad_norm": 1.0239928894708468, + "learning_rate": 6.748408801701911e-08, + "loss": 0.1241, + "step": 12169 + }, + { + "epoch": 0.9641513170924936, + "grad_norm": 1.571309231946613, + "learning_rate": 6.718680759103757e-08, + "loss": 0.2577, + "step": 12170 + }, + { + "epoch": 0.9642305407011289, + "grad_norm": 1.3598451065031205, + "learning_rate": 6.689018119301227e-08, + "loss": 0.25, + "step": 12171 + }, + { + "epoch": 0.9643097643097643, + "grad_norm": 1.2925687705380562, + "learning_rate": 6.659420884247203e-08, + "loss": 0.2392, + "step": 12172 + }, + { + "epoch": 0.9643889879183997, + "grad_norm": 1.1308155036818477, + "learning_rate": 6.629889055890682e-08, + "loss": 0.1639, + "step": 12173 + }, + { + "epoch": 0.9644682115270351, + "grad_norm": 1.4475982701983245, + "learning_rate": 6.600422636176219e-08, + "loss": 0.2576, + "step": 12174 + }, + { + "epoch": 0.9645474351356704, + "grad_norm": 1.4380596796821712, + "learning_rate": 6.571021627043928e-08, + "loss": 0.1797, + "step": 12175 + }, + { + "epoch": 0.9646266587443058, + "grad_norm": 1.4300944381680842, + "learning_rate": 6.541686030429817e-08, + "loss": 0.235, + "step": 12176 + }, + { + "epoch": 0.9647058823529412, + "grad_norm": 1.1656434602092571, + "learning_rate": 6.512415848265453e-08, + "loss": 0.1854, + "step": 12177 + }, + { + "epoch": 0.9647851059615765, + "grad_norm": 1.2666882505308366, + "learning_rate": 6.48321108247818e-08, + "loss": 0.2255, + "step": 12178 + }, + { + "epoch": 0.964864329570212, + "grad_norm": 1.50194016194839, + "learning_rate": 6.454071734990907e-08, + "loss": 0.2744, + "step": 12179 + }, + { + "epoch": 0.9649435531788473, + "grad_norm": 1.4170857624721638, + "learning_rate": 6.424997807722433e-08, + "loss": 0.2931, + "step": 12180 + }, + { + "epoch": 0.9650227767874827, + "grad_norm": 1.3123720705969035, + "learning_rate": 6.395989302587113e-08, + "loss": 0.1836, + "step": 12181 + }, + { + "epoch": 0.965102000396118, + "grad_norm": 1.6992903764607232, + "learning_rate": 6.367046221494866e-08, + "loss": 0.3634, + "step": 12182 + }, + { + "epoch": 0.9651812240047534, + "grad_norm": 1.1401655244332354, + "learning_rate": 6.33816856635161e-08, + "loss": 0.1586, + "step": 12183 + }, + { + "epoch": 0.9652604476133888, + "grad_norm": 1.2693838860272517, + "learning_rate": 6.309356339058825e-08, + "loss": 0.1629, + "step": 12184 + }, + { + "epoch": 0.9653396712220241, + "grad_norm": 1.1035292437370114, + "learning_rate": 6.28060954151355e-08, + "loss": 0.1323, + "step": 12185 + }, + { + "epoch": 0.9654188948306596, + "grad_norm": 1.3287846184987673, + "learning_rate": 6.251928175608602e-08, + "loss": 0.234, + "step": 12186 + }, + { + "epoch": 0.9654981184392949, + "grad_norm": 1.6808753389531697, + "learning_rate": 6.223312243232693e-08, + "loss": 0.3173, + "step": 12187 + }, + { + "epoch": 0.9655773420479303, + "grad_norm": 1.4629514101098156, + "learning_rate": 6.194761746269762e-08, + "loss": 0.2741, + "step": 12188 + }, + { + "epoch": 0.9656565656565657, + "grad_norm": 1.3981916529423903, + "learning_rate": 6.16627668659997e-08, + "loss": 0.1928, + "step": 12189 + }, + { + "epoch": 0.965735789265201, + "grad_norm": 1.1763928950181257, + "learning_rate": 6.137857066098929e-08, + "loss": 0.2251, + "step": 12190 + }, + { + "epoch": 0.9658150128738364, + "grad_norm": 1.3421797233630617, + "learning_rate": 6.109502886637697e-08, + "loss": 0.224, + "step": 12191 + }, + { + "epoch": 0.9658942364824717, + "grad_norm": 0.9270520369504419, + "learning_rate": 6.081214150083447e-08, + "loss": 0.112, + "step": 12192 + }, + { + "epoch": 0.9659734600911072, + "grad_norm": 1.8481709396715922, + "learning_rate": 6.052990858298801e-08, + "loss": 0.2632, + "step": 12193 + }, + { + "epoch": 0.9660526836997425, + "grad_norm": 1.3208908447697942, + "learning_rate": 6.024833013142272e-08, + "loss": 0.2372, + "step": 12194 + }, + { + "epoch": 0.966131907308378, + "grad_norm": 1.2458708480604703, + "learning_rate": 5.9967406164676e-08, + "loss": 0.2134, + "step": 12195 + }, + { + "epoch": 0.9662111309170133, + "grad_norm": 1.163062056507131, + "learning_rate": 5.96871367012486e-08, + "loss": 0.1485, + "step": 12196 + }, + { + "epoch": 0.9662903545256486, + "grad_norm": 1.2097916449634558, + "learning_rate": 5.9407521759592414e-08, + "loss": 0.2029, + "step": 12197 + }, + { + "epoch": 0.966369578134284, + "grad_norm": 1.282762612822857, + "learning_rate": 5.912856135812051e-08, + "loss": 0.2042, + "step": 12198 + }, + { + "epoch": 0.9664488017429194, + "grad_norm": 1.4703193284617515, + "learning_rate": 5.8850255515200405e-08, + "loss": 0.2417, + "step": 12199 + }, + { + "epoch": 0.9665280253515548, + "grad_norm": 1.3913644006621462, + "learning_rate": 5.857260424915634e-08, + "loss": 0.2386, + "step": 12200 + }, + { + "epoch": 0.9666072489601901, + "grad_norm": 1.4006172209198122, + "learning_rate": 5.8295607578272575e-08, + "loss": 0.28, + "step": 12201 + }, + { + "epoch": 0.9666864725688256, + "grad_norm": 1.3877347369665778, + "learning_rate": 5.801926552078563e-08, + "loss": 0.1874, + "step": 12202 + }, + { + "epoch": 0.9667656961774609, + "grad_norm": 1.1495006728063186, + "learning_rate": 5.774357809489317e-08, + "loss": 0.1623, + "step": 12203 + }, + { + "epoch": 0.9668449197860962, + "grad_norm": 1.3752093057341217, + "learning_rate": 5.746854531874624e-08, + "loss": 0.2532, + "step": 12204 + }, + { + "epoch": 0.9669241433947316, + "grad_norm": 1.2165236408239686, + "learning_rate": 5.7194167210454785e-08, + "loss": 0.1689, + "step": 12205 + }, + { + "epoch": 0.967003367003367, + "grad_norm": 1.2438981028003795, + "learning_rate": 5.692044378808659e-08, + "loss": 0.1825, + "step": 12206 + }, + { + "epoch": 0.9670825906120024, + "grad_norm": 1.2642222220469412, + "learning_rate": 5.664737506966389e-08, + "loss": 0.2264, + "step": 12207 + }, + { + "epoch": 0.9671618142206377, + "grad_norm": 1.494274586391317, + "learning_rate": 5.6374961073166757e-08, + "loss": 0.2515, + "step": 12208 + }, + { + "epoch": 0.9672410378292732, + "grad_norm": 1.2357348088375435, + "learning_rate": 5.610320181653306e-08, + "loss": 0.2515, + "step": 12209 + }, + { + "epoch": 0.9673202614379085, + "grad_norm": 1.2427218164644418, + "learning_rate": 5.583209731765626e-08, + "loss": 0.1704, + "step": 12210 + }, + { + "epoch": 0.9673994850465438, + "grad_norm": 1.3440637586162874, + "learning_rate": 5.5561647594388756e-08, + "loss": 0.2034, + "step": 12211 + }, + { + "epoch": 0.9674787086551793, + "grad_norm": 1.336502308107966, + "learning_rate": 5.529185266453629e-08, + "loss": 0.2411, + "step": 12212 + }, + { + "epoch": 0.9675579322638146, + "grad_norm": 1.1072609553865793, + "learning_rate": 5.502271254586356e-08, + "loss": 0.1679, + "step": 12213 + }, + { + "epoch": 0.96763715587245, + "grad_norm": 1.350825278239869, + "learning_rate": 5.4754227256094136e-08, + "loss": 0.1897, + "step": 12214 + }, + { + "epoch": 0.9677163794810854, + "grad_norm": 1.21358631090734, + "learning_rate": 5.4486396812906125e-08, + "loss": 0.1393, + "step": 12215 + }, + { + "epoch": 0.9677956030897208, + "grad_norm": 1.4883511399097444, + "learning_rate": 5.421922123393208e-08, + "loss": 0.2684, + "step": 12216 + }, + { + "epoch": 0.9678748266983561, + "grad_norm": 1.3351623786611855, + "learning_rate": 5.395270053676793e-08, + "loss": 0.1686, + "step": 12217 + }, + { + "epoch": 0.9679540503069914, + "grad_norm": 1.3770433965151345, + "learning_rate": 5.3686834738960744e-08, + "loss": 0.2728, + "step": 12218 + }, + { + "epoch": 0.9680332739156269, + "grad_norm": 1.2701327612679396, + "learning_rate": 5.3421623858016525e-08, + "loss": 0.2238, + "step": 12219 + }, + { + "epoch": 0.9681124975242622, + "grad_norm": 1.3082860632045146, + "learning_rate": 5.3157067911399076e-08, + "loss": 0.1544, + "step": 12220 + }, + { + "epoch": 0.9681917211328976, + "grad_norm": 1.1999361763084468, + "learning_rate": 5.289316691652668e-08, + "loss": 0.1822, + "step": 12221 + }, + { + "epoch": 0.968270944741533, + "grad_norm": 1.4864271015934059, + "learning_rate": 5.2629920890777676e-08, + "loss": 0.2312, + "step": 12222 + }, + { + "epoch": 0.9683501683501684, + "grad_norm": 1.2592982820226126, + "learning_rate": 5.236732985148374e-08, + "loss": 0.2206, + "step": 12223 + }, + { + "epoch": 0.9684293919588037, + "grad_norm": 1.2311695794374358, + "learning_rate": 5.21053938159366e-08, + "loss": 0.157, + "step": 12224 + }, + { + "epoch": 0.9685086155674391, + "grad_norm": 1.3616873828598695, + "learning_rate": 5.1844112801383576e-08, + "loss": 0.223, + "step": 12225 + }, + { + "epoch": 0.9685878391760745, + "grad_norm": 1.412709609714862, + "learning_rate": 5.158348682502756e-08, + "loss": 0.2149, + "step": 12226 + }, + { + "epoch": 0.9686670627847098, + "grad_norm": 1.5097490557116413, + "learning_rate": 5.1323515904031506e-08, + "loss": 0.2774, + "step": 12227 + }, + { + "epoch": 0.9687462863933453, + "grad_norm": 1.8206702170482414, + "learning_rate": 5.1064200055510606e-08, + "loss": 0.2846, + "step": 12228 + }, + { + "epoch": 0.9688255100019806, + "grad_norm": 1.4739707672587357, + "learning_rate": 5.080553929654119e-08, + "loss": 0.2358, + "step": 12229 + }, + { + "epoch": 0.9689047336106159, + "grad_norm": 1.222935833540115, + "learning_rate": 5.05475336441541e-08, + "loss": 0.1656, + "step": 12230 + }, + { + "epoch": 0.9689839572192513, + "grad_norm": 1.3430587472010265, + "learning_rate": 5.0290183115339065e-08, + "loss": 0.2383, + "step": 12231 + }, + { + "epoch": 0.9690631808278867, + "grad_norm": 1.4457664497185594, + "learning_rate": 5.003348772704031e-08, + "loss": 0.1964, + "step": 12232 + }, + { + "epoch": 0.9691424044365221, + "grad_norm": 1.4614767210708277, + "learning_rate": 4.977744749615987e-08, + "loss": 0.2413, + "step": 12233 + }, + { + "epoch": 0.9692216280451574, + "grad_norm": 1.1267893603809416, + "learning_rate": 4.9522062439557595e-08, + "loss": 0.1579, + "step": 12234 + }, + { + "epoch": 0.9693008516537929, + "grad_norm": 1.0472532628652576, + "learning_rate": 4.926733257404892e-08, + "loss": 0.158, + "step": 12235 + }, + { + "epoch": 0.9693800752624282, + "grad_norm": 1.6538958547189024, + "learning_rate": 4.901325791640599e-08, + "loss": 0.2851, + "step": 12236 + }, + { + "epoch": 0.9694592988710635, + "grad_norm": 1.2190111872149716, + "learning_rate": 4.8759838483358745e-08, + "loss": 0.1763, + "step": 12237 + }, + { + "epoch": 0.969538522479699, + "grad_norm": 1.396709109728519, + "learning_rate": 4.850707429159496e-08, + "loss": 0.1826, + "step": 12238 + }, + { + "epoch": 0.9696177460883343, + "grad_norm": 1.5303925469033033, + "learning_rate": 4.825496535775576e-08, + "loss": 0.3026, + "step": 12239 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 1.4589705348855644, + "learning_rate": 4.800351169844231e-08, + "loss": 0.3439, + "step": 12240 + }, + { + "epoch": 0.969776193305605, + "grad_norm": 1.305475434919313, + "learning_rate": 4.7752713330212475e-08, + "loss": 0.1979, + "step": 12241 + }, + { + "epoch": 0.9698554169142405, + "grad_norm": 1.4282557192748924, + "learning_rate": 4.7502570269578605e-08, + "loss": 0.2291, + "step": 12242 + }, + { + "epoch": 0.9699346405228758, + "grad_norm": 1.199495373585879, + "learning_rate": 4.725308253301197e-08, + "loss": 0.1762, + "step": 12243 + }, + { + "epoch": 0.9700138641315111, + "grad_norm": 1.1789346213559573, + "learning_rate": 4.7004250136940547e-08, + "loss": 0.2084, + "step": 12244 + }, + { + "epoch": 0.9700930877401466, + "grad_norm": 1.2697096279483457, + "learning_rate": 4.675607309774899e-08, + "loss": 0.2192, + "step": 12245 + }, + { + "epoch": 0.9701723113487819, + "grad_norm": 1.2500150954125666, + "learning_rate": 4.650855143177757e-08, + "loss": 0.2069, + "step": 12246 + }, + { + "epoch": 0.9702515349574173, + "grad_norm": 1.381256867920319, + "learning_rate": 4.626168515532548e-08, + "loss": 0.2122, + "step": 12247 + }, + { + "epoch": 0.9703307585660527, + "grad_norm": 1.224437497239653, + "learning_rate": 4.6015474284646366e-08, + "loss": 0.182, + "step": 12248 + }, + { + "epoch": 0.9704099821746881, + "grad_norm": 1.2441310782635118, + "learning_rate": 4.576991883595283e-08, + "loss": 0.1689, + "step": 12249 + }, + { + "epoch": 0.9704892057833234, + "grad_norm": 1.3024198692000433, + "learning_rate": 4.5525018825414157e-08, + "loss": 0.2386, + "step": 12250 + }, + { + "epoch": 0.9705684293919588, + "grad_norm": 1.4568418455316767, + "learning_rate": 4.528077426915412e-08, + "loss": 0.2258, + "step": 12251 + }, + { + "epoch": 0.9706476530005942, + "grad_norm": 1.608530520043343, + "learning_rate": 4.50371851832565e-08, + "loss": 0.2535, + "step": 12252 + }, + { + "epoch": 0.9707268766092295, + "grad_norm": 1.2553087254675201, + "learning_rate": 4.4794251583759604e-08, + "loss": 0.194, + "step": 12253 + }, + { + "epoch": 0.970806100217865, + "grad_norm": 1.5261982368658091, + "learning_rate": 4.4551973486660625e-08, + "loss": 0.187, + "step": 12254 + }, + { + "epoch": 0.9708853238265003, + "grad_norm": 1.5382949903498677, + "learning_rate": 4.431035090791125e-08, + "loss": 0.2929, + "step": 12255 + }, + { + "epoch": 0.9709645474351357, + "grad_norm": 1.1914975062853832, + "learning_rate": 4.4069383863420966e-08, + "loss": 0.1717, + "step": 12256 + }, + { + "epoch": 0.971043771043771, + "grad_norm": 2.2593160730728066, + "learning_rate": 4.38290723690582e-08, + "loss": 0.3368, + "step": 12257 + }, + { + "epoch": 0.9711229946524064, + "grad_norm": 1.1762963558647608, + "learning_rate": 4.3589416440643626e-08, + "loss": 0.1665, + "step": 12258 + }, + { + "epoch": 0.9712022182610418, + "grad_norm": 0.9756039236291572, + "learning_rate": 4.335041609396018e-08, + "loss": 0.1266, + "step": 12259 + }, + { + "epoch": 0.9712814418696771, + "grad_norm": 1.4976348064572005, + "learning_rate": 4.3112071344741935e-08, + "loss": 0.188, + "step": 12260 + }, + { + "epoch": 0.9713606654783126, + "grad_norm": 1.5689923083047694, + "learning_rate": 4.287438220868523e-08, + "loss": 0.2385, + "step": 12261 + }, + { + "epoch": 0.9714398890869479, + "grad_norm": 1.3845497814235785, + "learning_rate": 4.263734870143976e-08, + "loss": 0.2366, + "step": 12262 + }, + { + "epoch": 0.9715191126955833, + "grad_norm": 1.3799185290855691, + "learning_rate": 4.2400970838613057e-08, + "loss": 0.2636, + "step": 12263 + }, + { + "epoch": 0.9715983363042187, + "grad_norm": 1.355173656741787, + "learning_rate": 4.216524863576932e-08, + "loss": 0.2045, + "step": 12264 + }, + { + "epoch": 0.971677559912854, + "grad_norm": 1.284122106106047, + "learning_rate": 4.1930182108430584e-08, + "loss": 0.1743, + "step": 12265 + }, + { + "epoch": 0.9717567835214894, + "grad_norm": 1.2559448791372945, + "learning_rate": 4.1695771272073357e-08, + "loss": 0.1773, + "step": 12266 + }, + { + "epoch": 0.9718360071301247, + "grad_norm": 1.4297530117167156, + "learning_rate": 4.146201614213419e-08, + "loss": 0.2075, + "step": 12267 + }, + { + "epoch": 0.9719152307387602, + "grad_norm": 1.334435378369409, + "learning_rate": 4.1228916734002976e-08, + "loss": 0.1922, + "step": 12268 + }, + { + "epoch": 0.9719944543473955, + "grad_norm": 1.5325580662944063, + "learning_rate": 4.099647306302856e-08, + "loss": 0.3257, + "step": 12269 + }, + { + "epoch": 0.972073677956031, + "grad_norm": 1.3515630324357597, + "learning_rate": 4.076468514451759e-08, + "loss": 0.2589, + "step": 12270 + }, + { + "epoch": 0.9721529015646663, + "grad_norm": 1.16160651733541, + "learning_rate": 4.0533552993731186e-08, + "loss": 0.1773, + "step": 12271 + }, + { + "epoch": 0.9722321251733016, + "grad_norm": 1.4589711874460212, + "learning_rate": 4.030307662588939e-08, + "loss": 0.2091, + "step": 12272 + }, + { + "epoch": 0.972311348781937, + "grad_norm": 1.1488169550799014, + "learning_rate": 4.007325605616563e-08, + "loss": 0.1677, + "step": 12273 + }, + { + "epoch": 0.9723905723905724, + "grad_norm": 1.4421467699722428, + "learning_rate": 3.9844091299694466e-08, + "loss": 0.2703, + "step": 12274 + }, + { + "epoch": 0.9724697959992078, + "grad_norm": 1.5500475396421267, + "learning_rate": 3.961558237156493e-08, + "loss": 0.2479, + "step": 12275 + }, + { + "epoch": 0.9725490196078431, + "grad_norm": 1.272127161554797, + "learning_rate": 3.9387729286821666e-08, + "loss": 0.163, + "step": 12276 + }, + { + "epoch": 0.9726282432164786, + "grad_norm": 1.1947098119092487, + "learning_rate": 3.9160532060470435e-08, + "loss": 0.2116, + "step": 12277 + }, + { + "epoch": 0.9727074668251139, + "grad_norm": 1.4089561696816784, + "learning_rate": 3.893399070746928e-08, + "loss": 0.2096, + "step": 12278 + }, + { + "epoch": 0.9727866904337492, + "grad_norm": 1.421915447324665, + "learning_rate": 3.870810524273516e-08, + "loss": 0.2272, + "step": 12279 + }, + { + "epoch": 0.9728659140423846, + "grad_norm": 1.5210323484100234, + "learning_rate": 3.8482875681140616e-08, + "loss": 0.2108, + "step": 12280 + }, + { + "epoch": 0.97294513765102, + "grad_norm": 1.5624821094387638, + "learning_rate": 3.8258302037518234e-08, + "loss": 0.2369, + "step": 12281 + }, + { + "epoch": 0.9730243612596554, + "grad_norm": 1.2131119567447837, + "learning_rate": 3.803438432665396e-08, + "loss": 0.2071, + "step": 12282 + }, + { + "epoch": 0.9731035848682907, + "grad_norm": 0.9724486801869996, + "learning_rate": 3.781112256329045e-08, + "loss": 0.1472, + "step": 12283 + }, + { + "epoch": 0.9731828084769262, + "grad_norm": 1.6714186472791708, + "learning_rate": 3.758851676213038e-08, + "loss": 0.2658, + "step": 12284 + }, + { + "epoch": 0.9732620320855615, + "grad_norm": 1.5664184736157616, + "learning_rate": 3.7366566937829804e-08, + "loss": 0.2461, + "step": 12285 + }, + { + "epoch": 0.9733412556941968, + "grad_norm": 1.3309298727417753, + "learning_rate": 3.714527310500371e-08, + "loss": 0.2491, + "step": 12286 + }, + { + "epoch": 0.9734204793028323, + "grad_norm": 1.3131425720222438, + "learning_rate": 3.692463527822376e-08, + "loss": 0.2796, + "step": 12287 + }, + { + "epoch": 0.9734997029114676, + "grad_norm": 1.3179025994874145, + "learning_rate": 3.670465347201724e-08, + "loss": 0.2765, + "step": 12288 + }, + { + "epoch": 0.973578926520103, + "grad_norm": 1.4936406331142933, + "learning_rate": 3.6485327700869214e-08, + "loss": 0.2073, + "step": 12289 + }, + { + "epoch": 0.9736581501287384, + "grad_norm": 1.381280612021301, + "learning_rate": 3.6266657979220356e-08, + "loss": 0.2424, + "step": 12290 + }, + { + "epoch": 0.9737373737373738, + "grad_norm": 1.3052940856114268, + "learning_rate": 3.604864432147026e-08, + "loss": 0.2037, + "step": 12291 + }, + { + "epoch": 0.9738165973460091, + "grad_norm": 1.1378913188808923, + "learning_rate": 3.5831286741973006e-08, + "loss": 0.1711, + "step": 12292 + }, + { + "epoch": 0.9738958209546444, + "grad_norm": 1.1890096230685943, + "learning_rate": 3.561458525504047e-08, + "loss": 0.2222, + "step": 12293 + }, + { + "epoch": 0.9739750445632799, + "grad_norm": 1.255975546601389, + "learning_rate": 3.539853987494235e-08, + "loss": 0.2285, + "step": 12294 + }, + { + "epoch": 0.9740542681719152, + "grad_norm": 0.8770316061226621, + "learning_rate": 3.518315061590394e-08, + "loss": 0.123, + "step": 12295 + }, + { + "epoch": 0.9741334917805506, + "grad_norm": 1.1420133507240764, + "learning_rate": 3.496841749210722e-08, + "loss": 0.1556, + "step": 12296 + }, + { + "epoch": 0.974212715389186, + "grad_norm": 1.1972061063191477, + "learning_rate": 3.4754340517691996e-08, + "loss": 0.1921, + "step": 12297 + }, + { + "epoch": 0.9742919389978214, + "grad_norm": 1.4783612595674176, + "learning_rate": 3.454091970675366e-08, + "loss": 0.2618, + "step": 12298 + }, + { + "epoch": 0.9743711626064567, + "grad_norm": 1.5203103031150713, + "learning_rate": 3.4328155073344306e-08, + "loss": 0.2446, + "step": 12299 + }, + { + "epoch": 0.9744503862150921, + "grad_norm": 1.4796996817951515, + "learning_rate": 3.411604663147494e-08, + "loss": 0.2551, + "step": 12300 + }, + { + "epoch": 0.9745296098237275, + "grad_norm": 1.354552098173944, + "learning_rate": 3.3904594395111066e-08, + "loss": 0.2264, + "step": 12301 + }, + { + "epoch": 0.9746088334323628, + "grad_norm": 1.4754575807933092, + "learning_rate": 3.369379837817599e-08, + "loss": 0.2449, + "step": 12302 + }, + { + "epoch": 0.9746880570409983, + "grad_norm": 1.3076793838394922, + "learning_rate": 3.3483658594548606e-08, + "loss": 0.1774, + "step": 12303 + }, + { + "epoch": 0.9747672806496336, + "grad_norm": 1.2089916805193341, + "learning_rate": 3.327417505806785e-08, + "loss": 0.1595, + "step": 12304 + }, + { + "epoch": 0.9748465042582689, + "grad_norm": 1.1542735074613908, + "learning_rate": 3.30653477825249e-08, + "loss": 0.1992, + "step": 12305 + }, + { + "epoch": 0.9749257278669043, + "grad_norm": 1.4061416701127778, + "learning_rate": 3.2857176781671e-08, + "loss": 0.2293, + "step": 12306 + }, + { + "epoch": 0.9750049514755397, + "grad_norm": 1.2558966329422117, + "learning_rate": 3.264966206921294e-08, + "loss": 0.1994, + "step": 12307 + }, + { + "epoch": 0.9750841750841751, + "grad_norm": 1.140288189456784, + "learning_rate": 3.244280365881536e-08, + "loss": 0.1593, + "step": 12308 + }, + { + "epoch": 0.9751633986928104, + "grad_norm": 1.6389677037609929, + "learning_rate": 3.223660156409847e-08, + "loss": 0.2855, + "step": 12309 + }, + { + "epoch": 0.9752426223014459, + "grad_norm": 1.4261694395806452, + "learning_rate": 3.203105579863919e-08, + "loss": 0.228, + "step": 12310 + }, + { + "epoch": 0.9753218459100812, + "grad_norm": 1.3820384936318337, + "learning_rate": 3.1826166375972246e-08, + "loss": 0.2292, + "step": 12311 + }, + { + "epoch": 0.9754010695187165, + "grad_norm": 1.1456312426525317, + "learning_rate": 3.162193330958796e-08, + "loss": 0.212, + "step": 12312 + }, + { + "epoch": 0.975480293127352, + "grad_norm": 1.4301660536888057, + "learning_rate": 3.141835661293557e-08, + "loss": 0.2159, + "step": 12313 + }, + { + "epoch": 0.9755595167359873, + "grad_norm": 1.115726228691136, + "learning_rate": 3.12154362994177e-08, + "loss": 0.1604, + "step": 12314 + }, + { + "epoch": 0.9756387403446227, + "grad_norm": 1.1364463494656156, + "learning_rate": 3.1013172382396984e-08, + "loss": 0.1496, + "step": 12315 + }, + { + "epoch": 0.975717963953258, + "grad_norm": 1.3332340304254042, + "learning_rate": 3.0811564875190544e-08, + "loss": 0.2186, + "step": 12316 + }, + { + "epoch": 0.9757971875618935, + "grad_norm": 1.2461023124452304, + "learning_rate": 3.061061379107555e-08, + "loss": 0.1854, + "step": 12317 + }, + { + "epoch": 0.9758764111705288, + "grad_norm": 1.1891070730527542, + "learning_rate": 3.04103191432803e-08, + "loss": 0.1698, + "step": 12318 + }, + { + "epoch": 0.9759556347791641, + "grad_norm": 1.592263278429394, + "learning_rate": 3.0210680944995354e-08, + "loss": 0.2442, + "step": 12319 + }, + { + "epoch": 0.9760348583877996, + "grad_norm": 1.1530731052234549, + "learning_rate": 3.001169920936575e-08, + "loss": 0.1622, + "step": 12320 + }, + { + "epoch": 0.9761140819964349, + "grad_norm": 1.2915830873242329, + "learning_rate": 2.981337394949324e-08, + "loss": 0.241, + "step": 12321 + }, + { + "epoch": 0.9761933056050703, + "grad_norm": 1.489731003599819, + "learning_rate": 2.961570517843626e-08, + "loss": 0.2431, + "step": 12322 + }, + { + "epoch": 0.9762725292137057, + "grad_norm": 1.2207160785053095, + "learning_rate": 2.9418692909211066e-08, + "loss": 0.1441, + "step": 12323 + }, + { + "epoch": 0.9763517528223411, + "grad_norm": 1.631887040649819, + "learning_rate": 2.9222337154789504e-08, + "loss": 0.2329, + "step": 12324 + }, + { + "epoch": 0.9764309764309764, + "grad_norm": 1.3069338171926856, + "learning_rate": 2.902663792810012e-08, + "loss": 0.2222, + "step": 12325 + }, + { + "epoch": 0.9765102000396118, + "grad_norm": 1.2699449337567243, + "learning_rate": 2.8831595242030387e-08, + "loss": 0.1864, + "step": 12326 + }, + { + "epoch": 0.9765894236482472, + "grad_norm": 1.4541733198233229, + "learning_rate": 2.863720910942114e-08, + "loss": 0.2041, + "step": 12327 + }, + { + "epoch": 0.9766686472568825, + "grad_norm": 1.022381580231622, + "learning_rate": 2.8443479543073248e-08, + "loss": 0.1575, + "step": 12328 + }, + { + "epoch": 0.976747870865518, + "grad_norm": 1.6720382870379475, + "learning_rate": 2.825040655574207e-08, + "loss": 0.1741, + "step": 12329 + }, + { + "epoch": 0.9768270944741533, + "grad_norm": 1.66110496468617, + "learning_rate": 2.8057990160139658e-08, + "loss": 0.3156, + "step": 12330 + }, + { + "epoch": 0.9769063180827887, + "grad_norm": 1.367090983003149, + "learning_rate": 2.7866230368936986e-08, + "loss": 0.1691, + "step": 12331 + }, + { + "epoch": 0.976985541691424, + "grad_norm": 1.3079821258042446, + "learning_rate": 2.767512719476062e-08, + "loss": 0.2624, + "step": 12332 + }, + { + "epoch": 0.9770647653000594, + "grad_norm": 1.2170933379518558, + "learning_rate": 2.7484680650193827e-08, + "loss": 0.173, + "step": 12333 + }, + { + "epoch": 0.9771439889086948, + "grad_norm": 1.3423451587573787, + "learning_rate": 2.729489074777547e-08, + "loss": 0.1918, + "step": 12334 + }, + { + "epoch": 0.9772232125173301, + "grad_norm": 1.290424328664151, + "learning_rate": 2.7105757500002215e-08, + "loss": 0.2279, + "step": 12335 + }, + { + "epoch": 0.9773024361259656, + "grad_norm": 1.4204912700790133, + "learning_rate": 2.6917280919329656e-08, + "loss": 0.2871, + "step": 12336 + }, + { + "epoch": 0.9773816597346009, + "grad_norm": 1.2567117459778772, + "learning_rate": 2.6729461018166758e-08, + "loss": 0.1684, + "step": 12337 + }, + { + "epoch": 0.9774608833432363, + "grad_norm": 1.1438383613017145, + "learning_rate": 2.654229780887918e-08, + "loss": 0.1392, + "step": 12338 + }, + { + "epoch": 0.9775401069518717, + "grad_norm": 1.569488934340685, + "learning_rate": 2.6355791303792622e-08, + "loss": 0.2032, + "step": 12339 + }, + { + "epoch": 0.977619330560507, + "grad_norm": 1.6808363837369735, + "learning_rate": 2.6169941515188368e-08, + "loss": 0.3481, + "step": 12340 + }, + { + "epoch": 0.9776985541691424, + "grad_norm": 1.0962788413502957, + "learning_rate": 2.5984748455301077e-08, + "loss": 0.204, + "step": 12341 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 1.3503058540728166, + "learning_rate": 2.5800212136326552e-08, + "loss": 0.2045, + "step": 12342 + }, + { + "epoch": 0.9778570013864132, + "grad_norm": 1.3639522368819534, + "learning_rate": 2.561633257041507e-08, + "loss": 0.2583, + "step": 12343 + }, + { + "epoch": 0.9779362249950485, + "grad_norm": 1.535935546539105, + "learning_rate": 2.5433109769674724e-08, + "loss": 0.3171, + "step": 12344 + }, + { + "epoch": 0.978015448603684, + "grad_norm": 1.4627871385836193, + "learning_rate": 2.52505437461692e-08, + "loss": 0.2164, + "step": 12345 + }, + { + "epoch": 0.9780946722123193, + "grad_norm": 1.2946980640255934, + "learning_rate": 2.5068634511919986e-08, + "loss": 0.2444, + "step": 12346 + }, + { + "epoch": 0.9781738958209546, + "grad_norm": 1.1692818989338118, + "learning_rate": 2.4887382078905287e-08, + "loss": 0.1782, + "step": 12347 + }, + { + "epoch": 0.97825311942959, + "grad_norm": 1.798326315808199, + "learning_rate": 2.4706786459058885e-08, + "loss": 0.3196, + "step": 12348 + }, + { + "epoch": 0.9783323430382254, + "grad_norm": 1.5160327265129587, + "learning_rate": 2.4526847664273488e-08, + "loss": 0.2312, + "step": 12349 + }, + { + "epoch": 0.9784115666468608, + "grad_norm": 1.089235382405294, + "learning_rate": 2.434756570639518e-08, + "loss": 0.1652, + "step": 12350 + }, + { + "epoch": 0.9784907902554961, + "grad_norm": 1.519519715802541, + "learning_rate": 2.4168940597230074e-08, + "loss": 0.3181, + "step": 12351 + }, + { + "epoch": 0.9785700138641316, + "grad_norm": 1.1257134158129076, + "learning_rate": 2.3990972348539864e-08, + "loss": 0.2043, + "step": 12352 + }, + { + "epoch": 0.9786492374727669, + "grad_norm": 1.303173375760026, + "learning_rate": 2.381366097204296e-08, + "loss": 0.249, + "step": 12353 + }, + { + "epoch": 0.9787284610814022, + "grad_norm": 1.4167374269998783, + "learning_rate": 2.363700647941336e-08, + "loss": 0.2469, + "step": 12354 + }, + { + "epoch": 0.9788076846900376, + "grad_norm": 1.2069445342672047, + "learning_rate": 2.3461008882283977e-08, + "loss": 0.1623, + "step": 12355 + }, + { + "epoch": 0.978886908298673, + "grad_norm": 1.351024222252906, + "learning_rate": 2.3285668192243317e-08, + "loss": 0.181, + "step": 12356 + }, + { + "epoch": 0.9789661319073084, + "grad_norm": 1.3085318095105418, + "learning_rate": 2.311098442083659e-08, + "loss": 0.2291, + "step": 12357 + }, + { + "epoch": 0.9790453555159437, + "grad_norm": 1.3948311518110854, + "learning_rate": 2.293695757956571e-08, + "loss": 0.1872, + "step": 12358 + }, + { + "epoch": 0.9791245791245792, + "grad_norm": 1.3861269503243618, + "learning_rate": 2.2763587679889288e-08, + "loss": 0.2599, + "step": 12359 + }, + { + "epoch": 0.9792038027332145, + "grad_norm": 1.1915911119804863, + "learning_rate": 2.2590874733223744e-08, + "loss": 0.1882, + "step": 12360 + }, + { + "epoch": 0.9792830263418498, + "grad_norm": 1.2021813832864123, + "learning_rate": 2.2418818750939986e-08, + "loss": 0.2026, + "step": 12361 + }, + { + "epoch": 0.9793622499504853, + "grad_norm": 1.6682931894189295, + "learning_rate": 2.2247419744368946e-08, + "loss": 0.2834, + "step": 12362 + }, + { + "epoch": 0.9794414735591206, + "grad_norm": 1.4471310801991073, + "learning_rate": 2.207667772479494e-08, + "loss": 0.2511, + "step": 12363 + }, + { + "epoch": 0.979520697167756, + "grad_norm": 1.4350412222295283, + "learning_rate": 2.190659270346118e-08, + "loss": 0.1995, + "step": 12364 + }, + { + "epoch": 0.9795999207763914, + "grad_norm": 1.563790131584391, + "learning_rate": 2.1737164691566502e-08, + "loss": 0.2792, + "step": 12365 + }, + { + "epoch": 0.9796791443850268, + "grad_norm": 1.2037339848124784, + "learning_rate": 2.156839370026753e-08, + "loss": 0.1796, + "step": 12366 + }, + { + "epoch": 0.9797583679936621, + "grad_norm": 1.3618015145621454, + "learning_rate": 2.140027974067649e-08, + "loss": 0.2438, + "step": 12367 + }, + { + "epoch": 0.9798375916022974, + "grad_norm": 1.2552708385352784, + "learning_rate": 2.1232822823862297e-08, + "loss": 0.1826, + "step": 12368 + }, + { + "epoch": 0.9799168152109329, + "grad_norm": 1.5051866327523513, + "learning_rate": 2.1066022960852806e-08, + "loss": 0.2612, + "step": 12369 + }, + { + "epoch": 0.9799960388195682, + "grad_norm": 1.4032366641715137, + "learning_rate": 2.0899880162630336e-08, + "loss": 0.2488, + "step": 12370 + }, + { + "epoch": 0.9800752624282036, + "grad_norm": 1.4064475514994181, + "learning_rate": 2.073439444013392e-08, + "loss": 0.1988, + "step": 12371 + }, + { + "epoch": 0.980154486036839, + "grad_norm": 0.9951831481828061, + "learning_rate": 2.0569565804260393e-08, + "loss": 0.174, + "step": 12372 + }, + { + "epoch": 0.9802337096454744, + "grad_norm": 1.2529452381863029, + "learning_rate": 2.04053942658633e-08, + "loss": 0.176, + "step": 12373 + }, + { + "epoch": 0.9803129332541097, + "grad_norm": 1.7136635696712321, + "learning_rate": 2.0241879835752875e-08, + "loss": 0.2561, + "step": 12374 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 0.9178366534683317, + "learning_rate": 2.0079022524694957e-08, + "loss": 0.1184, + "step": 12375 + }, + { + "epoch": 0.9804713804713805, + "grad_norm": 1.485368168420031, + "learning_rate": 1.991682234341208e-08, + "loss": 0.2245, + "step": 12376 + }, + { + "epoch": 0.9805506040800158, + "grad_norm": 1.5752433191734283, + "learning_rate": 1.9755279302585696e-08, + "loss": 0.258, + "step": 12377 + }, + { + "epoch": 0.9806298276886513, + "grad_norm": 1.2115654497400714, + "learning_rate": 1.959439341285285e-08, + "loss": 0.1801, + "step": 12378 + }, + { + "epoch": 0.9807090512972866, + "grad_norm": 1.7486470193716777, + "learning_rate": 1.943416468480619e-08, + "loss": 0.279, + "step": 12379 + }, + { + "epoch": 0.980788274905922, + "grad_norm": 1.4098978744442567, + "learning_rate": 1.9274593128996155e-08, + "loss": 0.2638, + "step": 12380 + }, + { + "epoch": 0.9808674985145573, + "grad_norm": 0.9557778347678417, + "learning_rate": 1.9115678755929902e-08, + "loss": 0.1507, + "step": 12381 + }, + { + "epoch": 0.9809467221231927, + "grad_norm": 1.3095147222633758, + "learning_rate": 1.8957421576071277e-08, + "loss": 0.2289, + "step": 12382 + }, + { + "epoch": 0.9810259457318281, + "grad_norm": 1.560136172695027, + "learning_rate": 1.879982159984084e-08, + "loss": 0.2265, + "step": 12383 + }, + { + "epoch": 0.9811051693404634, + "grad_norm": 1.479422861917443, + "learning_rate": 1.864287883761695e-08, + "loss": 0.2631, + "step": 12384 + }, + { + "epoch": 0.9811843929490989, + "grad_norm": 1.1434767942922264, + "learning_rate": 1.8486593299730236e-08, + "loss": 0.147, + "step": 12385 + }, + { + "epoch": 0.9812636165577342, + "grad_norm": 1.4227158335024508, + "learning_rate": 1.8330964996474688e-08, + "loss": 0.2394, + "step": 12386 + }, + { + "epoch": 0.9813428401663695, + "grad_norm": 1.2161179458531532, + "learning_rate": 1.817599393809544e-08, + "loss": 0.1734, + "step": 12387 + }, + { + "epoch": 0.981422063775005, + "grad_norm": 1.3204361413124135, + "learning_rate": 1.802168013479877e-08, + "loss": 0.1937, + "step": 12388 + }, + { + "epoch": 0.9815012873836403, + "grad_norm": 1.4210907675251467, + "learning_rate": 1.7868023596743224e-08, + "loss": 0.2738, + "step": 12389 + }, + { + "epoch": 0.9815805109922757, + "grad_norm": 1.5423237245530157, + "learning_rate": 1.771502433404737e-08, + "loss": 0.2223, + "step": 12390 + }, + { + "epoch": 0.981659734600911, + "grad_norm": 1.0960636524734113, + "learning_rate": 1.7562682356786488e-08, + "loss": 0.1692, + "step": 12391 + }, + { + "epoch": 0.9817389582095465, + "grad_norm": 1.224037969364072, + "learning_rate": 1.7410997674989215e-08, + "loss": 0.1646, + "step": 12392 + }, + { + "epoch": 0.9818181818181818, + "grad_norm": 1.2287911671495872, + "learning_rate": 1.7259970298645345e-08, + "loss": 0.2018, + "step": 12393 + }, + { + "epoch": 0.9818974054268171, + "grad_norm": 1.4248427765283922, + "learning_rate": 1.7109600237698032e-08, + "loss": 0.2008, + "step": 12394 + }, + { + "epoch": 0.9819766290354526, + "grad_norm": 1.3351943952400513, + "learning_rate": 1.6959887502049356e-08, + "loss": 0.1931, + "step": 12395 + }, + { + "epoch": 0.9820558526440879, + "grad_norm": 1.4110675976471403, + "learning_rate": 1.6810832101556984e-08, + "loss": 0.2579, + "step": 12396 + }, + { + "epoch": 0.9821350762527233, + "grad_norm": 1.3507169345682914, + "learning_rate": 1.666243404603529e-08, + "loss": 0.1886, + "step": 12397 + }, + { + "epoch": 0.9822142998613587, + "grad_norm": 1.1322106240504406, + "learning_rate": 1.651469334525424e-08, + "loss": 0.1872, + "step": 12398 + }, + { + "epoch": 0.9822935234699941, + "grad_norm": 1.113384015483527, + "learning_rate": 1.6367610008944935e-08, + "loss": 0.1432, + "step": 12399 + }, + { + "epoch": 0.9823727470786294, + "grad_norm": 1.493085021191041, + "learning_rate": 1.622118404678963e-08, + "loss": 0.2413, + "step": 12400 + }, + { + "epoch": 0.9824519706872648, + "grad_norm": 1.4337184817119455, + "learning_rate": 1.607541546843061e-08, + "loss": 0.2505, + "step": 12401 + }, + { + "epoch": 0.9825311942959002, + "grad_norm": 1.302138429353761, + "learning_rate": 1.593030428346576e-08, + "loss": 0.1516, + "step": 12402 + }, + { + "epoch": 0.9826104179045355, + "grad_norm": 1.305931013942236, + "learning_rate": 1.578585050144965e-08, + "loss": 0.213, + "step": 12403 + }, + { + "epoch": 0.982689641513171, + "grad_norm": 1.3937899573322106, + "learning_rate": 1.564205413189468e-08, + "loss": 0.219, + "step": 12404 + }, + { + "epoch": 0.9827688651218063, + "grad_norm": 1.2300172769040794, + "learning_rate": 1.5498915184268826e-08, + "loss": 0.2192, + "step": 12405 + }, + { + "epoch": 0.9828480887304417, + "grad_norm": 1.3598427424642636, + "learning_rate": 1.5356433667996772e-08, + "loss": 0.2048, + "step": 12406 + }, + { + "epoch": 0.982927312339077, + "grad_norm": 1.2476670997547008, + "learning_rate": 1.5214609592461015e-08, + "loss": 0.1848, + "step": 12407 + }, + { + "epoch": 0.9830065359477124, + "grad_norm": 1.3477909011329958, + "learning_rate": 1.507344296699964e-08, + "loss": 0.2286, + "step": 12408 + }, + { + "epoch": 0.9830857595563478, + "grad_norm": 1.402477249128504, + "learning_rate": 1.4932933800907435e-08, + "loss": 0.2577, + "step": 12409 + }, + { + "epoch": 0.9831649831649831, + "grad_norm": 1.4452043374940193, + "learning_rate": 1.4793082103435885e-08, + "loss": 0.2927, + "step": 12410 + }, + { + "epoch": 0.9832442067736186, + "grad_norm": 1.1156485681800947, + "learning_rate": 1.4653887883794293e-08, + "loss": 0.1546, + "step": 12411 + }, + { + "epoch": 0.9833234303822539, + "grad_norm": 1.2941942429050544, + "learning_rate": 1.451535115114866e-08, + "loss": 0.2238, + "step": 12412 + }, + { + "epoch": 0.9834026539908893, + "grad_norm": 1.3692368823279657, + "learning_rate": 1.4377471914619468e-08, + "loss": 0.2268, + "step": 12413 + }, + { + "epoch": 0.9834818775995247, + "grad_norm": 1.1764729320468845, + "learning_rate": 1.424025018328612e-08, + "loss": 0.1671, + "step": 12414 + }, + { + "epoch": 0.98356110120816, + "grad_norm": 1.6737251884922335, + "learning_rate": 1.4103685966183612e-08, + "loss": 0.2855, + "step": 12415 + }, + { + "epoch": 0.9836403248167954, + "grad_norm": 1.291085058232385, + "learning_rate": 1.396777927230475e-08, + "loss": 0.1876, + "step": 12416 + }, + { + "epoch": 0.9837195484254307, + "grad_norm": 1.2580872819884064, + "learning_rate": 1.383253011059682e-08, + "loss": 0.2, + "step": 12417 + }, + { + "epoch": 0.9837987720340662, + "grad_norm": 1.1638219042848887, + "learning_rate": 1.3697938489967144e-08, + "loss": 0.1481, + "step": 12418 + }, + { + "epoch": 0.9838779956427015, + "grad_norm": 1.3406977954707553, + "learning_rate": 1.3564004419277522e-08, + "loss": 0.2469, + "step": 12419 + }, + { + "epoch": 0.983957219251337, + "grad_norm": 1.3012301114446034, + "learning_rate": 1.3430727907346453e-08, + "loss": 0.3466, + "step": 12420 + }, + { + "epoch": 0.9840364428599723, + "grad_norm": 1.6150807931230433, + "learning_rate": 1.329810896294914e-08, + "loss": 0.2725, + "step": 12421 + }, + { + "epoch": 0.9841156664686076, + "grad_norm": 1.4440963179670876, + "learning_rate": 1.3166147594818601e-08, + "loss": 0.2576, + "step": 12422 + }, + { + "epoch": 0.984194890077243, + "grad_norm": 1.38243655507372, + "learning_rate": 1.3034843811644548e-08, + "loss": 0.195, + "step": 12423 + }, + { + "epoch": 0.9842741136858784, + "grad_norm": 1.5422851378192501, + "learning_rate": 1.290419762207007e-08, + "loss": 0.2858, + "step": 12424 + }, + { + "epoch": 0.9843533372945138, + "grad_norm": 1.1204227120903085, + "learning_rate": 1.2774209034700503e-08, + "loss": 0.1734, + "step": 12425 + }, + { + "epoch": 0.9844325609031491, + "grad_norm": 1.2065474351513097, + "learning_rate": 1.2644878058093446e-08, + "loss": 0.1607, + "step": 12426 + }, + { + "epoch": 0.9845117845117846, + "grad_norm": 1.2996072629065045, + "learning_rate": 1.2516204700765422e-08, + "loss": 0.1969, + "step": 12427 + }, + { + "epoch": 0.9845910081204199, + "grad_norm": 1.386928936396314, + "learning_rate": 1.2388188971188542e-08, + "loss": 0.2429, + "step": 12428 + }, + { + "epoch": 0.9846702317290552, + "grad_norm": 1.6411707783276153, + "learning_rate": 1.2260830877792729e-08, + "loss": 0.2725, + "step": 12429 + }, + { + "epoch": 0.9847494553376906, + "grad_norm": 1.4835494526385964, + "learning_rate": 1.2134130428962387e-08, + "loss": 0.2509, + "step": 12430 + }, + { + "epoch": 0.984828678946326, + "grad_norm": 1.3389969580612024, + "learning_rate": 1.2008087633040843e-08, + "loss": 0.207, + "step": 12431 + }, + { + "epoch": 0.9849079025549614, + "grad_norm": 1.5434304870871656, + "learning_rate": 1.1882702498328125e-08, + "loss": 0.2799, + "step": 12432 + }, + { + "epoch": 0.9849871261635967, + "grad_norm": 1.209208353919731, + "learning_rate": 1.175797503307874e-08, + "loss": 0.155, + "step": 12433 + }, + { + "epoch": 0.9850663497722322, + "grad_norm": 1.406233772110184, + "learning_rate": 1.1633905245507227e-08, + "loss": 0.2149, + "step": 12434 + }, + { + "epoch": 0.9851455733808675, + "grad_norm": 1.4076835467397089, + "learning_rate": 1.1510493143782609e-08, + "loss": 0.1754, + "step": 12435 + }, + { + "epoch": 0.9852247969895028, + "grad_norm": 1.468031163373947, + "learning_rate": 1.1387738736029496e-08, + "loss": 0.2353, + "step": 12436 + }, + { + "epoch": 0.9853040205981383, + "grad_norm": 1.4114296259216852, + "learning_rate": 1.1265642030331426e-08, + "loss": 0.1832, + "step": 12437 + }, + { + "epoch": 0.9853832442067736, + "grad_norm": 1.3433495501088457, + "learning_rate": 1.114420303472974e-08, + "loss": 0.2018, + "step": 12438 + }, + { + "epoch": 0.985462467815409, + "grad_norm": 0.9853083831674087, + "learning_rate": 1.1023421757216934e-08, + "loss": 0.1178, + "step": 12439 + }, + { + "epoch": 0.9855416914240444, + "grad_norm": 1.0935194771472654, + "learning_rate": 1.090329820574887e-08, + "loss": 0.1462, + "step": 12440 + }, + { + "epoch": 0.9856209150326798, + "grad_norm": 1.3992800499309292, + "learning_rate": 1.0783832388234772e-08, + "loss": 0.283, + "step": 12441 + }, + { + "epoch": 0.9857001386413151, + "grad_norm": 1.4623034187907258, + "learning_rate": 1.0665024312539462e-08, + "loss": 0.3125, + "step": 12442 + }, + { + "epoch": 0.9857793622499504, + "grad_norm": 1.1049513044985437, + "learning_rate": 1.0546873986486682e-08, + "loss": 0.1399, + "step": 12443 + }, + { + "epoch": 0.9858585858585859, + "grad_norm": 1.2271985889529136, + "learning_rate": 1.0429381417856877e-08, + "loss": 0.2278, + "step": 12444 + }, + { + "epoch": 0.9859378094672212, + "grad_norm": 1.2211581373473384, + "learning_rate": 1.0312546614384966e-08, + "loss": 0.1739, + "step": 12445 + }, + { + "epoch": 0.9860170330758566, + "grad_norm": 1.5085648544878496, + "learning_rate": 1.0196369583763688e-08, + "loss": 0.2606, + "step": 12446 + }, + { + "epoch": 0.986096256684492, + "grad_norm": 1.3721600368115283, + "learning_rate": 1.0080850333644698e-08, + "loss": 0.192, + "step": 12447 + }, + { + "epoch": 0.9861754802931274, + "grad_norm": 1.1780353310856728, + "learning_rate": 9.965988871633025e-09, + "loss": 0.1655, + "step": 12448 + }, + { + "epoch": 0.9862547039017627, + "grad_norm": 1.3939693766988244, + "learning_rate": 9.851785205291508e-09, + "loss": 0.2347, + "step": 12449 + }, + { + "epoch": 0.9863339275103981, + "grad_norm": 1.615554871611278, + "learning_rate": 9.738239342141909e-09, + "loss": 0.2794, + "step": 12450 + }, + { + "epoch": 0.9864131511190335, + "grad_norm": 1.603367285531167, + "learning_rate": 9.625351289658247e-09, + "loss": 0.1984, + "step": 12451 + }, + { + "epoch": 0.9864923747276688, + "grad_norm": 1.4516254351938802, + "learning_rate": 9.513121055273467e-09, + "loss": 0.2268, + "step": 12452 + }, + { + "epoch": 0.9865715983363043, + "grad_norm": 1.0507069300196927, + "learning_rate": 9.401548646380543e-09, + "loss": 0.1204, + "step": 12453 + }, + { + "epoch": 0.9866508219449396, + "grad_norm": 1.2739488848925171, + "learning_rate": 9.290634070322491e-09, + "loss": 0.2212, + "step": 12454 + }, + { + "epoch": 0.986730045553575, + "grad_norm": 1.2125383514988055, + "learning_rate": 9.180377334404577e-09, + "loss": 0.1475, + "step": 12455 + }, + { + "epoch": 0.9868092691622103, + "grad_norm": 1.391328333397416, + "learning_rate": 9.070778445885442e-09, + "loss": 0.255, + "step": 12456 + }, + { + "epoch": 0.9868884927708457, + "grad_norm": 1.275436941861454, + "learning_rate": 8.961837411982643e-09, + "loss": 0.2208, + "step": 12457 + }, + { + "epoch": 0.9869677163794811, + "grad_norm": 1.2900627248811802, + "learning_rate": 8.853554239869333e-09, + "loss": 0.2339, + "step": 12458 + }, + { + "epoch": 0.9870469399881164, + "grad_norm": 1.8050407324485647, + "learning_rate": 8.745928936675363e-09, + "loss": 0.2669, + "step": 12459 + }, + { + "epoch": 0.9871261635967519, + "grad_norm": 1.3327343595256749, + "learning_rate": 8.638961509486177e-09, + "loss": 0.2097, + "step": 12460 + }, + { + "epoch": 0.9872053872053872, + "grad_norm": 1.0723984626526875, + "learning_rate": 8.53265196534725e-09, + "loss": 0.1696, + "step": 12461 + }, + { + "epoch": 0.9872846108140226, + "grad_norm": 1.4359811901078419, + "learning_rate": 8.427000311256317e-09, + "loss": 0.238, + "step": 12462 + }, + { + "epoch": 0.987363834422658, + "grad_norm": 1.1545614969400706, + "learning_rate": 8.322006554171147e-09, + "loss": 0.1606, + "step": 12463 + }, + { + "epoch": 0.9874430580312933, + "grad_norm": 1.4254148973056375, + "learning_rate": 8.217670701005098e-09, + "loss": 0.2308, + "step": 12464 + }, + { + "epoch": 0.9875222816399287, + "grad_norm": 1.4059356464485842, + "learning_rate": 8.113992758628231e-09, + "loss": 0.2048, + "step": 12465 + }, + { + "epoch": 0.987601505248564, + "grad_norm": 1.3924541739055427, + "learning_rate": 8.010972733867306e-09, + "loss": 0.2202, + "step": 12466 + }, + { + "epoch": 0.9876807288571995, + "grad_norm": 1.5624754217451045, + "learning_rate": 7.908610633504676e-09, + "loss": 0.2209, + "step": 12467 + }, + { + "epoch": 0.9877599524658348, + "grad_norm": 1.4149611229759678, + "learning_rate": 7.806906464281617e-09, + "loss": 0.18, + "step": 12468 + }, + { + "epoch": 0.9878391760744701, + "grad_norm": 1.2260131965110193, + "learning_rate": 7.70586023289388e-09, + "loss": 0.1544, + "step": 12469 + }, + { + "epoch": 0.9879183996831056, + "grad_norm": 1.5891808434540446, + "learning_rate": 7.605471945996146e-09, + "loss": 0.2488, + "step": 12470 + }, + { + "epoch": 0.9879976232917409, + "grad_norm": 1.5374319157316712, + "learning_rate": 7.50574161019757e-09, + "loss": 0.2085, + "step": 12471 + }, + { + "epoch": 0.9880768469003763, + "grad_norm": 1.2509710981086186, + "learning_rate": 7.406669232065122e-09, + "loss": 0.2224, + "step": 12472 + }, + { + "epoch": 0.9881560705090117, + "grad_norm": 1.400817291463732, + "learning_rate": 7.3082548181213635e-09, + "loss": 0.2511, + "step": 12473 + }, + { + "epoch": 0.9882352941176471, + "grad_norm": 1.2273374374997892, + "learning_rate": 7.210498374848884e-09, + "loss": 0.2638, + "step": 12474 + }, + { + "epoch": 0.9883145177262824, + "grad_norm": 1.0755615754982213, + "learning_rate": 7.113399908681429e-09, + "loss": 0.1561, + "step": 12475 + }, + { + "epoch": 0.9883937413349178, + "grad_norm": 1.204940158789142, + "learning_rate": 7.016959426013881e-09, + "loss": 0.149, + "step": 12476 + }, + { + "epoch": 0.9884729649435532, + "grad_norm": 1.346058835354687, + "learning_rate": 6.9211769331978265e-09, + "loss": 0.2227, + "step": 12477 + }, + { + "epoch": 0.9885521885521885, + "grad_norm": 1.2698331741183637, + "learning_rate": 6.8260524365371115e-09, + "loss": 0.2025, + "step": 12478 + }, + { + "epoch": 0.988631412160824, + "grad_norm": 1.7364152816557938, + "learning_rate": 6.731585942297836e-09, + "loss": 0.3232, + "step": 12479 + }, + { + "epoch": 0.9887106357694593, + "grad_norm": 1.3574165046155162, + "learning_rate": 6.637777456698358e-09, + "loss": 0.1702, + "step": 12480 + }, + { + "epoch": 0.9887898593780947, + "grad_norm": 1.6303718539574523, + "learning_rate": 6.544626985915958e-09, + "loss": 0.2278, + "step": 12481 + }, + { + "epoch": 0.98886908298673, + "grad_norm": 1.206890183157199, + "learning_rate": 6.45213453608573e-09, + "loss": 0.1769, + "step": 12482 + }, + { + "epoch": 0.9889483065953654, + "grad_norm": 1.1760049911837305, + "learning_rate": 6.360300113295026e-09, + "loss": 0.2082, + "step": 12483 + }, + { + "epoch": 0.9890275302040008, + "grad_norm": 1.2813541597013667, + "learning_rate": 6.269123723593451e-09, + "loss": 0.1837, + "step": 12484 + }, + { + "epoch": 0.9891067538126361, + "grad_norm": 1.1498826292844289, + "learning_rate": 6.178605372982871e-09, + "loss": 0.1922, + "step": 12485 + }, + { + "epoch": 0.9891859774212716, + "grad_norm": 1.7405664475519997, + "learning_rate": 6.088745067424073e-09, + "loss": 0.2988, + "step": 12486 + }, + { + "epoch": 0.9892652010299069, + "grad_norm": 1.5269990083452556, + "learning_rate": 5.9995428128334365e-09, + "loss": 0.191, + "step": 12487 + }, + { + "epoch": 0.9893444246385423, + "grad_norm": 1.5024018254912053, + "learning_rate": 5.910998615085151e-09, + "loss": 0.1996, + "step": 12488 + }, + { + "epoch": 0.9894236482471777, + "grad_norm": 1.3464939126797384, + "learning_rate": 5.8231124800089965e-09, + "loss": 0.1994, + "step": 12489 + }, + { + "epoch": 0.989502871855813, + "grad_norm": 1.2805824796965277, + "learning_rate": 5.735884413391457e-09, + "loss": 0.2178, + "step": 12490 + }, + { + "epoch": 0.9895820954644484, + "grad_norm": 1.3781739595855038, + "learning_rate": 5.6493144209768255e-09, + "loss": 0.2164, + "step": 12491 + }, + { + "epoch": 0.9896613190730837, + "grad_norm": 1.1950693183716967, + "learning_rate": 5.5634025084660985e-09, + "loss": 0.2031, + "step": 12492 + }, + { + "epoch": 0.9897405426817192, + "grad_norm": 1.0637787291335812, + "learning_rate": 5.47814868151364e-09, + "loss": 0.1474, + "step": 12493 + }, + { + "epoch": 0.9898197662903545, + "grad_norm": 1.0776936021291765, + "learning_rate": 5.393552945736069e-09, + "loss": 0.1498, + "step": 12494 + }, + { + "epoch": 0.98989898989899, + "grad_norm": 1.4013138962482272, + "learning_rate": 5.309615306701155e-09, + "loss": 0.2517, + "step": 12495 + }, + { + "epoch": 0.9899782135076253, + "grad_norm": 1.4067846097530692, + "learning_rate": 5.226335769936697e-09, + "loss": 0.2073, + "step": 12496 + }, + { + "epoch": 0.9900574371162606, + "grad_norm": 1.8322357689087203, + "learning_rate": 5.143714340926087e-09, + "loss": 0.308, + "step": 12497 + }, + { + "epoch": 0.990136660724896, + "grad_norm": 1.5401922238827737, + "learning_rate": 5.0617510251105284e-09, + "loss": 0.2772, + "step": 12498 + }, + { + "epoch": 0.9902158843335314, + "grad_norm": 1.6471308717890774, + "learning_rate": 4.980445827885705e-09, + "loss": 0.2281, + "step": 12499 + }, + { + "epoch": 0.9902951079421668, + "grad_norm": 1.3308290591423746, + "learning_rate": 4.899798754605112e-09, + "loss": 0.2108, + "step": 12500 + }, + { + "epoch": 0.9903743315508021, + "grad_norm": 1.420277492743954, + "learning_rate": 4.819809810578946e-09, + "loss": 0.2673, + "step": 12501 + }, + { + "epoch": 0.9904535551594376, + "grad_norm": 1.5495186136199472, + "learning_rate": 4.740479001076326e-09, + "loss": 0.2455, + "step": 12502 + }, + { + "epoch": 0.9905327787680729, + "grad_norm": 1.2769506170301432, + "learning_rate": 4.66180633131752e-09, + "loss": 0.2283, + "step": 12503 + }, + { + "epoch": 0.9906120023767082, + "grad_norm": 1.4813104757547562, + "learning_rate": 4.583791806485049e-09, + "loss": 0.2703, + "step": 12504 + }, + { + "epoch": 0.9906912259853436, + "grad_norm": 1.506462318180011, + "learning_rate": 4.506435431714806e-09, + "loss": 0.2793, + "step": 12505 + }, + { + "epoch": 0.990770449593979, + "grad_norm": 1.6290961485009878, + "learning_rate": 4.429737212100493e-09, + "loss": 0.2757, + "step": 12506 + }, + { + "epoch": 0.9908496732026144, + "grad_norm": 1.3324355530366954, + "learning_rate": 4.353697152692515e-09, + "loss": 0.193, + "step": 12507 + }, + { + "epoch": 0.9909288968112497, + "grad_norm": 1.2225821225844495, + "learning_rate": 4.278315258496868e-09, + "loss": 0.1683, + "step": 12508 + }, + { + "epoch": 0.9910081204198852, + "grad_norm": 1.5974530087833683, + "learning_rate": 4.203591534478468e-09, + "loss": 0.2666, + "step": 12509 + }, + { + "epoch": 0.9910873440285205, + "grad_norm": 1.489577522516601, + "learning_rate": 4.129525985556715e-09, + "loss": 0.1789, + "step": 12510 + }, + { + "epoch": 0.9911665676371558, + "grad_norm": 1.2444406845720806, + "learning_rate": 4.056118616608817e-09, + "loss": 0.1616, + "step": 12511 + }, + { + "epoch": 0.9912457912457913, + "grad_norm": 1.51248730938774, + "learning_rate": 3.9833694324686864e-09, + "loss": 0.2074, + "step": 12512 + }, + { + "epoch": 0.9913250148544266, + "grad_norm": 1.3547145750551723, + "learning_rate": 3.9112784379247145e-09, + "loss": 0.339, + "step": 12513 + }, + { + "epoch": 0.991404238463062, + "grad_norm": 1.1830324648650488, + "learning_rate": 3.839845637725326e-09, + "loss": 0.1624, + "step": 12514 + }, + { + "epoch": 0.9914834620716974, + "grad_norm": 1.1104159225502654, + "learning_rate": 3.769071036573424e-09, + "loss": 0.1302, + "step": 12515 + }, + { + "epoch": 0.9915626856803328, + "grad_norm": 1.244599216165791, + "learning_rate": 3.698954639129726e-09, + "loss": 0.2414, + "step": 12516 + }, + { + "epoch": 0.9916419092889681, + "grad_norm": 1.4195685557322446, + "learning_rate": 3.6294964500116492e-09, + "loss": 0.2579, + "step": 12517 + }, + { + "epoch": 0.9917211328976034, + "grad_norm": 1.3235364134475698, + "learning_rate": 3.560696473789982e-09, + "loss": 0.2403, + "step": 12518 + }, + { + "epoch": 0.9918003565062389, + "grad_norm": 1.3700588990464042, + "learning_rate": 3.4925547149977645e-09, + "loss": 0.1979, + "step": 12519 + }, + { + "epoch": 0.9918795801148742, + "grad_norm": 1.3963926339025485, + "learning_rate": 3.425071178120298e-09, + "loss": 0.2536, + "step": 12520 + }, + { + "epoch": 0.9919588037235096, + "grad_norm": 0.9766851949468118, + "learning_rate": 3.3582458676018058e-09, + "loss": 0.1561, + "step": 12521 + }, + { + "epoch": 0.992038027332145, + "grad_norm": 1.432457159825864, + "learning_rate": 3.292078787842101e-09, + "loss": 0.2354, + "step": 12522 + }, + { + "epoch": 0.9921172509407804, + "grad_norm": 1.1668504643305284, + "learning_rate": 3.226569943197699e-09, + "loss": 0.1651, + "step": 12523 + }, + { + "epoch": 0.9921964745494157, + "grad_norm": 1.5761229671699646, + "learning_rate": 3.1617193379818167e-09, + "loss": 0.2292, + "step": 12524 + }, + { + "epoch": 0.9922756981580511, + "grad_norm": 1.1865416227179866, + "learning_rate": 3.0975269764654816e-09, + "loss": 0.1749, + "step": 12525 + }, + { + "epoch": 0.9923549217666865, + "grad_norm": 1.6039717782862073, + "learning_rate": 3.033992862875312e-09, + "loss": 0.2588, + "step": 12526 + }, + { + "epoch": 0.9924341453753218, + "grad_norm": 1.2687919021186191, + "learning_rate": 2.9711170013935196e-09, + "loss": 0.1949, + "step": 12527 + }, + { + "epoch": 0.9925133689839573, + "grad_norm": 1.4537660251490192, + "learning_rate": 2.9088993961612355e-09, + "loss": 0.2499, + "step": 12528 + }, + { + "epoch": 0.9925925925925926, + "grad_norm": 0.9534090552544947, + "learning_rate": 2.8473400512762928e-09, + "loss": 0.1255, + "step": 12529 + }, + { + "epoch": 0.992671816201228, + "grad_norm": 1.253448052631948, + "learning_rate": 2.7864389707887853e-09, + "loss": 0.1799, + "step": 12530 + }, + { + "epoch": 0.9927510398098633, + "grad_norm": 1.211191951328995, + "learning_rate": 2.726196158712169e-09, + "loss": 0.1381, + "step": 12531 + }, + { + "epoch": 0.9928302634184987, + "grad_norm": 1.2675340787779716, + "learning_rate": 2.66661161901105e-09, + "loss": 0.1947, + "step": 12532 + }, + { + "epoch": 0.9929094870271341, + "grad_norm": 1.4126343689953031, + "learning_rate": 2.607685355610068e-09, + "loss": 0.2765, + "step": 12533 + }, + { + "epoch": 0.9929887106357694, + "grad_norm": 1.4856731382979294, + "learning_rate": 2.549417372388341e-09, + "loss": 0.2855, + "step": 12534 + }, + { + "epoch": 0.9930679342444049, + "grad_norm": 1.4146051243797273, + "learning_rate": 2.4918076731828e-09, + "loss": 0.2223, + "step": 12535 + }, + { + "epoch": 0.9931471578530402, + "grad_norm": 1.2259398907907308, + "learning_rate": 2.434856261785967e-09, + "loss": 0.1704, + "step": 12536 + }, + { + "epoch": 0.9932263814616756, + "grad_norm": 1.2589088474837895, + "learning_rate": 2.378563141949286e-09, + "loss": 0.1738, + "step": 12537 + }, + { + "epoch": 0.993305605070311, + "grad_norm": 1.1551345494029515, + "learning_rate": 2.322928317378681e-09, + "loss": 0.1906, + "step": 12538 + }, + { + "epoch": 0.9933848286789463, + "grad_norm": 1.3547682596441615, + "learning_rate": 2.267951791737888e-09, + "loss": 0.2077, + "step": 12539 + }, + { + "epoch": 0.9934640522875817, + "grad_norm": 1.7153493443236285, + "learning_rate": 2.213633568646234e-09, + "loss": 0.3191, + "step": 12540 + }, + { + "epoch": 0.993543275896217, + "grad_norm": 1.1419776843512282, + "learning_rate": 2.1599736516808577e-09, + "loss": 0.1814, + "step": 12541 + }, + { + "epoch": 0.9936224995048525, + "grad_norm": 1.0637268349366085, + "learning_rate": 2.106972044373379e-09, + "loss": 0.1589, + "step": 12542 + }, + { + "epoch": 0.9937017231134878, + "grad_norm": 1.727835615080682, + "learning_rate": 2.0546287502165583e-09, + "loss": 0.2757, + "step": 12543 + }, + { + "epoch": 0.9937809467221231, + "grad_norm": 1.3561804102516777, + "learning_rate": 2.002943772654309e-09, + "loss": 0.2229, + "step": 12544 + }, + { + "epoch": 0.9938601703307586, + "grad_norm": 1.4376269485878115, + "learning_rate": 1.951917115091684e-09, + "loss": 0.2939, + "step": 12545 + }, + { + "epoch": 0.9939393939393939, + "grad_norm": 1.6512973400898159, + "learning_rate": 1.901548780887108e-09, + "loss": 0.2115, + "step": 12546 + }, + { + "epoch": 0.9940186175480293, + "grad_norm": 1.320015208949776, + "learning_rate": 1.851838773357928e-09, + "loss": 0.2318, + "step": 12547 + }, + { + "epoch": 0.9940978411566647, + "grad_norm": 1.198533008666455, + "learning_rate": 1.8027870957781912e-09, + "loss": 0.1648, + "step": 12548 + }, + { + "epoch": 0.9941770647653001, + "grad_norm": 1.2946999466391675, + "learning_rate": 1.7543937513753161e-09, + "loss": 0.2149, + "step": 12549 + }, + { + "epoch": 0.9942562883739354, + "grad_norm": 1.4405576583346942, + "learning_rate": 1.7066587433378634e-09, + "loss": 0.2208, + "step": 12550 + }, + { + "epoch": 0.9943355119825708, + "grad_norm": 1.5084074757572192, + "learning_rate": 1.659582074807764e-09, + "loss": 0.1872, + "step": 12551 + }, + { + "epoch": 0.9944147355912062, + "grad_norm": 1.4747730778220889, + "learning_rate": 1.6131637488858708e-09, + "loss": 0.1867, + "step": 12552 + }, + { + "epoch": 0.9944939591998415, + "grad_norm": 1.268587688193285, + "learning_rate": 1.5674037686275178e-09, + "loss": 0.2054, + "step": 12553 + }, + { + "epoch": 0.994573182808477, + "grad_norm": 1.2447409302626689, + "learning_rate": 1.5223021370458502e-09, + "loss": 0.239, + "step": 12554 + }, + { + "epoch": 0.9946524064171123, + "grad_norm": 1.3362271588865342, + "learning_rate": 1.4778588571107144e-09, + "loss": 0.1694, + "step": 12555 + }, + { + "epoch": 0.9947316300257477, + "grad_norm": 1.830965598598806, + "learning_rate": 1.4340739317497688e-09, + "loss": 0.3172, + "step": 12556 + }, + { + "epoch": 0.994810853634383, + "grad_norm": 1.6419578688167966, + "learning_rate": 1.390947363845152e-09, + "loss": 0.3192, + "step": 12557 + }, + { + "epoch": 0.9948900772430184, + "grad_norm": 1.142639020114391, + "learning_rate": 1.3484791562357048e-09, + "loss": 0.1564, + "step": 12558 + }, + { + "epoch": 0.9949693008516538, + "grad_norm": 1.5160810611758526, + "learning_rate": 1.3066693117191886e-09, + "loss": 0.2822, + "step": 12559 + }, + { + "epoch": 0.9950485244602891, + "grad_norm": 1.3972647357200154, + "learning_rate": 1.2655178330467366e-09, + "loss": 0.22, + "step": 12560 + }, + { + "epoch": 0.9951277480689246, + "grad_norm": 1.4662781990439455, + "learning_rate": 1.2250247229295132e-09, + "loss": 0.3091, + "step": 12561 + }, + { + "epoch": 0.9952069716775599, + "grad_norm": 1.1682890671085937, + "learning_rate": 1.185189984034274e-09, + "loss": 0.1672, + "step": 12562 + }, + { + "epoch": 0.9952861952861953, + "grad_norm": 1.1987516313351618, + "learning_rate": 1.1460136189822556e-09, + "loss": 0.148, + "step": 12563 + }, + { + "epoch": 0.9953654188948307, + "grad_norm": 1.4186857466682117, + "learning_rate": 1.1074956303536165e-09, + "loss": 0.2453, + "step": 12564 + }, + { + "epoch": 0.995444642503466, + "grad_norm": 1.1778637829024925, + "learning_rate": 1.0696360206852162e-09, + "loss": 0.2096, + "step": 12565 + }, + { + "epoch": 0.9955238661121014, + "grad_norm": 1.3644194785082215, + "learning_rate": 1.0324347924695055e-09, + "loss": 0.2905, + "step": 12566 + }, + { + "epoch": 0.9956030897207367, + "grad_norm": 1.2889824217233188, + "learning_rate": 9.958919481556362e-10, + "loss": 0.172, + "step": 12567 + }, + { + "epoch": 0.9956823133293722, + "grad_norm": 1.5109538498877142, + "learning_rate": 9.600074901505718e-10, + "loss": 0.2479, + "step": 12568 + }, + { + "epoch": 0.9957615369380075, + "grad_norm": 1.6098756681008, + "learning_rate": 9.24781420816867e-10, + "loss": 0.272, + "step": 12569 + }, + { + "epoch": 0.995840760546643, + "grad_norm": 1.5930147949606568, + "learning_rate": 8.902137424726675e-10, + "loss": 0.2082, + "step": 12570 + }, + { + "epoch": 0.9959199841552783, + "grad_norm": 1.6230997570751065, + "learning_rate": 8.56304457396151e-10, + "loss": 0.2867, + "step": 12571 + }, + { + "epoch": 0.9959992077639136, + "grad_norm": 1.5188729303389858, + "learning_rate": 8.230535678188656e-10, + "loss": 0.2123, + "step": 12572 + }, + { + "epoch": 0.996078431372549, + "grad_norm": 1.2508980642777277, + "learning_rate": 7.904610759312814e-10, + "loss": 0.1552, + "step": 12573 + }, + { + "epoch": 0.9961576549811844, + "grad_norm": 1.4361496615147935, + "learning_rate": 7.585269838783494e-10, + "loss": 0.3118, + "step": 12574 + }, + { + "epoch": 0.9962368785898198, + "grad_norm": 1.1542101467217232, + "learning_rate": 7.272512937628318e-10, + "loss": 0.1722, + "step": 12575 + }, + { + "epoch": 0.9963161021984551, + "grad_norm": 1.278491052498372, + "learning_rate": 6.966340076441924e-10, + "loss": 0.2072, + "step": 12576 + }, + { + "epoch": 0.9963953258070906, + "grad_norm": 1.1746040879529536, + "learning_rate": 6.666751275385963e-10, + "loss": 0.1828, + "step": 12577 + }, + { + "epoch": 0.9964745494157259, + "grad_norm": 1.4330130828120105, + "learning_rate": 6.3737465542002e-10, + "loss": 0.241, + "step": 12578 + }, + { + "epoch": 0.9965537730243612, + "grad_norm": 1.4271566692606346, + "learning_rate": 6.087325932147003e-10, + "loss": 0.24, + "step": 12579 + }, + { + "epoch": 0.9966329966329966, + "grad_norm": 1.3332745611384074, + "learning_rate": 5.807489428111268e-10, + "loss": 0.2682, + "step": 12580 + }, + { + "epoch": 0.996712220241632, + "grad_norm": 1.314716955830647, + "learning_rate": 5.534237060511594e-10, + "loss": 0.1759, + "step": 12581 + }, + { + "epoch": 0.9967914438502674, + "grad_norm": 2.2100600515436515, + "learning_rate": 5.267568847344695e-10, + "loss": 0.2297, + "step": 12582 + }, + { + "epoch": 0.9968706674589027, + "grad_norm": 1.7602103323485883, + "learning_rate": 5.007484806152097e-10, + "loss": 0.3064, + "step": 12583 + }, + { + "epoch": 0.9969498910675382, + "grad_norm": 1.426143301755599, + "learning_rate": 4.753984954086743e-10, + "loss": 0.2395, + "step": 12584 + }, + { + "epoch": 0.9970291146761735, + "grad_norm": 1.211768410738935, + "learning_rate": 4.5070693078130834e-10, + "loss": 0.1538, + "step": 12585 + }, + { + "epoch": 0.9971083382848088, + "grad_norm": 1.5085316651191683, + "learning_rate": 4.266737883606986e-10, + "loss": 0.2579, + "step": 12586 + }, + { + "epoch": 0.9971875618934443, + "grad_norm": 1.2719726097229966, + "learning_rate": 4.0329906972780276e-10, + "loss": 0.1893, + "step": 12587 + }, + { + "epoch": 0.9972667855020796, + "grad_norm": 1.3224030935410247, + "learning_rate": 3.805827764236103e-10, + "loss": 0.224, + "step": 12588 + }, + { + "epoch": 0.997346009110715, + "grad_norm": 1.2289755597251892, + "learning_rate": 3.585249099435917e-10, + "loss": 0.2635, + "step": 12589 + }, + { + "epoch": 0.9974252327193504, + "grad_norm": 1.3221490130214644, + "learning_rate": 3.3712547173769816e-10, + "loss": 0.2204, + "step": 12590 + }, + { + "epoch": 0.9975044563279858, + "grad_norm": 1.4595330106386109, + "learning_rate": 3.163844632181334e-10, + "loss": 0.2284, + "step": 12591 + }, + { + "epoch": 0.9975836799366211, + "grad_norm": 1.2733603141038226, + "learning_rate": 2.963018857493616e-10, + "loss": 0.1997, + "step": 12592 + }, + { + "epoch": 0.9976629035452564, + "grad_norm": 1.2851104650545306, + "learning_rate": 2.7687774065254804e-10, + "loss": 0.1541, + "step": 12593 + }, + { + "epoch": 0.9977421271538919, + "grad_norm": 1.2610574675323607, + "learning_rate": 2.581120292077799e-10, + "loss": 0.2074, + "step": 12594 + }, + { + "epoch": 0.9978213507625272, + "grad_norm": 1.3439470654486827, + "learning_rate": 2.400047526518456e-10, + "loss": 0.2202, + "step": 12595 + }, + { + "epoch": 0.9979005743711626, + "grad_norm": 1.4223196445066777, + "learning_rate": 2.2255591217490437e-10, + "loss": 0.2449, + "step": 12596 + }, + { + "epoch": 0.997979797979798, + "grad_norm": 1.169471863088717, + "learning_rate": 2.057655089271471e-10, + "loss": 0.1459, + "step": 12597 + }, + { + "epoch": 0.9980590215884334, + "grad_norm": 1.8938312357980984, + "learning_rate": 1.8963354401324575e-10, + "loss": 0.3463, + "step": 12598 + }, + { + "epoch": 0.9981382451970687, + "grad_norm": 1.5770285442906824, + "learning_rate": 1.74160018496794e-10, + "loss": 0.2438, + "step": 12599 + }, + { + "epoch": 0.9982174688057041, + "grad_norm": 1.2867503443460573, + "learning_rate": 1.593449333947561e-10, + "loss": 0.2098, + "step": 12600 + }, + { + "epoch": 0.9982966924143395, + "grad_norm": 1.4130448920366796, + "learning_rate": 1.4518828968523857e-10, + "loss": 0.2302, + "step": 12601 + }, + { + "epoch": 0.9983759160229748, + "grad_norm": 1.1190835363853238, + "learning_rate": 1.3169008829749808e-10, + "loss": 0.2064, + "step": 12602 + }, + { + "epoch": 0.9984551396316103, + "grad_norm": 1.307888018530365, + "learning_rate": 1.1885033012193348e-10, + "loss": 0.1674, + "step": 12603 + }, + { + "epoch": 0.9985343632402456, + "grad_norm": 1.193597720975357, + "learning_rate": 1.0666901600453473e-10, + "loss": 0.1736, + "step": 12604 + }, + { + "epoch": 0.998613586848881, + "grad_norm": 1.392658771335661, + "learning_rate": 9.51461467457726e-11, + "loss": 0.2241, + "step": 12605 + }, + { + "epoch": 0.9986928104575163, + "grad_norm": 1.25744168470611, + "learning_rate": 8.428172310503968e-11, + "loss": 0.2231, + "step": 12606 + }, + { + "epoch": 0.9987720340661517, + "grad_norm": 1.1388855690170891, + "learning_rate": 7.40757457984298e-11, + "loss": 0.1712, + "step": 12607 + }, + { + "epoch": 0.9988512576747871, + "grad_norm": 1.7328102242920655, + "learning_rate": 6.452821549651766e-11, + "loss": 0.3831, + "step": 12608 + }, + { + "epoch": 0.9989304812834224, + "grad_norm": 1.5719729743532507, + "learning_rate": 5.563913282990996e-11, + "loss": 0.2842, + "step": 12609 + }, + { + "epoch": 0.9990097048920579, + "grad_norm": 1.557877226047442, + "learning_rate": 4.7408498381473765e-11, + "loss": 0.339, + "step": 12610 + }, + { + "epoch": 0.9990889285006932, + "grad_norm": 1.4930107277820386, + "learning_rate": 3.983631269521837e-11, + "loss": 0.226, + "step": 12611 + }, + { + "epoch": 0.9991681521093286, + "grad_norm": 1.228601240977321, + "learning_rate": 3.292257626963391e-11, + "loss": 0.2042, + "step": 12612 + }, + { + "epoch": 0.999247375717964, + "grad_norm": 1.206532423282528, + "learning_rate": 2.6667289557691378e-11, + "loss": 0.2225, + "step": 12613 + }, + { + "epoch": 0.9993265993265993, + "grad_norm": 1.245138462654191, + "learning_rate": 2.1070452974614187e-11, + "loss": 0.2038, + "step": 12614 + }, + { + "epoch": 0.9994058229352347, + "grad_norm": 1.6390682665890544, + "learning_rate": 1.6132066886775932e-11, + "loss": 0.2894, + "step": 12615 + }, + { + "epoch": 0.99948504654387, + "grad_norm": 1.1032165949364316, + "learning_rate": 1.1852131619471963e-11, + "loss": 0.1323, + "step": 12616 + }, + { + "epoch": 0.9995642701525055, + "grad_norm": 1.3807418347483535, + "learning_rate": 8.230647454698926e-12, + "loss": 0.2222, + "step": 12617 + }, + { + "epoch": 0.9996434937611408, + "grad_norm": 1.674992983819206, + "learning_rate": 5.267614631154772e-12, + "loss": 0.3302, + "step": 12618 + }, + { + "epoch": 0.9997227173697762, + "grad_norm": 1.412962469950124, + "learning_rate": 2.9630333442387525e-12, + "loss": 0.1729, + "step": 12619 + }, + { + "epoch": 0.9998019409784116, + "grad_norm": 1.618717443879094, + "learning_rate": 1.3169037449412004e-12, + "loss": 0.2662, + "step": 12620 + }, + { + "epoch": 0.9998811645870469, + "grad_norm": 1.429308333073465, + "learning_rate": 3.29225942063971e-13, + "loss": 0.2245, + "step": 12621 + }, + { + "epoch": 0.9999603881956823, + "grad_norm": 1.335405194521231, + "learning_rate": 0.0, + "loss": 0.2444, + "step": 12622 + }, + { + "epoch": 0.9999603881956823, + "step": 12622, + "total_flos": 6735811612704768.0, + "train_loss": 0.3364984280516995, + "train_runtime": 50045.9274, + "train_samples_per_second": 32.284, + "train_steps_per_second": 0.252 + } + ], + "logging_steps": 1.0, + "max_steps": 12622, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6735811612704768.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}