{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3606, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 2.829824447631836, "eval_runtime": 408.1353, "eval_samples_per_second": 100.489, "eval_steps_per_second": 1.571, "step": 0 }, { "epoch": 0.00027731558513588466, "grad_norm": 46.86890411376953, "learning_rate": 0.0, "loss": 2.4417, "step": 1 }, { "epoch": 0.0005546311702717693, "grad_norm": 69.00373077392578, "learning_rate": 2.5000000000000004e-07, "loss": 2.3396, "step": 2 }, { "epoch": 0.0008319467554076539, "grad_norm": 53.255943298339844, "learning_rate": 5.000000000000001e-07, "loss": 2.3484, "step": 3 }, { "epoch": 0.0011092623405435386, "grad_norm": 21.859601974487305, "learning_rate": 7.5e-07, "loss": 2.3949, "step": 4 }, { "epoch": 0.0013865779256794233, "grad_norm": 18.474227905273438, "learning_rate": 1.0000000000000002e-06, "loss": 2.3011, "step": 5 }, { "epoch": 0.0016638935108153079, "grad_norm": 18.92083740234375, "learning_rate": 1.25e-06, "loss": 2.1921, "step": 6 }, { "epoch": 0.0019412090959511925, "grad_norm": 17.222856521606445, "learning_rate": 1.5e-06, "loss": 2.2153, "step": 7 }, { "epoch": 0.0022185246810870773, "grad_norm": 31.592514038085938, "learning_rate": 1.7500000000000002e-06, "loss": 2.0413, "step": 8 }, { "epoch": 0.0024958402662229617, "grad_norm": 17.17295265197754, "learning_rate": 2.0000000000000003e-06, "loss": 2.0861, "step": 9 }, { "epoch": 0.0027731558513588465, "grad_norm": 17.179834365844727, "learning_rate": 2.25e-06, "loss": 2.0175, "step": 10 }, { "epoch": 0.003050471436494731, "grad_norm": 17.518646240234375, "learning_rate": 2.5e-06, "loss": 1.9135, "step": 11 }, { "epoch": 0.0033277870216306157, "grad_norm": 14.855606079101562, "learning_rate": 2.7500000000000004e-06, "loss": 1.7351, "step": 12 }, { "epoch": 0.0036051026067665, "grad_norm": 13.514114379882812, "learning_rate": 3e-06, "loss": 1.5465, "step": 13 }, { "epoch": 0.003882418191902385, "grad_norm": 12.117414474487305, "learning_rate": 3.2500000000000002e-06, "loss": 1.4642, "step": 14 }, { "epoch": 0.004159733777038269, "grad_norm": 10.935081481933594, "learning_rate": 3.5000000000000004e-06, "loss": 1.3851, "step": 15 }, { "epoch": 0.004437049362174155, "grad_norm": 7.275962829589844, "learning_rate": 3.75e-06, "loss": 1.1289, "step": 16 }, { "epoch": 0.004714364947310039, "grad_norm": 6.399021148681641, "learning_rate": 4.000000000000001e-06, "loss": 1.1277, "step": 17 }, { "epoch": 0.004991680532445923, "grad_norm": 6.132956027984619, "learning_rate": 4.250000000000001e-06, "loss": 1.1483, "step": 18 }, { "epoch": 0.005268996117581808, "grad_norm": 5.525564670562744, "learning_rate": 4.5e-06, "loss": 1.0578, "step": 19 }, { "epoch": 0.005546311702717693, "grad_norm": 5.441694259643555, "learning_rate": 4.75e-06, "loss": 1.0647, "step": 20 }, { "epoch": 0.005823627287853577, "grad_norm": 5.160792827606201, "learning_rate": 5e-06, "loss": 0.9961, "step": 21 }, { "epoch": 0.006100942872989462, "grad_norm": 5.569485664367676, "learning_rate": 5.25e-06, "loss": 1.0063, "step": 22 }, { "epoch": 0.006378258458125347, "grad_norm": 4.869104385375977, "learning_rate": 5.500000000000001e-06, "loss": 0.9848, "step": 23 }, { "epoch": 0.0066555740432612314, "grad_norm": 3.172858238220215, "learning_rate": 5.750000000000001e-06, "loss": 0.9317, "step": 24 }, { "epoch": 0.006932889628397116, "grad_norm": 2.5935134887695312, "learning_rate": 6e-06, "loss": 0.9086, "step": 25 }, { "epoch": 0.007210205213533, "grad_norm": 1.7386329174041748, "learning_rate": 6.25e-06, "loss": 0.8976, "step": 26 }, { "epoch": 0.0074875207986688855, "grad_norm": 1.3860479593276978, "learning_rate": 6.5000000000000004e-06, "loss": 0.8668, "step": 27 }, { "epoch": 0.00776483638380477, "grad_norm": 1.3284790515899658, "learning_rate": 6.750000000000001e-06, "loss": 0.8438, "step": 28 }, { "epoch": 0.008042151968940654, "grad_norm": 1.208060383796692, "learning_rate": 7.000000000000001e-06, "loss": 0.8601, "step": 29 }, { "epoch": 0.008319467554076539, "grad_norm": 0.99210125207901, "learning_rate": 7.25e-06, "loss": 0.836, "step": 30 }, { "epoch": 0.008596783139212423, "grad_norm": 0.7937288284301758, "learning_rate": 7.5e-06, "loss": 0.829, "step": 31 }, { "epoch": 0.00887409872434831, "grad_norm": 0.706200361251831, "learning_rate": 7.75e-06, "loss": 0.8006, "step": 32 }, { "epoch": 0.009151414309484194, "grad_norm": 0.9658659100532532, "learning_rate": 8.000000000000001e-06, "loss": 0.7989, "step": 33 }, { "epoch": 0.009428729894620078, "grad_norm": 1.174869418144226, "learning_rate": 8.25e-06, "loss": 0.8159, "step": 34 }, { "epoch": 0.009706045479755962, "grad_norm": 0.5839990973472595, "learning_rate": 8.500000000000002e-06, "loss": 0.7837, "step": 35 }, { "epoch": 0.009983361064891847, "grad_norm": 0.6130610704421997, "learning_rate": 8.75e-06, "loss": 0.7799, "step": 36 }, { "epoch": 0.010260676650027731, "grad_norm": 0.7108742594718933, "learning_rate": 9e-06, "loss": 0.7611, "step": 37 }, { "epoch": 0.010537992235163616, "grad_norm": 0.7302682995796204, "learning_rate": 9.25e-06, "loss": 0.7942, "step": 38 }, { "epoch": 0.010815307820299502, "grad_norm": 0.5843620896339417, "learning_rate": 9.5e-06, "loss": 0.765, "step": 39 }, { "epoch": 0.011092623405435386, "grad_norm": 0.41768163442611694, "learning_rate": 9.750000000000002e-06, "loss": 0.7112, "step": 40 }, { "epoch": 0.01136993899057127, "grad_norm": 0.5103988647460938, "learning_rate": 1e-05, "loss": 0.762, "step": 41 }, { "epoch": 0.011647254575707155, "grad_norm": 0.47128552198410034, "learning_rate": 1.025e-05, "loss": 0.7509, "step": 42 }, { "epoch": 0.01192457016084304, "grad_norm": 1.0201480388641357, "learning_rate": 1.05e-05, "loss": 0.7507, "step": 43 }, { "epoch": 0.012201885745978924, "grad_norm": 0.3908264935016632, "learning_rate": 1.075e-05, "loss": 0.7629, "step": 44 }, { "epoch": 0.012479201331114808, "grad_norm": 0.4154920279979706, "learning_rate": 1.1000000000000001e-05, "loss": 0.7531, "step": 45 }, { "epoch": 0.012756516916250694, "grad_norm": 0.4213290512561798, "learning_rate": 1.125e-05, "loss": 0.7472, "step": 46 }, { "epoch": 0.013033832501386578, "grad_norm": 0.6245641112327576, "learning_rate": 1.1500000000000002e-05, "loss": 0.7404, "step": 47 }, { "epoch": 0.013311148086522463, "grad_norm": 0.44496941566467285, "learning_rate": 1.175e-05, "loss": 0.7427, "step": 48 }, { "epoch": 0.013588463671658347, "grad_norm": 0.4155629575252533, "learning_rate": 1.2e-05, "loss": 0.7147, "step": 49 }, { "epoch": 0.013865779256794232, "grad_norm": 0.37920621037483215, "learning_rate": 1.225e-05, "loss": 0.7426, "step": 50 }, { "epoch": 0.014143094841930116, "grad_norm": 0.3893055319786072, "learning_rate": 1.25e-05, "loss": 0.7176, "step": 51 }, { "epoch": 0.014420410427066, "grad_norm": 0.3363882005214691, "learning_rate": 1.2750000000000002e-05, "loss": 0.7063, "step": 52 }, { "epoch": 0.014697726012201887, "grad_norm": 0.4444830119609833, "learning_rate": 1.3000000000000001e-05, "loss": 0.6945, "step": 53 }, { "epoch": 0.014975041597337771, "grad_norm": 0.3413512706756592, "learning_rate": 1.3250000000000002e-05, "loss": 0.7288, "step": 54 }, { "epoch": 0.015252357182473655, "grad_norm": 0.4114389717578888, "learning_rate": 1.3500000000000001e-05, "loss": 0.7575, "step": 55 }, { "epoch": 0.01552967276760954, "grad_norm": 0.36049914360046387, "learning_rate": 1.3750000000000002e-05, "loss": 0.7217, "step": 56 }, { "epoch": 0.015806988352745424, "grad_norm": 0.41267284750938416, "learning_rate": 1.4000000000000001e-05, "loss": 0.7166, "step": 57 }, { "epoch": 0.01608430393788131, "grad_norm": 0.4639422297477722, "learning_rate": 1.4249999999999999e-05, "loss": 0.7069, "step": 58 }, { "epoch": 0.016361619523017193, "grad_norm": 0.36772483587265015, "learning_rate": 1.45e-05, "loss": 0.7247, "step": 59 }, { "epoch": 0.016638935108153077, "grad_norm": 0.3546575903892517, "learning_rate": 1.475e-05, "loss": 0.7128, "step": 60 }, { "epoch": 0.01691625069328896, "grad_norm": 0.31919416785240173, "learning_rate": 1.5e-05, "loss": 0.7207, "step": 61 }, { "epoch": 0.017193566278424846, "grad_norm": 0.3498699367046356, "learning_rate": 1.525e-05, "loss": 0.7065, "step": 62 }, { "epoch": 0.01747088186356073, "grad_norm": 0.35648590326309204, "learning_rate": 1.55e-05, "loss": 0.7146, "step": 63 }, { "epoch": 0.01774819744869662, "grad_norm": 0.30697041749954224, "learning_rate": 1.575e-05, "loss": 0.6805, "step": 64 }, { "epoch": 0.018025513033832503, "grad_norm": 0.5759001970291138, "learning_rate": 1.6000000000000003e-05, "loss": 0.7267, "step": 65 }, { "epoch": 0.018302828618968387, "grad_norm": 0.263336718082428, "learning_rate": 1.6250000000000002e-05, "loss": 0.6936, "step": 66 }, { "epoch": 0.01858014420410427, "grad_norm": 0.2977915108203888, "learning_rate": 1.65e-05, "loss": 0.6685, "step": 67 }, { "epoch": 0.018857459789240156, "grad_norm": 0.3028334081172943, "learning_rate": 1.675e-05, "loss": 0.7109, "step": 68 }, { "epoch": 0.01913477537437604, "grad_norm": 0.3265489935874939, "learning_rate": 1.7000000000000003e-05, "loss": 0.7023, "step": 69 }, { "epoch": 0.019412090959511925, "grad_norm": 0.2899531126022339, "learning_rate": 1.725e-05, "loss": 0.6985, "step": 70 }, { "epoch": 0.01968940654464781, "grad_norm": 0.29272034764289856, "learning_rate": 1.75e-05, "loss": 0.7125, "step": 71 }, { "epoch": 0.019966722129783693, "grad_norm": 0.3114602863788605, "learning_rate": 1.775e-05, "loss": 0.6836, "step": 72 }, { "epoch": 0.020244037714919578, "grad_norm": 0.28768229484558105, "learning_rate": 1.8e-05, "loss": 0.6809, "step": 73 }, { "epoch": 0.020521353300055462, "grad_norm": 0.270345002412796, "learning_rate": 1.825e-05, "loss": 0.6776, "step": 74 }, { "epoch": 0.020798668885191347, "grad_norm": 0.2635841369628906, "learning_rate": 1.85e-05, "loss": 0.6645, "step": 75 }, { "epoch": 0.02107598447032723, "grad_norm": 0.3204723000526428, "learning_rate": 1.8750000000000002e-05, "loss": 0.6631, "step": 76 }, { "epoch": 0.021353300055463115, "grad_norm": 0.27179455757141113, "learning_rate": 1.9e-05, "loss": 0.693, "step": 77 }, { "epoch": 0.021630615640599003, "grad_norm": 0.26558464765548706, "learning_rate": 1.925e-05, "loss": 0.6715, "step": 78 }, { "epoch": 0.021907931225734888, "grad_norm": 0.3682558834552765, "learning_rate": 1.9500000000000003e-05, "loss": 0.6826, "step": 79 }, { "epoch": 0.022185246810870772, "grad_norm": 0.281429648399353, "learning_rate": 1.9750000000000002e-05, "loss": 0.6599, "step": 80 }, { "epoch": 0.022462562396006656, "grad_norm": 0.28537672758102417, "learning_rate": 2e-05, "loss": 0.6695, "step": 81 }, { "epoch": 0.02273987798114254, "grad_norm": 0.274913489818573, "learning_rate": 2.025e-05, "loss": 0.678, "step": 82 }, { "epoch": 0.023017193566278425, "grad_norm": 0.28847208619117737, "learning_rate": 2.05e-05, "loss": 0.669, "step": 83 }, { "epoch": 0.02329450915141431, "grad_norm": 0.30678853392601013, "learning_rate": 2.075e-05, "loss": 0.6647, "step": 84 }, { "epoch": 0.023571824736550194, "grad_norm": 0.28266021609306335, "learning_rate": 2.1e-05, "loss": 0.671, "step": 85 }, { "epoch": 0.02384914032168608, "grad_norm": 0.2712315320968628, "learning_rate": 2.125e-05, "loss": 0.6579, "step": 86 }, { "epoch": 0.024126455906821963, "grad_norm": 0.30666086077690125, "learning_rate": 2.15e-05, "loss": 0.6598, "step": 87 }, { "epoch": 0.024403771491957847, "grad_norm": 0.257932186126709, "learning_rate": 2.175e-05, "loss": 0.6348, "step": 88 }, { "epoch": 0.02468108707709373, "grad_norm": 0.3133629560470581, "learning_rate": 2.2000000000000003e-05, "loss": 0.6611, "step": 89 }, { "epoch": 0.024958402662229616, "grad_norm": 0.27258774638175964, "learning_rate": 2.2250000000000002e-05, "loss": 0.6736, "step": 90 }, { "epoch": 0.0252357182473655, "grad_norm": 0.3201597034931183, "learning_rate": 2.25e-05, "loss": 0.6918, "step": 91 }, { "epoch": 0.025513033832501388, "grad_norm": 0.26909735798835754, "learning_rate": 2.275e-05, "loss": 0.6738, "step": 92 }, { "epoch": 0.025790349417637273, "grad_norm": 0.40945449471473694, "learning_rate": 2.3000000000000003e-05, "loss": 0.6615, "step": 93 }, { "epoch": 0.026067665002773157, "grad_norm": 0.3059796392917633, "learning_rate": 2.3250000000000003e-05, "loss": 0.6677, "step": 94 }, { "epoch": 0.02634498058790904, "grad_norm": 0.2737233638763428, "learning_rate": 2.35e-05, "loss": 0.6436, "step": 95 }, { "epoch": 0.026622296173044926, "grad_norm": 0.3231774866580963, "learning_rate": 2.375e-05, "loss": 0.6735, "step": 96 }, { "epoch": 0.02689961175818081, "grad_norm": 0.28404903411865234, "learning_rate": 2.4e-05, "loss": 0.6688, "step": 97 }, { "epoch": 0.027176927343316695, "grad_norm": 0.3392276465892792, "learning_rate": 2.425e-05, "loss": 0.684, "step": 98 }, { "epoch": 0.02745424292845258, "grad_norm": 0.3021562099456787, "learning_rate": 2.45e-05, "loss": 0.6886, "step": 99 }, { "epoch": 0.027731558513588463, "grad_norm": 0.2843964099884033, "learning_rate": 2.4750000000000002e-05, "loss": 0.6358, "step": 100 }, { "epoch": 0.028008874098724348, "grad_norm": 0.3120342195034027, "learning_rate": 2.5e-05, "loss": 0.669, "step": 101 }, { "epoch": 0.028286189683860232, "grad_norm": 0.2752966284751892, "learning_rate": 2.499999878045941e-05, "loss": 0.6231, "step": 102 }, { "epoch": 0.028563505268996116, "grad_norm": 0.28501492738723755, "learning_rate": 2.4999995121837877e-05, "loss": 0.6205, "step": 103 }, { "epoch": 0.028840820854132, "grad_norm": 0.2797880172729492, "learning_rate": 2.4999989024136113e-05, "loss": 0.6697, "step": 104 }, { "epoch": 0.029118136439267885, "grad_norm": 0.2929855287075043, "learning_rate": 2.4999980487355314e-05, "loss": 0.642, "step": 105 }, { "epoch": 0.029395452024403773, "grad_norm": 0.264911025762558, "learning_rate": 2.4999969511497135e-05, "loss": 0.6575, "step": 106 }, { "epoch": 0.029672767609539658, "grad_norm": 0.2797522246837616, "learning_rate": 2.4999956096563725e-05, "loss": 0.6566, "step": 107 }, { "epoch": 0.029950083194675542, "grad_norm": 0.24777652323246002, "learning_rate": 2.49999402425577e-05, "loss": 0.6288, "step": 108 }, { "epoch": 0.030227398779811426, "grad_norm": 0.34936287999153137, "learning_rate": 2.4999921949482157e-05, "loss": 0.6506, "step": 109 }, { "epoch": 0.03050471436494731, "grad_norm": 0.30599579215049744, "learning_rate": 2.499990121734066e-05, "loss": 0.6554, "step": 110 }, { "epoch": 0.030782029950083195, "grad_norm": 0.29751792550086975, "learning_rate": 2.499987804613726e-05, "loss": 0.6698, "step": 111 }, { "epoch": 0.03105934553521908, "grad_norm": 0.2642778754234314, "learning_rate": 2.4999852435876473e-05, "loss": 0.6337, "step": 112 }, { "epoch": 0.031336661120354964, "grad_norm": 0.2584931552410126, "learning_rate": 2.49998243865633e-05, "loss": 0.661, "step": 113 }, { "epoch": 0.03161397670549085, "grad_norm": 0.266797810792923, "learning_rate": 2.4999793898203212e-05, "loss": 0.6368, "step": 114 }, { "epoch": 0.03189129229062673, "grad_norm": 0.35552042722702026, "learning_rate": 2.4999760970802155e-05, "loss": 0.6364, "step": 115 }, { "epoch": 0.03216860787576262, "grad_norm": 0.28450194001197815, "learning_rate": 2.4999725604366562e-05, "loss": 0.661, "step": 116 }, { "epoch": 0.0324459234608985, "grad_norm": 0.3352636694908142, "learning_rate": 2.4999687798903327e-05, "loss": 0.6439, "step": 117 }, { "epoch": 0.032723239046034386, "grad_norm": 0.2475953847169876, "learning_rate": 2.499964755441983e-05, "loss": 0.6344, "step": 118 }, { "epoch": 0.03300055463117027, "grad_norm": 0.30431386828422546, "learning_rate": 2.4999604870923926e-05, "loss": 0.6459, "step": 119 }, { "epoch": 0.033277870216306155, "grad_norm": 0.2653152644634247, "learning_rate": 2.499955974842394e-05, "loss": 0.6425, "step": 120 }, { "epoch": 0.03355518580144204, "grad_norm": 0.29490575194358826, "learning_rate": 2.4999512186928675e-05, "loss": 0.6427, "step": 121 }, { "epoch": 0.03383250138657792, "grad_norm": 0.2630308270454407, "learning_rate": 2.4999462186447415e-05, "loss": 0.6597, "step": 122 }, { "epoch": 0.03410981697171381, "grad_norm": 0.26787513494491577, "learning_rate": 2.4999409746989914e-05, "loss": 0.6622, "step": 123 }, { "epoch": 0.03438713255684969, "grad_norm": 0.25667890906333923, "learning_rate": 2.499935486856641e-05, "loss": 0.6335, "step": 124 }, { "epoch": 0.03466444814198558, "grad_norm": 0.26751402020454407, "learning_rate": 2.4999297551187603e-05, "loss": 0.6358, "step": 125 }, { "epoch": 0.03494176372712146, "grad_norm": 0.2815951108932495, "learning_rate": 2.4999237794864683e-05, "loss": 0.6615, "step": 126 }, { "epoch": 0.03521907931225735, "grad_norm": 0.2573346793651581, "learning_rate": 2.499917559960931e-05, "loss": 0.6463, "step": 127 }, { "epoch": 0.03549639489739324, "grad_norm": 0.26202693581581116, "learning_rate": 2.4999110965433615e-05, "loss": 0.6436, "step": 128 }, { "epoch": 0.03577371048252912, "grad_norm": 0.267046719789505, "learning_rate": 2.4999043892350213e-05, "loss": 0.6433, "step": 129 }, { "epoch": 0.036051026067665005, "grad_norm": 0.2713761329650879, "learning_rate": 2.499897438037219e-05, "loss": 0.6314, "step": 130 }, { "epoch": 0.03632834165280089, "grad_norm": 0.2704955041408539, "learning_rate": 2.4998902429513115e-05, "loss": 0.6471, "step": 131 }, { "epoch": 0.036605657237936774, "grad_norm": 0.25811654329299927, "learning_rate": 2.4998828039787027e-05, "loss": 0.6346, "step": 132 }, { "epoch": 0.03688297282307266, "grad_norm": 0.2768125832080841, "learning_rate": 2.4998751211208432e-05, "loss": 0.6327, "step": 133 }, { "epoch": 0.03716028840820854, "grad_norm": 0.29026105999946594, "learning_rate": 2.499867194379233e-05, "loss": 0.6632, "step": 134 }, { "epoch": 0.03743760399334443, "grad_norm": 0.26648250222206116, "learning_rate": 2.4998590237554182e-05, "loss": 0.6414, "step": 135 }, { "epoch": 0.03771491957848031, "grad_norm": 0.2578074336051941, "learning_rate": 2.4998506092509938e-05, "loss": 0.6459, "step": 136 }, { "epoch": 0.037992235163616196, "grad_norm": 0.2555679678916931, "learning_rate": 2.4998419508676014e-05, "loss": 0.6561, "step": 137 }, { "epoch": 0.03826955074875208, "grad_norm": 0.25471994280815125, "learning_rate": 2.4998330486069304e-05, "loss": 0.6616, "step": 138 }, { "epoch": 0.038546866333887965, "grad_norm": 0.2434554398059845, "learning_rate": 2.4998239024707183e-05, "loss": 0.6423, "step": 139 }, { "epoch": 0.03882418191902385, "grad_norm": 0.23697395622730255, "learning_rate": 2.4998145124607485e-05, "loss": 0.629, "step": 140 }, { "epoch": 0.039101497504159734, "grad_norm": 0.2652537226676941, "learning_rate": 2.4998048785788547e-05, "loss": 0.6558, "step": 141 }, { "epoch": 0.03937881308929562, "grad_norm": 0.2602185606956482, "learning_rate": 2.499795000826916e-05, "loss": 0.6427, "step": 142 }, { "epoch": 0.0396561286744315, "grad_norm": 0.23875969648361206, "learning_rate": 2.49978487920686e-05, "loss": 0.6097, "step": 143 }, { "epoch": 0.03993344425956739, "grad_norm": 0.2549594044685364, "learning_rate": 2.4997745137206618e-05, "loss": 0.6477, "step": 144 }, { "epoch": 0.04021075984470327, "grad_norm": 0.2528778910636902, "learning_rate": 2.4997639043703437e-05, "loss": 0.6028, "step": 145 }, { "epoch": 0.040488075429839156, "grad_norm": 0.252888023853302, "learning_rate": 2.499753051157976e-05, "loss": 0.6547, "step": 146 }, { "epoch": 0.04076539101497504, "grad_norm": 0.289661705493927, "learning_rate": 2.4997419540856762e-05, "loss": 0.6604, "step": 147 }, { "epoch": 0.041042706600110924, "grad_norm": 0.27772676944732666, "learning_rate": 2.49973061315561e-05, "loss": 0.6321, "step": 148 }, { "epoch": 0.04132002218524681, "grad_norm": 0.29435357451438904, "learning_rate": 2.4997190283699904e-05, "loss": 0.6539, "step": 149 }, { "epoch": 0.04159733777038269, "grad_norm": 0.2796315848827362, "learning_rate": 2.4997071997310774e-05, "loss": 0.6816, "step": 150 }, { "epoch": 0.04187465335551858, "grad_norm": 0.2854909598827362, "learning_rate": 2.4996951272411794e-05, "loss": 0.621, "step": 151 }, { "epoch": 0.04215196894065446, "grad_norm": 0.3513517677783966, "learning_rate": 2.499682810902652e-05, "loss": 0.6462, "step": 152 }, { "epoch": 0.042429284525790346, "grad_norm": 0.2492416799068451, "learning_rate": 2.4996702507178988e-05, "loss": 0.6455, "step": 153 }, { "epoch": 0.04270660011092623, "grad_norm": 0.2352532148361206, "learning_rate": 2.49965744668937e-05, "loss": 0.6168, "step": 154 }, { "epoch": 0.04298391569606212, "grad_norm": 0.25850751996040344, "learning_rate": 2.4996443988195644e-05, "loss": 0.6452, "step": 155 }, { "epoch": 0.04326123128119801, "grad_norm": 0.23972827196121216, "learning_rate": 2.499631107111028e-05, "loss": 0.6447, "step": 156 }, { "epoch": 0.04353854686633389, "grad_norm": 0.24847468733787537, "learning_rate": 2.499617571566354e-05, "loss": 0.6386, "step": 157 }, { "epoch": 0.043815862451469775, "grad_norm": 0.25739696621894836, "learning_rate": 2.4996037921881837e-05, "loss": 0.6417, "step": 158 }, { "epoch": 0.04409317803660566, "grad_norm": 0.23640736937522888, "learning_rate": 2.4995897689792062e-05, "loss": 0.6451, "step": 159 }, { "epoch": 0.044370493621741544, "grad_norm": 0.25362861156463623, "learning_rate": 2.4995755019421577e-05, "loss": 0.6525, "step": 160 }, { "epoch": 0.04464780920687743, "grad_norm": 0.2607216536998749, "learning_rate": 2.4995609910798214e-05, "loss": 0.6276, "step": 161 }, { "epoch": 0.04492512479201331, "grad_norm": 0.2426438182592392, "learning_rate": 2.4995462363950295e-05, "loss": 0.6375, "step": 162 }, { "epoch": 0.0452024403771492, "grad_norm": 0.35536178946495056, "learning_rate": 2.499531237890661e-05, "loss": 0.6502, "step": 163 }, { "epoch": 0.04547975596228508, "grad_norm": 0.2616370618343353, "learning_rate": 2.4995159955696417e-05, "loss": 0.6422, "step": 164 }, { "epoch": 0.045757071547420966, "grad_norm": 0.2493521124124527, "learning_rate": 2.4995005094349473e-05, "loss": 0.6314, "step": 165 }, { "epoch": 0.04603438713255685, "grad_norm": 0.25228554010391235, "learning_rate": 2.4994847794895977e-05, "loss": 0.6154, "step": 166 }, { "epoch": 0.046311702717692735, "grad_norm": 0.24656261503696442, "learning_rate": 2.4994688057366635e-05, "loss": 0.6241, "step": 167 }, { "epoch": 0.04658901830282862, "grad_norm": 0.27083900570869446, "learning_rate": 2.4994525881792612e-05, "loss": 0.627, "step": 168 }, { "epoch": 0.046866333887964504, "grad_norm": 0.24816080927848816, "learning_rate": 2.499436126820555e-05, "loss": 0.6237, "step": 169 }, { "epoch": 0.04714364947310039, "grad_norm": 0.2589535415172577, "learning_rate": 2.499419421663758e-05, "loss": 0.6139, "step": 170 }, { "epoch": 0.04742096505823627, "grad_norm": 0.2533140182495117, "learning_rate": 2.499402472712129e-05, "loss": 0.6533, "step": 171 }, { "epoch": 0.04769828064337216, "grad_norm": 0.23976951837539673, "learning_rate": 2.499385279968975e-05, "loss": 0.645, "step": 172 }, { "epoch": 0.04797559622850804, "grad_norm": 0.24593569338321686, "learning_rate": 2.4993678434376507e-05, "loss": 0.5958, "step": 173 }, { "epoch": 0.048252911813643926, "grad_norm": 0.283794641494751, "learning_rate": 2.4993501631215593e-05, "loss": 0.6565, "step": 174 }, { "epoch": 0.04853022739877981, "grad_norm": 0.23555535078048706, "learning_rate": 2.4993322390241496e-05, "loss": 0.6077, "step": 175 }, { "epoch": 0.048807542983915694, "grad_norm": 0.24843829870224, "learning_rate": 2.4993140711489203e-05, "loss": 0.6001, "step": 176 }, { "epoch": 0.04908485856905158, "grad_norm": 0.24111098051071167, "learning_rate": 2.4992956594994156e-05, "loss": 0.6445, "step": 177 }, { "epoch": 0.04936217415418746, "grad_norm": 0.2340569943189621, "learning_rate": 2.499277004079228e-05, "loss": 0.6174, "step": 178 }, { "epoch": 0.04963948973932335, "grad_norm": 0.26766470074653625, "learning_rate": 2.499258104891998e-05, "loss": 0.6167, "step": 179 }, { "epoch": 0.04991680532445923, "grad_norm": 0.2567934989929199, "learning_rate": 2.499238961941413e-05, "loss": 0.6394, "step": 180 }, { "epoch": 0.050194120909595116, "grad_norm": 0.27323460578918457, "learning_rate": 2.4992195752312093e-05, "loss": 0.6337, "step": 181 }, { "epoch": 0.050471436494731, "grad_norm": 0.3052992522716522, "learning_rate": 2.4991999447651686e-05, "loss": 0.6389, "step": 182 }, { "epoch": 0.050748752079866885, "grad_norm": 0.24447570741176605, "learning_rate": 2.4991800705471218e-05, "loss": 0.6165, "step": 183 }, { "epoch": 0.051026067665002776, "grad_norm": 0.319871187210083, "learning_rate": 2.499159952580947e-05, "loss": 0.5816, "step": 184 }, { "epoch": 0.05130338325013866, "grad_norm": 0.2554628551006317, "learning_rate": 2.4991395908705693e-05, "loss": 0.646, "step": 185 }, { "epoch": 0.051580698835274545, "grad_norm": 0.2671261727809906, "learning_rate": 2.499118985419962e-05, "loss": 0.6592, "step": 186 }, { "epoch": 0.05185801442041043, "grad_norm": 0.2524307370185852, "learning_rate": 2.4990981362331462e-05, "loss": 0.6178, "step": 187 }, { "epoch": 0.052135330005546314, "grad_norm": 0.2809012532234192, "learning_rate": 2.4990770433141898e-05, "loss": 0.6234, "step": 188 }, { "epoch": 0.0524126455906822, "grad_norm": 0.23483595252037048, "learning_rate": 2.499055706667208e-05, "loss": 0.6209, "step": 189 }, { "epoch": 0.05268996117581808, "grad_norm": 0.2775763273239136, "learning_rate": 2.4990341262963654e-05, "loss": 0.618, "step": 190 }, { "epoch": 0.05296727676095397, "grad_norm": 0.27081090211868286, "learning_rate": 2.499012302205872e-05, "loss": 0.6377, "step": 191 }, { "epoch": 0.05324459234608985, "grad_norm": 0.24989834427833557, "learning_rate": 2.4989902343999865e-05, "loss": 0.6179, "step": 192 }, { "epoch": 0.053521907931225736, "grad_norm": 0.27669793367385864, "learning_rate": 2.498967922883015e-05, "loss": 0.6298, "step": 193 }, { "epoch": 0.05379922351636162, "grad_norm": 0.2672564387321472, "learning_rate": 2.4989453676593106e-05, "loss": 0.6536, "step": 194 }, { "epoch": 0.054076539101497505, "grad_norm": 0.24099047482013702, "learning_rate": 2.4989225687332752e-05, "loss": 0.61, "step": 195 }, { "epoch": 0.05435385468663339, "grad_norm": 0.235582634806633, "learning_rate": 2.4988995261093566e-05, "loss": 0.654, "step": 196 }, { "epoch": 0.054631170271769273, "grad_norm": 0.2795652747154236, "learning_rate": 2.4988762397920517e-05, "loss": 0.6224, "step": 197 }, { "epoch": 0.05490848585690516, "grad_norm": 0.22800379991531372, "learning_rate": 2.4988527097859045e-05, "loss": 0.6186, "step": 198 }, { "epoch": 0.05518580144204104, "grad_norm": 0.24810528755187988, "learning_rate": 2.4988289360955053e-05, "loss": 0.6286, "step": 199 }, { "epoch": 0.05546311702717693, "grad_norm": 0.21688294410705566, "learning_rate": 2.4988049187254935e-05, "loss": 0.598, "step": 200 }, { "epoch": 0.05574043261231281, "grad_norm": 0.23709554970264435, "learning_rate": 2.4987806576805562e-05, "loss": 0.6598, "step": 201 }, { "epoch": 0.056017748197448695, "grad_norm": 0.24982847273349762, "learning_rate": 2.4987561529654263e-05, "loss": 0.6342, "step": 202 }, { "epoch": 0.05629506378258458, "grad_norm": 0.2215258628129959, "learning_rate": 2.498731404584886e-05, "loss": 0.6159, "step": 203 }, { "epoch": 0.056572379367720464, "grad_norm": 0.25177863240242004, "learning_rate": 2.4987064125437643e-05, "loss": 0.6289, "step": 204 }, { "epoch": 0.05684969495285635, "grad_norm": 0.24716275930404663, "learning_rate": 2.498681176846937e-05, "loss": 0.628, "step": 205 }, { "epoch": 0.05712701053799223, "grad_norm": 0.2888506054878235, "learning_rate": 2.49865569749933e-05, "loss": 0.6216, "step": 206 }, { "epoch": 0.05740432612312812, "grad_norm": 0.24658267199993134, "learning_rate": 2.4986299745059127e-05, "loss": 0.6132, "step": 207 }, { "epoch": 0.057681641708264, "grad_norm": 0.24297240376472473, "learning_rate": 2.4986040078717063e-05, "loss": 0.6201, "step": 208 }, { "epoch": 0.057958957293399886, "grad_norm": 0.2425074279308319, "learning_rate": 2.4985777976017767e-05, "loss": 0.5997, "step": 209 }, { "epoch": 0.05823627287853577, "grad_norm": 0.25336262583732605, "learning_rate": 2.498551343701238e-05, "loss": 0.6489, "step": 210 }, { "epoch": 0.058513588463671655, "grad_norm": 0.23498830199241638, "learning_rate": 2.498524646175253e-05, "loss": 0.6341, "step": 211 }, { "epoch": 0.058790904048807546, "grad_norm": 0.24754488468170166, "learning_rate": 2.49849770502903e-05, "loss": 0.6538, "step": 212 }, { "epoch": 0.05906821963394343, "grad_norm": 0.24818097054958344, "learning_rate": 2.4984705202678266e-05, "loss": 0.6098, "step": 213 }, { "epoch": 0.059345535219079315, "grad_norm": 0.22981123626232147, "learning_rate": 2.498443091896947e-05, "loss": 0.6072, "step": 214 }, { "epoch": 0.0596228508042152, "grad_norm": 0.2612292766571045, "learning_rate": 2.4984154199217434e-05, "loss": 0.626, "step": 215 }, { "epoch": 0.059900166389351084, "grad_norm": 0.26491644978523254, "learning_rate": 2.4983875043476153e-05, "loss": 0.6495, "step": 216 }, { "epoch": 0.06017748197448697, "grad_norm": 0.22399267554283142, "learning_rate": 2.4983593451800096e-05, "loss": 0.6341, "step": 217 }, { "epoch": 0.06045479755962285, "grad_norm": 0.2318061739206314, "learning_rate": 2.498330942424421e-05, "loss": 0.5787, "step": 218 }, { "epoch": 0.06073211314475874, "grad_norm": 0.2700578272342682, "learning_rate": 2.498302296086392e-05, "loss": 0.6314, "step": 219 }, { "epoch": 0.06100942872989462, "grad_norm": 0.22675910592079163, "learning_rate": 2.4982734061715112e-05, "loss": 0.5714, "step": 220 }, { "epoch": 0.061286744315030506, "grad_norm": 0.2522087097167969, "learning_rate": 2.4982442726854173e-05, "loss": 0.6053, "step": 221 }, { "epoch": 0.06156405990016639, "grad_norm": 0.22665978968143463, "learning_rate": 2.4982148956337935e-05, "loss": 0.6156, "step": 222 }, { "epoch": 0.061841375485302275, "grad_norm": 0.24832209944725037, "learning_rate": 2.4981852750223726e-05, "loss": 0.6406, "step": 223 }, { "epoch": 0.06211869107043816, "grad_norm": 0.2526067793369293, "learning_rate": 2.498155410856935e-05, "loss": 0.6312, "step": 224 }, { "epoch": 0.06239600665557404, "grad_norm": 0.24257095158100128, "learning_rate": 2.4981253031433076e-05, "loss": 0.6456, "step": 225 }, { "epoch": 0.06267332224070993, "grad_norm": 0.25497207045555115, "learning_rate": 2.4980949518873648e-05, "loss": 0.6047, "step": 226 }, { "epoch": 0.06295063782584581, "grad_norm": 0.23874424397945404, "learning_rate": 2.498064357095029e-05, "loss": 0.614, "step": 227 }, { "epoch": 0.0632279534109817, "grad_norm": 0.24398040771484375, "learning_rate": 2.498033518772271e-05, "loss": 0.6212, "step": 228 }, { "epoch": 0.06350526899611758, "grad_norm": 0.27126333117485046, "learning_rate": 2.498002436925107e-05, "loss": 0.6051, "step": 229 }, { "epoch": 0.06378258458125347, "grad_norm": 0.22852414846420288, "learning_rate": 2.497971111559602e-05, "loss": 0.6133, "step": 230 }, { "epoch": 0.06405990016638935, "grad_norm": 0.22752775251865387, "learning_rate": 2.4979395426818696e-05, "loss": 0.5893, "step": 231 }, { "epoch": 0.06433721575152523, "grad_norm": 0.27361559867858887, "learning_rate": 2.4979077302980683e-05, "loss": 0.6431, "step": 232 }, { "epoch": 0.06461453133666112, "grad_norm": 0.3071225881576538, "learning_rate": 2.497875674414406e-05, "loss": 0.617, "step": 233 }, { "epoch": 0.064891846921797, "grad_norm": 0.3025614023208618, "learning_rate": 2.4978433750371382e-05, "loss": 0.6294, "step": 234 }, { "epoch": 0.06516916250693289, "grad_norm": 0.21824614703655243, "learning_rate": 2.4978108321725667e-05, "loss": 0.6189, "step": 235 }, { "epoch": 0.06544647809206877, "grad_norm": 0.23781217634677887, "learning_rate": 2.497778045827042e-05, "loss": 0.6115, "step": 236 }, { "epoch": 0.06572379367720466, "grad_norm": 0.3532952666282654, "learning_rate": 2.497745016006961e-05, "loss": 0.6091, "step": 237 }, { "epoch": 0.06600110926234054, "grad_norm": 0.26994985342025757, "learning_rate": 2.4977117427187692e-05, "loss": 0.6078, "step": 238 }, { "epoch": 0.06627842484747642, "grad_norm": 0.2395590841770172, "learning_rate": 2.4976782259689587e-05, "loss": 0.6437, "step": 239 }, { "epoch": 0.06655574043261231, "grad_norm": 0.2347521334886551, "learning_rate": 2.49764446576407e-05, "loss": 0.6015, "step": 240 }, { "epoch": 0.0668330560177482, "grad_norm": 0.2603704333305359, "learning_rate": 2.49761046211069e-05, "loss": 0.6441, "step": 241 }, { "epoch": 0.06711037160288408, "grad_norm": 0.21548427641391754, "learning_rate": 2.4975762150154542e-05, "loss": 0.6059, "step": 242 }, { "epoch": 0.06738768718801996, "grad_norm": 0.2270384132862091, "learning_rate": 2.497541724485045e-05, "loss": 0.6148, "step": 243 }, { "epoch": 0.06766500277315585, "grad_norm": 0.23716577887535095, "learning_rate": 2.497506990526192e-05, "loss": 0.6135, "step": 244 }, { "epoch": 0.06794231835829173, "grad_norm": 0.24414962530136108, "learning_rate": 2.4974720131456736e-05, "loss": 0.6363, "step": 245 }, { "epoch": 0.06821963394342762, "grad_norm": 0.24200287461280823, "learning_rate": 2.497436792350314e-05, "loss": 0.6278, "step": 246 }, { "epoch": 0.0684969495285635, "grad_norm": 0.25273025035858154, "learning_rate": 2.497401328146986e-05, "loss": 0.6012, "step": 247 }, { "epoch": 0.06877426511369938, "grad_norm": 0.2444678694009781, "learning_rate": 2.4973656205426094e-05, "loss": 0.6218, "step": 248 }, { "epoch": 0.06905158069883527, "grad_norm": 0.25649040937423706, "learning_rate": 2.4973296695441523e-05, "loss": 0.6678, "step": 249 }, { "epoch": 0.06932889628397115, "grad_norm": 0.2729082703590393, "learning_rate": 2.4972934751586292e-05, "loss": 0.6018, "step": 250 }, { "epoch": 0.06960621186910704, "grad_norm": 0.23950397968292236, "learning_rate": 2.4972570373931026e-05, "loss": 0.6342, "step": 251 }, { "epoch": 0.06988352745424292, "grad_norm": 0.27791211009025574, "learning_rate": 2.4972203562546825e-05, "loss": 0.5948, "step": 252 }, { "epoch": 0.0701608430393788, "grad_norm": 0.24157491326332092, "learning_rate": 2.4971834317505266e-05, "loss": 0.6346, "step": 253 }, { "epoch": 0.0704381586245147, "grad_norm": 0.23233160376548767, "learning_rate": 2.4971462638878394e-05, "loss": 0.6023, "step": 254 }, { "epoch": 0.07071547420965059, "grad_norm": 0.22705158591270447, "learning_rate": 2.4971088526738737e-05, "loss": 0.6314, "step": 255 }, { "epoch": 0.07099278979478647, "grad_norm": 0.2364932894706726, "learning_rate": 2.4970711981159294e-05, "loss": 0.6239, "step": 256 }, { "epoch": 0.07127010537992236, "grad_norm": 0.2369173765182495, "learning_rate": 2.4970333002213535e-05, "loss": 0.6056, "step": 257 }, { "epoch": 0.07154742096505824, "grad_norm": 0.23944415152072906, "learning_rate": 2.4969951589975415e-05, "loss": 0.6188, "step": 258 }, { "epoch": 0.07182473655019413, "grad_norm": 0.24431641399860382, "learning_rate": 2.4969567744519357e-05, "loss": 0.6393, "step": 259 }, { "epoch": 0.07210205213533001, "grad_norm": 0.23058977723121643, "learning_rate": 2.4969181465920254e-05, "loss": 0.623, "step": 260 }, { "epoch": 0.0723793677204659, "grad_norm": 0.2396584004163742, "learning_rate": 2.4968792754253483e-05, "loss": 0.6085, "step": 261 }, { "epoch": 0.07265668330560178, "grad_norm": 0.28117048740386963, "learning_rate": 2.496840160959489e-05, "loss": 0.612, "step": 262 }, { "epoch": 0.07293399889073766, "grad_norm": 0.22773273289203644, "learning_rate": 2.49680080320208e-05, "loss": 0.6215, "step": 263 }, { "epoch": 0.07321131447587355, "grad_norm": 0.25395745038986206, "learning_rate": 2.496761202160801e-05, "loss": 0.6197, "step": 264 }, { "epoch": 0.07348863006100943, "grad_norm": 0.2748405337333679, "learning_rate": 2.496721357843379e-05, "loss": 0.5983, "step": 265 }, { "epoch": 0.07376594564614532, "grad_norm": 0.37056779861450195, "learning_rate": 2.496681270257589e-05, "loss": 0.5974, "step": 266 }, { "epoch": 0.0740432612312812, "grad_norm": 0.2506537139415741, "learning_rate": 2.4966409394112528e-05, "loss": 0.6279, "step": 267 }, { "epoch": 0.07432057681641709, "grad_norm": 0.3248004615306854, "learning_rate": 2.4966003653122406e-05, "loss": 0.5968, "step": 268 }, { "epoch": 0.07459789240155297, "grad_norm": 0.27633965015411377, "learning_rate": 2.4965595479684685e-05, "loss": 0.6207, "step": 269 }, { "epoch": 0.07487520798668885, "grad_norm": 0.2533873915672302, "learning_rate": 2.4965184873879015e-05, "loss": 0.6428, "step": 270 }, { "epoch": 0.07515252357182474, "grad_norm": 0.24860017001628876, "learning_rate": 2.496477183578552e-05, "loss": 0.5717, "step": 271 }, { "epoch": 0.07542983915696062, "grad_norm": 0.2541423738002777, "learning_rate": 2.4964356365484797e-05, "loss": 0.6331, "step": 272 }, { "epoch": 0.07570715474209651, "grad_norm": 0.21915870904922485, "learning_rate": 2.4963938463057907e-05, "loss": 0.6167, "step": 273 }, { "epoch": 0.07598447032723239, "grad_norm": 0.21635891497135162, "learning_rate": 2.4963518128586393e-05, "loss": 0.602, "step": 274 }, { "epoch": 0.07626178591236828, "grad_norm": 0.23918819427490234, "learning_rate": 2.4963095362152282e-05, "loss": 0.5869, "step": 275 }, { "epoch": 0.07653910149750416, "grad_norm": 0.22294320166110992, "learning_rate": 2.496267016383806e-05, "loss": 0.6256, "step": 276 }, { "epoch": 0.07681641708264005, "grad_norm": 0.2167348712682724, "learning_rate": 2.49622425337267e-05, "loss": 0.5936, "step": 277 }, { "epoch": 0.07709373266777593, "grad_norm": 0.21333451569080353, "learning_rate": 2.496181247190164e-05, "loss": 0.6221, "step": 278 }, { "epoch": 0.07737104825291181, "grad_norm": 0.22917450964450836, "learning_rate": 2.4961379978446793e-05, "loss": 0.6132, "step": 279 }, { "epoch": 0.0776483638380477, "grad_norm": 0.228413388133049, "learning_rate": 2.496094505344656e-05, "loss": 0.5953, "step": 280 }, { "epoch": 0.07792567942318358, "grad_norm": 0.20908214151859283, "learning_rate": 2.4960507696985796e-05, "loss": 0.6081, "step": 281 }, { "epoch": 0.07820299500831947, "grad_norm": 0.2270585596561432, "learning_rate": 2.4960067909149846e-05, "loss": 0.5915, "step": 282 }, { "epoch": 0.07848031059345535, "grad_norm": 0.23176094889640808, "learning_rate": 2.4959625690024524e-05, "loss": 0.6126, "step": 283 }, { "epoch": 0.07875762617859124, "grad_norm": 0.24496515095233917, "learning_rate": 2.495918103969612e-05, "loss": 0.6245, "step": 284 }, { "epoch": 0.07903494176372712, "grad_norm": 0.21696555614471436, "learning_rate": 2.4958733958251394e-05, "loss": 0.6104, "step": 285 }, { "epoch": 0.079312257348863, "grad_norm": 0.22292231023311615, "learning_rate": 2.4958284445777584e-05, "loss": 0.6164, "step": 286 }, { "epoch": 0.07958957293399889, "grad_norm": 0.24104639887809753, "learning_rate": 2.4957832502362404e-05, "loss": 0.6002, "step": 287 }, { "epoch": 0.07986688851913477, "grad_norm": 0.22866299748420715, "learning_rate": 2.495737812809404e-05, "loss": 0.6143, "step": 288 }, { "epoch": 0.08014420410427066, "grad_norm": 0.23108075559139252, "learning_rate": 2.495692132306115e-05, "loss": 0.6591, "step": 289 }, { "epoch": 0.08042151968940654, "grad_norm": 0.24755387008190155, "learning_rate": 2.4956462087352868e-05, "loss": 0.5883, "step": 290 }, { "epoch": 0.08069883527454243, "grad_norm": 0.23270832002162933, "learning_rate": 2.4956000421058807e-05, "loss": 0.5727, "step": 291 }, { "epoch": 0.08097615085967831, "grad_norm": 0.23588238656520844, "learning_rate": 2.4955536324269048e-05, "loss": 0.6178, "step": 292 }, { "epoch": 0.0812534664448142, "grad_norm": 0.26772478222846985, "learning_rate": 2.4955069797074147e-05, "loss": 0.6214, "step": 293 }, { "epoch": 0.08153078202995008, "grad_norm": 0.22470323741436005, "learning_rate": 2.495460083956514e-05, "loss": 0.61, "step": 294 }, { "epoch": 0.08180809761508596, "grad_norm": 0.22901985049247742, "learning_rate": 2.495412945183353e-05, "loss": 0.5965, "step": 295 }, { "epoch": 0.08208541320022185, "grad_norm": 0.23927484452724457, "learning_rate": 2.49536556339713e-05, "loss": 0.6229, "step": 296 }, { "epoch": 0.08236272878535773, "grad_norm": 0.2505173087120056, "learning_rate": 2.49531793860709e-05, "loss": 0.5974, "step": 297 }, { "epoch": 0.08264004437049362, "grad_norm": 0.7846159934997559, "learning_rate": 2.4952700708225263e-05, "loss": 0.6097, "step": 298 }, { "epoch": 0.0829173599556295, "grad_norm": 0.23284678161144257, "learning_rate": 2.4952219600527786e-05, "loss": 0.6161, "step": 299 }, { "epoch": 0.08319467554076539, "grad_norm": 0.22659331560134888, "learning_rate": 2.4951736063072356e-05, "loss": 0.5917, "step": 300 }, { "epoch": 0.08347199112590127, "grad_norm": 0.24401767551898956, "learning_rate": 2.4951250095953315e-05, "loss": 0.6163, "step": 301 }, { "epoch": 0.08374930671103716, "grad_norm": 0.23994800448417664, "learning_rate": 2.4950761699265487e-05, "loss": 0.6035, "step": 302 }, { "epoch": 0.08402662229617304, "grad_norm": 0.293527752161026, "learning_rate": 2.495027087310418e-05, "loss": 0.6148, "step": 303 }, { "epoch": 0.08430393788130892, "grad_norm": 0.2797812819480896, "learning_rate": 2.4949777617565156e-05, "loss": 0.6249, "step": 304 }, { "epoch": 0.08458125346644481, "grad_norm": 0.2422715574502945, "learning_rate": 2.4949281932744672e-05, "loss": 0.6064, "step": 305 }, { "epoch": 0.08485856905158069, "grad_norm": 0.2489105761051178, "learning_rate": 2.4948783818739446e-05, "loss": 0.6176, "step": 306 }, { "epoch": 0.08513588463671658, "grad_norm": 0.23189565539360046, "learning_rate": 2.4948283275646672e-05, "loss": 0.6172, "step": 307 }, { "epoch": 0.08541320022185246, "grad_norm": 0.21257632970809937, "learning_rate": 2.4947780303564015e-05, "loss": 0.6132, "step": 308 }, { "epoch": 0.08569051580698835, "grad_norm": 0.23266074061393738, "learning_rate": 2.4947274902589628e-05, "loss": 0.6001, "step": 309 }, { "epoch": 0.08596783139212424, "grad_norm": 0.21587929129600525, "learning_rate": 2.4946767072822126e-05, "loss": 0.6381, "step": 310 }, { "epoch": 0.08624514697726013, "grad_norm": 0.23052595555782318, "learning_rate": 2.4946256814360594e-05, "loss": 0.6643, "step": 311 }, { "epoch": 0.08652246256239601, "grad_norm": 0.2822146713733673, "learning_rate": 2.4945744127304598e-05, "loss": 0.6331, "step": 312 }, { "epoch": 0.0867997781475319, "grad_norm": 0.22692646086215973, "learning_rate": 2.4945229011754184e-05, "loss": 0.6126, "step": 313 }, { "epoch": 0.08707709373266778, "grad_norm": 0.2250347137451172, "learning_rate": 2.4944711467809855e-05, "loss": 0.6308, "step": 314 }, { "epoch": 0.08735440931780367, "grad_norm": 0.21644283831119537, "learning_rate": 2.4944191495572604e-05, "loss": 0.587, "step": 315 }, { "epoch": 0.08763172490293955, "grad_norm": 0.22959665954113007, "learning_rate": 2.494366909514389e-05, "loss": 0.6138, "step": 316 }, { "epoch": 0.08790904048807544, "grad_norm": 0.24681390821933746, "learning_rate": 2.4943144266625645e-05, "loss": 0.6309, "step": 317 }, { "epoch": 0.08818635607321132, "grad_norm": 0.22859139740467072, "learning_rate": 2.4942617010120282e-05, "loss": 0.5937, "step": 318 }, { "epoch": 0.0884636716583472, "grad_norm": 0.20714016258716583, "learning_rate": 2.4942087325730678e-05, "loss": 0.5925, "step": 319 }, { "epoch": 0.08874098724348309, "grad_norm": 0.2056405246257782, "learning_rate": 2.494155521356019e-05, "loss": 0.5922, "step": 320 }, { "epoch": 0.08901830282861897, "grad_norm": 0.22429367899894714, "learning_rate": 2.4941020673712644e-05, "loss": 0.6141, "step": 321 }, { "epoch": 0.08929561841375486, "grad_norm": 0.2454768568277359, "learning_rate": 2.494048370629235e-05, "loss": 0.6221, "step": 322 }, { "epoch": 0.08957293399889074, "grad_norm": 0.21887235343456268, "learning_rate": 2.493994431140408e-05, "loss": 0.6249, "step": 323 }, { "epoch": 0.08985024958402663, "grad_norm": 0.23439091444015503, "learning_rate": 2.493940248915308e-05, "loss": 0.6145, "step": 324 }, { "epoch": 0.09012756516916251, "grad_norm": 0.21770575642585754, "learning_rate": 2.4938858239645087e-05, "loss": 0.6123, "step": 325 }, { "epoch": 0.0904048807542984, "grad_norm": 0.24734006822109222, "learning_rate": 2.4938311562986284e-05, "loss": 0.6223, "step": 326 }, { "epoch": 0.09068219633943428, "grad_norm": 0.22917009890079498, "learning_rate": 2.4937762459283348e-05, "loss": 0.6041, "step": 327 }, { "epoch": 0.09095951192457016, "grad_norm": 0.22535157203674316, "learning_rate": 2.4937210928643423e-05, "loss": 0.6449, "step": 328 }, { "epoch": 0.09123682750970605, "grad_norm": 0.21863703429698944, "learning_rate": 2.4936656971174134e-05, "loss": 0.6144, "step": 329 }, { "epoch": 0.09151414309484193, "grad_norm": 0.24071593582630157, "learning_rate": 2.4936100586983563e-05, "loss": 0.6391, "step": 330 }, { "epoch": 0.09179145867997782, "grad_norm": 0.21045182645320892, "learning_rate": 2.4935541776180275e-05, "loss": 0.613, "step": 331 }, { "epoch": 0.0920687742651137, "grad_norm": 0.250699520111084, "learning_rate": 2.493498053887332e-05, "loss": 0.6155, "step": 332 }, { "epoch": 0.09234608985024959, "grad_norm": 0.22076334059238434, "learning_rate": 2.4934416875172202e-05, "loss": 0.6184, "step": 333 }, { "epoch": 0.09262340543538547, "grad_norm": 0.22932595014572144, "learning_rate": 2.4933850785186906e-05, "loss": 0.6234, "step": 334 }, { "epoch": 0.09290072102052135, "grad_norm": 0.2126377820968628, "learning_rate": 2.4933282269027898e-05, "loss": 0.5768, "step": 335 }, { "epoch": 0.09317803660565724, "grad_norm": 0.21872107684612274, "learning_rate": 2.49327113268061e-05, "loss": 0.5847, "step": 336 }, { "epoch": 0.09345535219079312, "grad_norm": 0.22751103341579437, "learning_rate": 2.4932137958632922e-05, "loss": 0.6241, "step": 337 }, { "epoch": 0.09373266777592901, "grad_norm": 0.24364197254180908, "learning_rate": 2.493156216462025e-05, "loss": 0.5956, "step": 338 }, { "epoch": 0.09400998336106489, "grad_norm": 0.2077159285545349, "learning_rate": 2.493098394488043e-05, "loss": 0.5911, "step": 339 }, { "epoch": 0.09428729894620078, "grad_norm": 0.24238905310630798, "learning_rate": 2.4930403299526292e-05, "loss": 0.629, "step": 340 }, { "epoch": 0.09456461453133666, "grad_norm": 0.22944410145282745, "learning_rate": 2.492982022867113e-05, "loss": 0.596, "step": 341 }, { "epoch": 0.09484193011647254, "grad_norm": 0.22006259858608246, "learning_rate": 2.492923473242872e-05, "loss": 0.581, "step": 342 }, { "epoch": 0.09511924570160843, "grad_norm": 0.2297179251909256, "learning_rate": 2.4928646810913307e-05, "loss": 0.6107, "step": 343 }, { "epoch": 0.09539656128674431, "grad_norm": 0.21393971145153046, "learning_rate": 2.4928056464239614e-05, "loss": 0.5773, "step": 344 }, { "epoch": 0.0956738768718802, "grad_norm": 0.23898737132549286, "learning_rate": 2.4927463692522825e-05, "loss": 0.6119, "step": 345 }, { "epoch": 0.09595119245701608, "grad_norm": 0.22290126979351044, "learning_rate": 2.4926868495878613e-05, "loss": 0.5721, "step": 346 }, { "epoch": 0.09622850804215197, "grad_norm": 0.23102609813213348, "learning_rate": 2.4926270874423113e-05, "loss": 0.5735, "step": 347 }, { "epoch": 0.09650582362728785, "grad_norm": 0.22276602685451508, "learning_rate": 2.4925670828272935e-05, "loss": 0.5799, "step": 348 }, { "epoch": 0.09678313921242374, "grad_norm": 0.229088693857193, "learning_rate": 2.492506835754517e-05, "loss": 0.6191, "step": 349 }, { "epoch": 0.09706045479755962, "grad_norm": 0.22365529835224152, "learning_rate": 2.4924463462357373e-05, "loss": 0.5932, "step": 350 }, { "epoch": 0.0973377703826955, "grad_norm": 0.21552829444408417, "learning_rate": 2.492385614282757e-05, "loss": 0.6091, "step": 351 }, { "epoch": 0.09761508596783139, "grad_norm": 0.2322327196598053, "learning_rate": 2.4923246399074272e-05, "loss": 0.6216, "step": 352 }, { "epoch": 0.09789240155296727, "grad_norm": 0.27425798773765564, "learning_rate": 2.4922634231216458e-05, "loss": 0.5915, "step": 353 }, { "epoch": 0.09816971713810316, "grad_norm": 0.21547527611255646, "learning_rate": 2.492201963937357e-05, "loss": 0.6003, "step": 354 }, { "epoch": 0.09844703272323904, "grad_norm": 0.24001803994178772, "learning_rate": 2.4921402623665535e-05, "loss": 0.5879, "step": 355 }, { "epoch": 0.09872434830837493, "grad_norm": 0.2128361016511917, "learning_rate": 2.492078318421275e-05, "loss": 0.6192, "step": 356 }, { "epoch": 0.09900166389351081, "grad_norm": 0.23891720175743103, "learning_rate": 2.492016132113608e-05, "loss": 0.6237, "step": 357 }, { "epoch": 0.0992789794786467, "grad_norm": 0.2530137300491333, "learning_rate": 2.4919537034556876e-05, "loss": 0.5975, "step": 358 }, { "epoch": 0.09955629506378258, "grad_norm": 0.22864577174186707, "learning_rate": 2.4918910324596944e-05, "loss": 0.6085, "step": 359 }, { "epoch": 0.09983361064891846, "grad_norm": 0.21646267175674438, "learning_rate": 2.4918281191378573e-05, "loss": 0.5734, "step": 360 }, { "epoch": 0.10011092623405435, "grad_norm": 0.21921531856060028, "learning_rate": 2.491764963502453e-05, "loss": 0.6003, "step": 361 }, { "epoch": 0.10038824181919023, "grad_norm": 0.22741259634494781, "learning_rate": 2.491701565565804e-05, "loss": 0.59, "step": 362 }, { "epoch": 0.10066555740432612, "grad_norm": 0.2382003366947174, "learning_rate": 2.4916379253402815e-05, "loss": 0.6021, "step": 363 }, { "epoch": 0.100942872989462, "grad_norm": 0.20885150134563446, "learning_rate": 2.4915740428383032e-05, "loss": 0.5973, "step": 364 }, { "epoch": 0.10122018857459789, "grad_norm": 0.1941784769296646, "learning_rate": 2.491509918072334e-05, "loss": 0.591, "step": 365 }, { "epoch": 0.10149750415973377, "grad_norm": 0.21724678575992584, "learning_rate": 2.491445551054887e-05, "loss": 0.6212, "step": 366 }, { "epoch": 0.10177481974486967, "grad_norm": 0.29596778750419617, "learning_rate": 2.4913809417985213e-05, "loss": 0.6241, "step": 367 }, { "epoch": 0.10205213533000555, "grad_norm": 0.23365665972232819, "learning_rate": 2.4913160903158443e-05, "loss": 0.6243, "step": 368 }, { "epoch": 0.10232945091514144, "grad_norm": 0.23263859748840332, "learning_rate": 2.4912509966195098e-05, "loss": 0.5946, "step": 369 }, { "epoch": 0.10260676650027732, "grad_norm": 0.22337226569652557, "learning_rate": 2.4911856607222196e-05, "loss": 0.6287, "step": 370 }, { "epoch": 0.1028840820854132, "grad_norm": 0.23225417733192444, "learning_rate": 2.491120082636722e-05, "loss": 0.5927, "step": 371 }, { "epoch": 0.10316139767054909, "grad_norm": 0.22292552888393402, "learning_rate": 2.4910542623758142e-05, "loss": 0.6208, "step": 372 }, { "epoch": 0.10343871325568497, "grad_norm": 0.21180188655853271, "learning_rate": 2.4909881999523382e-05, "loss": 0.5652, "step": 373 }, { "epoch": 0.10371602884082086, "grad_norm": 0.2395281195640564, "learning_rate": 2.4909218953791853e-05, "loss": 0.5922, "step": 374 }, { "epoch": 0.10399334442595674, "grad_norm": 0.2313883900642395, "learning_rate": 2.4908553486692926e-05, "loss": 0.6083, "step": 375 }, { "epoch": 0.10427066001109263, "grad_norm": 0.21677231788635254, "learning_rate": 2.4907885598356456e-05, "loss": 0.6115, "step": 376 }, { "epoch": 0.10454797559622851, "grad_norm": 0.21811628341674805, "learning_rate": 2.4907215288912766e-05, "loss": 0.5815, "step": 377 }, { "epoch": 0.1048252911813644, "grad_norm": 0.22422359883785248, "learning_rate": 2.4906542558492652e-05, "loss": 0.6161, "step": 378 }, { "epoch": 0.10510260676650028, "grad_norm": 0.2190743088722229, "learning_rate": 2.4905867407227377e-05, "loss": 0.5554, "step": 379 }, { "epoch": 0.10537992235163617, "grad_norm": 0.25590968132019043, "learning_rate": 2.490518983524869e-05, "loss": 0.5856, "step": 380 }, { "epoch": 0.10565723793677205, "grad_norm": 0.26324909925460815, "learning_rate": 2.490450984268879e-05, "loss": 0.6057, "step": 381 }, { "epoch": 0.10593455352190793, "grad_norm": 0.2394174039363861, "learning_rate": 2.490382742968037e-05, "loss": 0.6045, "step": 382 }, { "epoch": 0.10621186910704382, "grad_norm": 0.23230458796024323, "learning_rate": 2.4903142596356586e-05, "loss": 0.6188, "step": 383 }, { "epoch": 0.1064891846921797, "grad_norm": 0.21763205528259277, "learning_rate": 2.4902455342851067e-05, "loss": 0.5626, "step": 384 }, { "epoch": 0.10676650027731559, "grad_norm": 0.469051718711853, "learning_rate": 2.490176566929791e-05, "loss": 0.5909, "step": 385 }, { "epoch": 0.10704381586245147, "grad_norm": 0.24806742370128632, "learning_rate": 2.4901073575831697e-05, "loss": 0.6215, "step": 386 }, { "epoch": 0.10732113144758736, "grad_norm": 0.22851231694221497, "learning_rate": 2.4900379062587463e-05, "loss": 0.593, "step": 387 }, { "epoch": 0.10759844703272324, "grad_norm": 0.24515169858932495, "learning_rate": 2.489968212970074e-05, "loss": 0.6036, "step": 388 }, { "epoch": 0.10787576261785913, "grad_norm": 0.24662603437900543, "learning_rate": 2.4898982777307506e-05, "loss": 0.6153, "step": 389 }, { "epoch": 0.10815307820299501, "grad_norm": 0.2459113895893097, "learning_rate": 2.4898281005544227e-05, "loss": 0.5771, "step": 390 }, { "epoch": 0.1084303937881309, "grad_norm": 0.23075874149799347, "learning_rate": 2.489757681454784e-05, "loss": 0.6297, "step": 391 }, { "epoch": 0.10870770937326678, "grad_norm": 0.24344393610954285, "learning_rate": 2.4896870204455746e-05, "loss": 0.5993, "step": 392 }, { "epoch": 0.10898502495840266, "grad_norm": 0.2444470226764679, "learning_rate": 2.4896161175405826e-05, "loss": 0.6159, "step": 393 }, { "epoch": 0.10926234054353855, "grad_norm": 0.24199549853801727, "learning_rate": 2.4895449727536435e-05, "loss": 0.6177, "step": 394 }, { "epoch": 0.10953965612867443, "grad_norm": 0.20678602159023285, "learning_rate": 2.4894735860986385e-05, "loss": 0.5894, "step": 395 }, { "epoch": 0.10981697171381032, "grad_norm": 0.25881609320640564, "learning_rate": 2.489401957589498e-05, "loss": 0.631, "step": 396 }, { "epoch": 0.1100942872989462, "grad_norm": 0.2568078637123108, "learning_rate": 2.489330087240198e-05, "loss": 0.5902, "step": 397 }, { "epoch": 0.11037160288408208, "grad_norm": 0.2495458871126175, "learning_rate": 2.489257975064763e-05, "loss": 0.6141, "step": 398 }, { "epoch": 0.11064891846921797, "grad_norm": 0.6050971150398254, "learning_rate": 2.489185621077263e-05, "loss": 0.597, "step": 399 }, { "epoch": 0.11092623405435385, "grad_norm": 0.22337263822555542, "learning_rate": 2.489113025291817e-05, "loss": 0.58, "step": 400 }, { "epoch": 0.11120354963948974, "grad_norm": 0.20583049952983856, "learning_rate": 2.4890401877225898e-05, "loss": 0.5751, "step": 401 }, { "epoch": 0.11148086522462562, "grad_norm": 0.2487124800682068, "learning_rate": 2.488967108383795e-05, "loss": 0.6009, "step": 402 }, { "epoch": 0.1117581808097615, "grad_norm": 0.24986512959003448, "learning_rate": 2.4888937872896908e-05, "loss": 0.6203, "step": 403 }, { "epoch": 0.11203549639489739, "grad_norm": 0.28655165433883667, "learning_rate": 2.488820224454585e-05, "loss": 0.6037, "step": 404 }, { "epoch": 0.11231281198003328, "grad_norm": 0.24651272594928741, "learning_rate": 2.4887464198928317e-05, "loss": 0.5853, "step": 405 }, { "epoch": 0.11259012756516916, "grad_norm": 0.1938582807779312, "learning_rate": 2.4886723736188318e-05, "loss": 0.5888, "step": 406 }, { "epoch": 0.11286744315030504, "grad_norm": 0.22223535180091858, "learning_rate": 2.4885980856470338e-05, "loss": 0.627, "step": 407 }, { "epoch": 0.11314475873544093, "grad_norm": 0.24378454685211182, "learning_rate": 2.4885235559919328e-05, "loss": 0.5827, "step": 408 }, { "epoch": 0.11342207432057681, "grad_norm": 0.2019236534833908, "learning_rate": 2.4884487846680727e-05, "loss": 0.5976, "step": 409 }, { "epoch": 0.1136993899057127, "grad_norm": 0.21661922335624695, "learning_rate": 2.4883737716900424e-05, "loss": 0.6013, "step": 410 }, { "epoch": 0.11397670549084858, "grad_norm": 0.26957792043685913, "learning_rate": 2.4882985170724787e-05, "loss": 0.63, "step": 411 }, { "epoch": 0.11425402107598447, "grad_norm": 0.21899108588695526, "learning_rate": 2.4882230208300668e-05, "loss": 0.5935, "step": 412 }, { "epoch": 0.11453133666112035, "grad_norm": 0.2505897879600525, "learning_rate": 2.488147282977537e-05, "loss": 0.5689, "step": 413 }, { "epoch": 0.11480865224625623, "grad_norm": 0.20966675877571106, "learning_rate": 2.4880713035296686e-05, "loss": 0.5893, "step": 414 }, { "epoch": 0.11508596783139212, "grad_norm": 0.26599064469337463, "learning_rate": 2.4879950825012864e-05, "loss": 0.5912, "step": 415 }, { "epoch": 0.115363283416528, "grad_norm": 0.22095918655395508, "learning_rate": 2.487918619907264e-05, "loss": 0.6068, "step": 416 }, { "epoch": 0.11564059900166389, "grad_norm": 0.20822377502918243, "learning_rate": 2.4878419157625206e-05, "loss": 0.5783, "step": 417 }, { "epoch": 0.11591791458679977, "grad_norm": 0.20983396470546722, "learning_rate": 2.4877649700820232e-05, "loss": 0.6258, "step": 418 }, { "epoch": 0.11619523017193566, "grad_norm": 0.2288864701986313, "learning_rate": 2.4876877828807864e-05, "loss": 0.6196, "step": 419 }, { "epoch": 0.11647254575707154, "grad_norm": 0.20762163400650024, "learning_rate": 2.4876103541738714e-05, "loss": 0.5674, "step": 420 }, { "epoch": 0.11674986134220743, "grad_norm": 0.2152256816625595, "learning_rate": 2.4875326839763863e-05, "loss": 0.5681, "step": 421 }, { "epoch": 0.11702717692734331, "grad_norm": 0.25224751234054565, "learning_rate": 2.4874547723034865e-05, "loss": 0.5948, "step": 422 }, { "epoch": 0.11730449251247921, "grad_norm": 0.21316662430763245, "learning_rate": 2.4873766191703752e-05, "loss": 0.5757, "step": 423 }, { "epoch": 0.11758180809761509, "grad_norm": 0.20757247507572174, "learning_rate": 2.4872982245923014e-05, "loss": 0.5903, "step": 424 }, { "epoch": 0.11785912368275098, "grad_norm": 0.23846663534641266, "learning_rate": 2.487219588584563e-05, "loss": 0.5735, "step": 425 }, { "epoch": 0.11813643926788686, "grad_norm": 0.21389099955558777, "learning_rate": 2.4871407111625027e-05, "loss": 0.5998, "step": 426 }, { "epoch": 0.11841375485302275, "grad_norm": 0.21840502321720123, "learning_rate": 2.487061592341513e-05, "loss": 0.5854, "step": 427 }, { "epoch": 0.11869107043815863, "grad_norm": 0.23358672857284546, "learning_rate": 2.4869822321370308e-05, "loss": 0.6212, "step": 428 }, { "epoch": 0.11896838602329451, "grad_norm": 0.24467387795448303, "learning_rate": 2.4869026305645418e-05, "loss": 0.5937, "step": 429 }, { "epoch": 0.1192457016084304, "grad_norm": 0.24679329991340637, "learning_rate": 2.486822787639579e-05, "loss": 0.6027, "step": 430 }, { "epoch": 0.11952301719356628, "grad_norm": 0.22588002681732178, "learning_rate": 2.4867427033777206e-05, "loss": 0.5707, "step": 431 }, { "epoch": 0.11980033277870217, "grad_norm": 0.20728443562984467, "learning_rate": 2.486662377794594e-05, "loss": 0.5857, "step": 432 }, { "epoch": 0.12007764836383805, "grad_norm": 0.2292574942111969, "learning_rate": 2.4865818109058732e-05, "loss": 0.6288, "step": 433 }, { "epoch": 0.12035496394897394, "grad_norm": 0.22358085215091705, "learning_rate": 2.4865010027272784e-05, "loss": 0.6043, "step": 434 }, { "epoch": 0.12063227953410982, "grad_norm": 0.21650134027004242, "learning_rate": 2.4864199532745776e-05, "loss": 0.5772, "step": 435 }, { "epoch": 0.1209095951192457, "grad_norm": 0.21783700585365295, "learning_rate": 2.486338662563585e-05, "loss": 0.608, "step": 436 }, { "epoch": 0.12118691070438159, "grad_norm": 0.2252453863620758, "learning_rate": 2.4862571306101633e-05, "loss": 0.5783, "step": 437 }, { "epoch": 0.12146422628951747, "grad_norm": 0.22224466502666473, "learning_rate": 2.4861753574302217e-05, "loss": 0.5823, "step": 438 }, { "epoch": 0.12174154187465336, "grad_norm": 0.24375957250595093, "learning_rate": 2.486093343039716e-05, "loss": 0.5872, "step": 439 }, { "epoch": 0.12201885745978924, "grad_norm": 0.20903299748897552, "learning_rate": 2.4860110874546495e-05, "loss": 0.6237, "step": 440 }, { "epoch": 0.12229617304492513, "grad_norm": 0.23007185757160187, "learning_rate": 2.485928590691072e-05, "loss": 0.6188, "step": 441 }, { "epoch": 0.12257348863006101, "grad_norm": 0.23085376620292664, "learning_rate": 2.4858458527650814e-05, "loss": 0.5693, "step": 442 }, { "epoch": 0.1228508042151969, "grad_norm": 0.2241743952035904, "learning_rate": 2.485762873692822e-05, "loss": 0.6294, "step": 443 }, { "epoch": 0.12312811980033278, "grad_norm": 0.20904746651649475, "learning_rate": 2.4856796534904845e-05, "loss": 0.6301, "step": 444 }, { "epoch": 0.12340543538546866, "grad_norm": 0.6742352843284607, "learning_rate": 2.4855961921743083e-05, "loss": 0.5524, "step": 445 }, { "epoch": 0.12368275097060455, "grad_norm": 0.20682546496391296, "learning_rate": 2.4855124897605782e-05, "loss": 0.5907, "step": 446 }, { "epoch": 0.12396006655574043, "grad_norm": 0.2383589744567871, "learning_rate": 2.485428546265627e-05, "loss": 0.5865, "step": 447 }, { "epoch": 0.12423738214087632, "grad_norm": 0.2051754891872406, "learning_rate": 2.4853443617058348e-05, "loss": 0.6112, "step": 448 }, { "epoch": 0.1245146977260122, "grad_norm": 0.2156454175710678, "learning_rate": 2.4852599360976274e-05, "loss": 0.5913, "step": 449 }, { "epoch": 0.12479201331114809, "grad_norm": 0.22987020015716553, "learning_rate": 2.485175269457479e-05, "loss": 0.5873, "step": 450 }, { "epoch": 0.12506932889628397, "grad_norm": 0.20809032022953033, "learning_rate": 2.4850903618019102e-05, "loss": 0.582, "step": 451 }, { "epoch": 0.12534664448141986, "grad_norm": 0.2254360467195511, "learning_rate": 2.485005213147489e-05, "loss": 0.5998, "step": 452 }, { "epoch": 0.12562396006655574, "grad_norm": 0.214163139462471, "learning_rate": 2.4849198235108296e-05, "loss": 0.5884, "step": 453 }, { "epoch": 0.12590127565169162, "grad_norm": 0.21463198959827423, "learning_rate": 2.484834192908594e-05, "loss": 0.606, "step": 454 }, { "epoch": 0.1261785912368275, "grad_norm": 0.20102332532405853, "learning_rate": 2.4847483213574908e-05, "loss": 0.6012, "step": 455 }, { "epoch": 0.1264559068219634, "grad_norm": 0.19328515231609344, "learning_rate": 2.4846622088742765e-05, "loss": 0.5749, "step": 456 }, { "epoch": 0.12673322240709928, "grad_norm": 0.20251993834972382, "learning_rate": 2.484575855475753e-05, "loss": 0.6121, "step": 457 }, { "epoch": 0.12701053799223516, "grad_norm": 0.21547801792621613, "learning_rate": 2.484489261178771e-05, "loss": 0.6019, "step": 458 }, { "epoch": 0.12728785357737105, "grad_norm": 0.21968044340610504, "learning_rate": 2.4844024260002276e-05, "loss": 0.5863, "step": 459 }, { "epoch": 0.12756516916250693, "grad_norm": 0.21164929866790771, "learning_rate": 2.4843153499570648e-05, "loss": 0.5995, "step": 460 }, { "epoch": 0.12784248474764282, "grad_norm": 0.2152341902256012, "learning_rate": 2.4842280330662753e-05, "loss": 0.6374, "step": 461 }, { "epoch": 0.1281198003327787, "grad_norm": 0.19914227724075317, "learning_rate": 2.4841404753448963e-05, "loss": 0.5919, "step": 462 }, { "epoch": 0.12839711591791458, "grad_norm": 0.2268274873495102, "learning_rate": 2.4840526768100124e-05, "loss": 0.5913, "step": 463 }, { "epoch": 0.12867443150305047, "grad_norm": 0.21451812982559204, "learning_rate": 2.483964637478756e-05, "loss": 0.6146, "step": 464 }, { "epoch": 0.12895174708818635, "grad_norm": 0.1978655308485031, "learning_rate": 2.483876357368305e-05, "loss": 0.5938, "step": 465 }, { "epoch": 0.12922906267332224, "grad_norm": 0.20545656979084015, "learning_rate": 2.4837878364958865e-05, "loss": 0.6172, "step": 466 }, { "epoch": 0.12950637825845812, "grad_norm": 0.21529193222522736, "learning_rate": 2.483699074878772e-05, "loss": 0.5794, "step": 467 }, { "epoch": 0.129783693843594, "grad_norm": 0.2971234917640686, "learning_rate": 2.4836100725342818e-05, "loss": 0.6166, "step": 468 }, { "epoch": 0.1300610094287299, "grad_norm": 0.1968923807144165, "learning_rate": 2.4835208294797824e-05, "loss": 0.5898, "step": 469 }, { "epoch": 0.13033832501386577, "grad_norm": 0.2248852252960205, "learning_rate": 2.483431345732688e-05, "loss": 0.5984, "step": 470 }, { "epoch": 0.13061564059900166, "grad_norm": 0.21942903101444244, "learning_rate": 2.4833416213104588e-05, "loss": 0.5984, "step": 471 }, { "epoch": 0.13089295618413754, "grad_norm": 0.22266723215579987, "learning_rate": 2.4832516562306024e-05, "loss": 0.5858, "step": 472 }, { "epoch": 0.13117027176927343, "grad_norm": 0.21460357308387756, "learning_rate": 2.483161450510674e-05, "loss": 0.5763, "step": 473 }, { "epoch": 0.1314475873544093, "grad_norm": 0.333474725484848, "learning_rate": 2.4830710041682735e-05, "loss": 0.6024, "step": 474 }, { "epoch": 0.1317249029395452, "grad_norm": 0.1983480155467987, "learning_rate": 2.4829803172210515e-05, "loss": 0.5898, "step": 475 }, { "epoch": 0.13200221852468108, "grad_norm": 0.2835070788860321, "learning_rate": 2.482889389686702e-05, "loss": 0.571, "step": 476 }, { "epoch": 0.13227953410981697, "grad_norm": 0.2176080197095871, "learning_rate": 2.4827982215829674e-05, "loss": 0.5875, "step": 477 }, { "epoch": 0.13255684969495285, "grad_norm": 0.2436138391494751, "learning_rate": 2.482706812927638e-05, "loss": 0.5965, "step": 478 }, { "epoch": 0.13283416528008873, "grad_norm": 0.21060815453529358, "learning_rate": 2.4826151637385495e-05, "loss": 0.5881, "step": 479 }, { "epoch": 0.13311148086522462, "grad_norm": 0.49135246872901917, "learning_rate": 2.4825232740335847e-05, "loss": 0.5742, "step": 480 }, { "epoch": 0.1333887964503605, "grad_norm": 0.20535485446453094, "learning_rate": 2.4824311438306742e-05, "loss": 0.5877, "step": 481 }, { "epoch": 0.1336661120354964, "grad_norm": 0.20854201912879944, "learning_rate": 2.482338773147795e-05, "loss": 0.6065, "step": 482 }, { "epoch": 0.13394342762063227, "grad_norm": 0.20914287865161896, "learning_rate": 2.4822461620029708e-05, "loss": 0.5919, "step": 483 }, { "epoch": 0.13422074320576816, "grad_norm": 0.20028036832809448, "learning_rate": 2.4821533104142724e-05, "loss": 0.5707, "step": 484 }, { "epoch": 0.13449805879090404, "grad_norm": 0.22616969048976898, "learning_rate": 2.4820602183998185e-05, "loss": 0.5896, "step": 485 }, { "epoch": 0.13477537437603992, "grad_norm": 0.2049257457256317, "learning_rate": 2.4819668859777728e-05, "loss": 0.5693, "step": 486 }, { "epoch": 0.1350526899611758, "grad_norm": 0.21746453642845154, "learning_rate": 2.4818733131663473e-05, "loss": 0.6177, "step": 487 }, { "epoch": 0.1353300055463117, "grad_norm": 0.20084752142429352, "learning_rate": 2.4817794999838004e-05, "loss": 0.5871, "step": 488 }, { "epoch": 0.13560732113144758, "grad_norm": 0.2062511444091797, "learning_rate": 2.4816854464484378e-05, "loss": 0.5975, "step": 489 }, { "epoch": 0.13588463671658346, "grad_norm": 0.2201562523841858, "learning_rate": 2.4815911525786118e-05, "loss": 0.5683, "step": 490 }, { "epoch": 0.13616195230171935, "grad_norm": 0.22616079449653625, "learning_rate": 2.4814966183927213e-05, "loss": 0.6306, "step": 491 }, { "epoch": 0.13643926788685523, "grad_norm": 0.21003180742263794, "learning_rate": 2.4814018439092128e-05, "loss": 0.6064, "step": 492 }, { "epoch": 0.13671658347199112, "grad_norm": 0.2046622782945633, "learning_rate": 2.481306829146579e-05, "loss": 0.6107, "step": 493 }, { "epoch": 0.136993899057127, "grad_norm": 0.2102370411157608, "learning_rate": 2.4812115741233606e-05, "loss": 0.596, "step": 494 }, { "epoch": 0.13727121464226288, "grad_norm": 0.20774902403354645, "learning_rate": 2.4811160788581434e-05, "loss": 0.6111, "step": 495 }, { "epoch": 0.13754853022739877, "grad_norm": 0.20868700742721558, "learning_rate": 2.481020343369561e-05, "loss": 0.604, "step": 496 }, { "epoch": 0.13782584581253465, "grad_norm": 0.20590144395828247, "learning_rate": 2.4809243676762947e-05, "loss": 0.606, "step": 497 }, { "epoch": 0.13810316139767054, "grad_norm": 0.2019280344247818, "learning_rate": 2.4808281517970716e-05, "loss": 0.6034, "step": 498 }, { "epoch": 0.13838047698280642, "grad_norm": 0.22689440846443176, "learning_rate": 2.4807316957506656e-05, "loss": 0.5715, "step": 499 }, { "epoch": 0.1386577925679423, "grad_norm": 0.2134653776884079, "learning_rate": 2.4806349995558986e-05, "loss": 0.6184, "step": 500 }, { "epoch": 0.1389351081530782, "grad_norm": 0.20334339141845703, "learning_rate": 2.4805380632316377e-05, "loss": 0.5804, "step": 501 }, { "epoch": 0.13921242373821408, "grad_norm": 0.20713390409946442, "learning_rate": 2.4804408867967984e-05, "loss": 0.5898, "step": 502 }, { "epoch": 0.13948973932334996, "grad_norm": 0.21584905683994293, "learning_rate": 2.4803434702703422e-05, "loss": 0.5957, "step": 503 }, { "epoch": 0.13976705490848584, "grad_norm": 0.21197180449962616, "learning_rate": 2.4802458136712775e-05, "loss": 0.5981, "step": 504 }, { "epoch": 0.14004437049362173, "grad_norm": 0.19864031672477722, "learning_rate": 2.4801479170186597e-05, "loss": 0.6027, "step": 505 }, { "epoch": 0.1403216860787576, "grad_norm": 0.21110500395298004, "learning_rate": 2.4800497803315913e-05, "loss": 0.5882, "step": 506 }, { "epoch": 0.1405990016638935, "grad_norm": 0.20834285020828247, "learning_rate": 2.4799514036292215e-05, "loss": 0.5935, "step": 507 }, { "epoch": 0.1408763172490294, "grad_norm": 0.22122903168201447, "learning_rate": 2.4798527869307454e-05, "loss": 0.6011, "step": 508 }, { "epoch": 0.1411536328341653, "grad_norm": 0.21510954201221466, "learning_rate": 2.4797539302554064e-05, "loss": 0.6266, "step": 509 }, { "epoch": 0.14143094841930118, "grad_norm": 0.20589859783649445, "learning_rate": 2.479654833622494e-05, "loss": 0.5858, "step": 510 }, { "epoch": 0.14170826400443706, "grad_norm": 0.20928624272346497, "learning_rate": 2.4795554970513445e-05, "loss": 0.6006, "step": 511 }, { "epoch": 0.14198557958957295, "grad_norm": 0.2174837589263916, "learning_rate": 2.4794559205613412e-05, "loss": 0.5792, "step": 512 }, { "epoch": 0.14226289517470883, "grad_norm": 0.20877033472061157, "learning_rate": 2.4793561041719137e-05, "loss": 0.5662, "step": 513 }, { "epoch": 0.14254021075984472, "grad_norm": 0.240639790892601, "learning_rate": 2.479256047902539e-05, "loss": 0.5824, "step": 514 }, { "epoch": 0.1428175263449806, "grad_norm": 0.21567635238170624, "learning_rate": 2.479155751772741e-05, "loss": 0.5833, "step": 515 }, { "epoch": 0.14309484193011648, "grad_norm": 0.2284121960401535, "learning_rate": 2.4790552158020896e-05, "loss": 0.6057, "step": 516 }, { "epoch": 0.14337215751525237, "grad_norm": 0.19480617344379425, "learning_rate": 2.478954440010203e-05, "loss": 0.5972, "step": 517 }, { "epoch": 0.14364947310038825, "grad_norm": 0.20838883519172668, "learning_rate": 2.4788534244167443e-05, "loss": 0.6373, "step": 518 }, { "epoch": 0.14392678868552414, "grad_norm": 0.21365465223789215, "learning_rate": 2.4787521690414245e-05, "loss": 0.5796, "step": 519 }, { "epoch": 0.14420410427066002, "grad_norm": 2.2805471420288086, "learning_rate": 2.4786506739040018e-05, "loss": 0.5915, "step": 520 }, { "epoch": 0.1444814198557959, "grad_norm": 0.34635624289512634, "learning_rate": 2.47854893902428e-05, "loss": 0.6325, "step": 521 }, { "epoch": 0.1447587354409318, "grad_norm": 0.39266762137413025, "learning_rate": 2.47844696442211e-05, "loss": 0.5756, "step": 522 }, { "epoch": 0.14503605102606767, "grad_norm": 0.31766456365585327, "learning_rate": 2.4783447501173907e-05, "loss": 0.5703, "step": 523 }, { "epoch": 0.14531336661120356, "grad_norm": 0.24752533435821533, "learning_rate": 2.478242296130066e-05, "loss": 0.5878, "step": 524 }, { "epoch": 0.14559068219633944, "grad_norm": 0.24595655500888824, "learning_rate": 2.4781396024801272e-05, "loss": 0.5819, "step": 525 }, { "epoch": 0.14586799778147533, "grad_norm": 0.2457636296749115, "learning_rate": 2.478036669187614e-05, "loss": 0.599, "step": 526 }, { "epoch": 0.1461453133666112, "grad_norm": 0.244289368391037, "learning_rate": 2.4779334962726096e-05, "loss": 0.5922, "step": 527 }, { "epoch": 0.1464226289517471, "grad_norm": 0.23528233170509338, "learning_rate": 2.477830083755247e-05, "loss": 0.6032, "step": 528 }, { "epoch": 0.14669994453688298, "grad_norm": 0.2198038101196289, "learning_rate": 2.477726431655704e-05, "loss": 0.5954, "step": 529 }, { "epoch": 0.14697726012201887, "grad_norm": 0.23673711717128754, "learning_rate": 2.4776225399942066e-05, "loss": 0.5938, "step": 530 }, { "epoch": 0.14725457570715475, "grad_norm": 0.2085774540901184, "learning_rate": 2.4775184087910262e-05, "loss": 0.5856, "step": 531 }, { "epoch": 0.14753189129229063, "grad_norm": 0.21415582299232483, "learning_rate": 2.4774140380664816e-05, "loss": 0.5751, "step": 532 }, { "epoch": 0.14780920687742652, "grad_norm": 0.2082296758890152, "learning_rate": 2.4773094278409388e-05, "loss": 0.5573, "step": 533 }, { "epoch": 0.1480865224625624, "grad_norm": 0.20202411711215973, "learning_rate": 2.4772045781348093e-05, "loss": 0.5883, "step": 534 }, { "epoch": 0.1483638380476983, "grad_norm": 0.20766015350818634, "learning_rate": 2.477099488968553e-05, "loss": 0.6066, "step": 535 }, { "epoch": 0.14864115363283417, "grad_norm": 0.2137647122144699, "learning_rate": 2.4769941603626744e-05, "loss": 0.597, "step": 536 }, { "epoch": 0.14891846921797006, "grad_norm": 0.23699134588241577, "learning_rate": 2.4768885923377265e-05, "loss": 0.587, "step": 537 }, { "epoch": 0.14919578480310594, "grad_norm": 0.21466752886772156, "learning_rate": 2.4767827849143087e-05, "loss": 0.5725, "step": 538 }, { "epoch": 0.14947310038824183, "grad_norm": 0.20940807461738586, "learning_rate": 2.476676738113067e-05, "loss": 0.5807, "step": 539 }, { "epoch": 0.1497504159733777, "grad_norm": 0.22769619524478912, "learning_rate": 2.476570451954693e-05, "loss": 0.6089, "step": 540 }, { "epoch": 0.1500277315585136, "grad_norm": 0.20399393141269684, "learning_rate": 2.4764639264599266e-05, "loss": 0.5705, "step": 541 }, { "epoch": 0.15030504714364948, "grad_norm": 0.2241872102022171, "learning_rate": 2.4763571616495535e-05, "loss": 0.5731, "step": 542 }, { "epoch": 0.15058236272878536, "grad_norm": 0.20283614099025726, "learning_rate": 2.4762501575444062e-05, "loss": 0.6051, "step": 543 }, { "epoch": 0.15085967831392125, "grad_norm": 0.2145642638206482, "learning_rate": 2.4761429141653646e-05, "loss": 0.6069, "step": 544 }, { "epoch": 0.15113699389905713, "grad_norm": 0.2139946073293686, "learning_rate": 2.4760354315333546e-05, "loss": 0.6055, "step": 545 }, { "epoch": 0.15141430948419302, "grad_norm": 0.22807584702968597, "learning_rate": 2.4759277096693486e-05, "loss": 0.5945, "step": 546 }, { "epoch": 0.1516916250693289, "grad_norm": 0.2132754623889923, "learning_rate": 2.4758197485943657e-05, "loss": 0.5975, "step": 547 }, { "epoch": 0.15196894065446478, "grad_norm": 0.2016879767179489, "learning_rate": 2.4757115483294724e-05, "loss": 0.5863, "step": 548 }, { "epoch": 0.15224625623960067, "grad_norm": 0.227370485663414, "learning_rate": 2.475603108895782e-05, "loss": 0.583, "step": 549 }, { "epoch": 0.15252357182473655, "grad_norm": 0.22234570980072021, "learning_rate": 2.475494430314453e-05, "loss": 0.5962, "step": 550 }, { "epoch": 0.15280088740987244, "grad_norm": 0.20360559225082397, "learning_rate": 2.4753855126066916e-05, "loss": 0.587, "step": 551 }, { "epoch": 0.15307820299500832, "grad_norm": 0.23359502851963043, "learning_rate": 2.475276355793751e-05, "loss": 0.5967, "step": 552 }, { "epoch": 0.1533555185801442, "grad_norm": 0.23216257989406586, "learning_rate": 2.47516695989693e-05, "loss": 0.5845, "step": 553 }, { "epoch": 0.1536328341652801, "grad_norm": 0.21213343739509583, "learning_rate": 2.475057324937575e-05, "loss": 0.5821, "step": 554 }, { "epoch": 0.15391014975041598, "grad_norm": 0.21203738451004028, "learning_rate": 2.4749474509370784e-05, "loss": 0.5792, "step": 555 }, { "epoch": 0.15418746533555186, "grad_norm": 0.21234023571014404, "learning_rate": 2.4748373379168805e-05, "loss": 0.5985, "step": 556 }, { "epoch": 0.15446478092068774, "grad_norm": 0.20847538113594055, "learning_rate": 2.4747269858984658e-05, "loss": 0.595, "step": 557 }, { "epoch": 0.15474209650582363, "grad_norm": 0.20475518703460693, "learning_rate": 2.474616394903368e-05, "loss": 0.5821, "step": 558 }, { "epoch": 0.1550194120909595, "grad_norm": 0.211504727602005, "learning_rate": 2.474505564953166e-05, "loss": 0.572, "step": 559 }, { "epoch": 0.1552967276760954, "grad_norm": 0.21250484883785248, "learning_rate": 2.4743944960694854e-05, "loss": 0.5748, "step": 560 }, { "epoch": 0.15557404326123128, "grad_norm": 0.2148432582616806, "learning_rate": 2.4742831882739988e-05, "loss": 0.5881, "step": 561 }, { "epoch": 0.15585135884636717, "grad_norm": 0.19098572432994843, "learning_rate": 2.4741716415884257e-05, "loss": 0.5989, "step": 562 }, { "epoch": 0.15612867443150305, "grad_norm": 0.20260894298553467, "learning_rate": 2.474059856034531e-05, "loss": 0.567, "step": 563 }, { "epoch": 0.15640599001663893, "grad_norm": 0.21840746700763702, "learning_rate": 2.4739478316341282e-05, "loss": 0.6054, "step": 564 }, { "epoch": 0.15668330560177482, "grad_norm": 0.2050980031490326, "learning_rate": 2.473835568409075e-05, "loss": 0.5842, "step": 565 }, { "epoch": 0.1569606211869107, "grad_norm": 0.20163971185684204, "learning_rate": 2.473723066381278e-05, "loss": 0.5823, "step": 566 }, { "epoch": 0.1572379367720466, "grad_norm": 0.2088451236486435, "learning_rate": 2.473610325572689e-05, "loss": 0.5995, "step": 567 }, { "epoch": 0.15751525235718247, "grad_norm": 0.20921272039413452, "learning_rate": 2.4734973460053056e-05, "loss": 0.585, "step": 568 }, { "epoch": 0.15779256794231836, "grad_norm": 0.22330057621002197, "learning_rate": 2.473384127701175e-05, "loss": 0.5888, "step": 569 }, { "epoch": 0.15806988352745424, "grad_norm": 0.2152683287858963, "learning_rate": 2.4732706706823876e-05, "loss": 0.5942, "step": 570 }, { "epoch": 0.15834719911259013, "grad_norm": 0.20223170518875122, "learning_rate": 2.4731569749710824e-05, "loss": 0.5781, "step": 571 }, { "epoch": 0.158624514697726, "grad_norm": 0.20824022591114044, "learning_rate": 2.4730430405894446e-05, "loss": 0.6404, "step": 572 }, { "epoch": 0.1589018302828619, "grad_norm": 0.19907240569591522, "learning_rate": 2.4729288675597058e-05, "loss": 0.5983, "step": 573 }, { "epoch": 0.15917914586799778, "grad_norm": 0.20674046874046326, "learning_rate": 2.472814455904144e-05, "loss": 0.5595, "step": 574 }, { "epoch": 0.15945646145313366, "grad_norm": 0.19486385583877563, "learning_rate": 2.4726998056450833e-05, "loss": 0.5783, "step": 575 }, { "epoch": 0.15973377703826955, "grad_norm": 0.2102123498916626, "learning_rate": 2.4725849168048965e-05, "loss": 0.5809, "step": 576 }, { "epoch": 0.16001109262340543, "grad_norm": 0.21006052196025848, "learning_rate": 2.4724697894060005e-05, "loss": 0.5882, "step": 577 }, { "epoch": 0.16028840820854132, "grad_norm": 0.22287555038928986, "learning_rate": 2.47235442347086e-05, "loss": 0.6012, "step": 578 }, { "epoch": 0.1605657237936772, "grad_norm": 0.20599472522735596, "learning_rate": 2.4722388190219852e-05, "loss": 0.5971, "step": 579 }, { "epoch": 0.16084303937881309, "grad_norm": 0.21176591515541077, "learning_rate": 2.4721229760819348e-05, "loss": 0.5954, "step": 580 }, { "epoch": 0.16112035496394897, "grad_norm": 0.24732773005962372, "learning_rate": 2.4720068946733123e-05, "loss": 0.5818, "step": 581 }, { "epoch": 0.16139767054908485, "grad_norm": 0.20434054732322693, "learning_rate": 2.4718905748187677e-05, "loss": 0.5745, "step": 582 }, { "epoch": 0.16167498613422074, "grad_norm": 0.20684310793876648, "learning_rate": 2.4717740165409988e-05, "loss": 0.5663, "step": 583 }, { "epoch": 0.16195230171935662, "grad_norm": 0.2029474377632141, "learning_rate": 2.471657219862749e-05, "loss": 0.5855, "step": 584 }, { "epoch": 0.1622296173044925, "grad_norm": 0.2033785730600357, "learning_rate": 2.4715401848068086e-05, "loss": 0.6119, "step": 585 }, { "epoch": 0.1625069328896284, "grad_norm": 0.21371322870254517, "learning_rate": 2.4714229113960135e-05, "loss": 0.6022, "step": 586 }, { "epoch": 0.16278424847476428, "grad_norm": 0.20918406546115875, "learning_rate": 2.4713053996532477e-05, "loss": 0.569, "step": 587 }, { "epoch": 0.16306156405990016, "grad_norm": 0.2060522437095642, "learning_rate": 2.4711876496014407e-05, "loss": 0.5982, "step": 588 }, { "epoch": 0.16333887964503604, "grad_norm": 0.20782527327537537, "learning_rate": 2.4710696612635688e-05, "loss": 0.6015, "step": 589 }, { "epoch": 0.16361619523017193, "grad_norm": 0.20826764404773712, "learning_rate": 2.4709514346626536e-05, "loss": 0.6094, "step": 590 }, { "epoch": 0.1638935108153078, "grad_norm": 0.20720824599266052, "learning_rate": 2.4708329698217652e-05, "loss": 0.6054, "step": 591 }, { "epoch": 0.1641708264004437, "grad_norm": 0.19394385814666748, "learning_rate": 2.4707142667640193e-05, "loss": 0.5812, "step": 592 }, { "epoch": 0.16444814198557958, "grad_norm": 0.2022271454334259, "learning_rate": 2.4705953255125777e-05, "loss": 0.6084, "step": 593 }, { "epoch": 0.16472545757071547, "grad_norm": 0.21304059028625488, "learning_rate": 2.4704761460906488e-05, "loss": 0.5673, "step": 594 }, { "epoch": 0.16500277315585135, "grad_norm": 0.20137831568717957, "learning_rate": 2.470356728521488e-05, "loss": 0.5945, "step": 595 }, { "epoch": 0.16528008874098724, "grad_norm": 0.20188415050506592, "learning_rate": 2.470237072828397e-05, "loss": 0.5849, "step": 596 }, { "epoch": 0.16555740432612312, "grad_norm": 0.206806018948555, "learning_rate": 2.4701171790347233e-05, "loss": 0.5863, "step": 597 }, { "epoch": 0.165834719911259, "grad_norm": 0.2093089371919632, "learning_rate": 2.4699970471638613e-05, "loss": 0.601, "step": 598 }, { "epoch": 0.1661120354963949, "grad_norm": 0.19595085084438324, "learning_rate": 2.4698766772392524e-05, "loss": 0.5993, "step": 599 }, { "epoch": 0.16638935108153077, "grad_norm": 0.20450963079929352, "learning_rate": 2.469756069284384e-05, "loss": 0.5875, "step": 600 }, { "epoch": 0.16666666666666666, "grad_norm": 0.18902625143527985, "learning_rate": 2.4696352233227894e-05, "loss": 0.5943, "step": 601 }, { "epoch": 0.16694398225180254, "grad_norm": 0.19143831729888916, "learning_rate": 2.469514139378049e-05, "loss": 0.5925, "step": 602 }, { "epoch": 0.16722129783693843, "grad_norm": 0.20280803740024567, "learning_rate": 2.46939281747379e-05, "loss": 0.596, "step": 603 }, { "epoch": 0.1674986134220743, "grad_norm": 0.20762760937213898, "learning_rate": 2.4692712576336848e-05, "loss": 0.5951, "step": 604 }, { "epoch": 0.1677759290072102, "grad_norm": 0.209476500749588, "learning_rate": 2.4691494598814536e-05, "loss": 0.5988, "step": 605 }, { "epoch": 0.16805324459234608, "grad_norm": 0.23190903663635254, "learning_rate": 2.4690274242408617e-05, "loss": 0.5928, "step": 606 }, { "epoch": 0.16833056017748196, "grad_norm": 0.20941099524497986, "learning_rate": 2.4689051507357218e-05, "loss": 0.6001, "step": 607 }, { "epoch": 0.16860787576261785, "grad_norm": 0.20067985355854034, "learning_rate": 2.468782639389893e-05, "loss": 0.572, "step": 608 }, { "epoch": 0.16888519134775373, "grad_norm": 0.20099548995494843, "learning_rate": 2.4686598902272793e-05, "loss": 0.5603, "step": 609 }, { "epoch": 0.16916250693288962, "grad_norm": 0.20298384130001068, "learning_rate": 2.4685369032718343e-05, "loss": 0.5657, "step": 610 }, { "epoch": 0.1694398225180255, "grad_norm": 0.19231431186199188, "learning_rate": 2.4684136785475544e-05, "loss": 0.5628, "step": 611 }, { "epoch": 0.16971713810316139, "grad_norm": 0.20296776294708252, "learning_rate": 2.468290216078485e-05, "loss": 0.6011, "step": 612 }, { "epoch": 0.16999445368829727, "grad_norm": 0.19989420473575592, "learning_rate": 2.468166515888716e-05, "loss": 0.5876, "step": 613 }, { "epoch": 0.17027176927343315, "grad_norm": 0.19636170566082, "learning_rate": 2.4680425780023852e-05, "loss": 0.5852, "step": 614 }, { "epoch": 0.17054908485856904, "grad_norm": 0.21776345372200012, "learning_rate": 2.4679184024436757e-05, "loss": 0.5988, "step": 615 }, { "epoch": 0.17082640044370492, "grad_norm": 0.2017216682434082, "learning_rate": 2.4677939892368183e-05, "loss": 0.6135, "step": 616 }, { "epoch": 0.1711037160288408, "grad_norm": 0.218113973736763, "learning_rate": 2.4676693384060884e-05, "loss": 0.5727, "step": 617 }, { "epoch": 0.1713810316139767, "grad_norm": 0.2070799022912979, "learning_rate": 2.4675444499758093e-05, "loss": 0.6229, "step": 618 }, { "epoch": 0.17165834719911258, "grad_norm": 0.211971253156662, "learning_rate": 2.4674193239703496e-05, "loss": 0.5909, "step": 619 }, { "epoch": 0.1719356627842485, "grad_norm": 0.21103401482105255, "learning_rate": 2.4672939604141248e-05, "loss": 0.5805, "step": 620 }, { "epoch": 0.17221297836938437, "grad_norm": 0.22362381219863892, "learning_rate": 2.467168359331597e-05, "loss": 0.6006, "step": 621 }, { "epoch": 0.17249029395452026, "grad_norm": 0.1989423632621765, "learning_rate": 2.4670425207472737e-05, "loss": 0.5895, "step": 622 }, { "epoch": 0.17276760953965614, "grad_norm": 0.20032650232315063, "learning_rate": 2.46691644468571e-05, "loss": 0.5856, "step": 623 }, { "epoch": 0.17304492512479203, "grad_norm": 0.21420548856258392, "learning_rate": 2.466790131171506e-05, "loss": 0.5835, "step": 624 }, { "epoch": 0.1733222407099279, "grad_norm": 0.2181633710861206, "learning_rate": 2.466663580229309e-05, "loss": 0.5608, "step": 625 }, { "epoch": 0.1735995562950638, "grad_norm": 0.19224753975868225, "learning_rate": 2.4665367918838135e-05, "loss": 0.5826, "step": 626 }, { "epoch": 0.17387687188019968, "grad_norm": 0.20331954956054688, "learning_rate": 2.4664097661597576e-05, "loss": 0.5948, "step": 627 }, { "epoch": 0.17415418746533556, "grad_norm": 0.20249582827091217, "learning_rate": 2.4662825030819282e-05, "loss": 0.5894, "step": 628 }, { "epoch": 0.17443150305047145, "grad_norm": 0.19642974436283112, "learning_rate": 2.466155002675158e-05, "loss": 0.5938, "step": 629 }, { "epoch": 0.17470881863560733, "grad_norm": 0.20837126672267914, "learning_rate": 2.466027264964325e-05, "loss": 0.589, "step": 630 }, { "epoch": 0.17498613422074322, "grad_norm": 0.1986762434244156, "learning_rate": 2.465899289974355e-05, "loss": 0.5677, "step": 631 }, { "epoch": 0.1752634498058791, "grad_norm": 0.47144341468811035, "learning_rate": 2.4657710777302183e-05, "loss": 0.6075, "step": 632 }, { "epoch": 0.17554076539101499, "grad_norm": 0.2005637288093567, "learning_rate": 2.465642628256934e-05, "loss": 0.5863, "step": 633 }, { "epoch": 0.17581808097615087, "grad_norm": 0.2160159796476364, "learning_rate": 2.465513941579564e-05, "loss": 0.5661, "step": 634 }, { "epoch": 0.17609539656128675, "grad_norm": 0.22309495508670807, "learning_rate": 2.4653850177232203e-05, "loss": 0.6029, "step": 635 }, { "epoch": 0.17637271214642264, "grad_norm": 0.20880426466464996, "learning_rate": 2.4652558567130585e-05, "loss": 0.6039, "step": 636 }, { "epoch": 0.17665002773155852, "grad_norm": 0.20371706783771515, "learning_rate": 2.4651264585742813e-05, "loss": 0.5974, "step": 637 }, { "epoch": 0.1769273433166944, "grad_norm": 0.19604718685150146, "learning_rate": 2.464996823332138e-05, "loss": 0.6012, "step": 638 }, { "epoch": 0.1772046589018303, "grad_norm": 0.2000109851360321, "learning_rate": 2.4648669510119235e-05, "loss": 0.6038, "step": 639 }, { "epoch": 0.17748197448696618, "grad_norm": 0.19588269293308258, "learning_rate": 2.46473684163898e-05, "loss": 0.5949, "step": 640 }, { "epoch": 0.17775929007210206, "grad_norm": 0.2040427029132843, "learning_rate": 2.4646064952386945e-05, "loss": 0.5616, "step": 641 }, { "epoch": 0.17803660565723795, "grad_norm": 0.2059299200773239, "learning_rate": 2.4644759118365014e-05, "loss": 0.5785, "step": 642 }, { "epoch": 0.17831392124237383, "grad_norm": 0.1959904134273529, "learning_rate": 2.464345091457881e-05, "loss": 0.5691, "step": 643 }, { "epoch": 0.17859123682750971, "grad_norm": 0.20045484602451324, "learning_rate": 2.46421403412836e-05, "loss": 0.5831, "step": 644 }, { "epoch": 0.1788685524126456, "grad_norm": 0.18542559444904327, "learning_rate": 2.4640827398735105e-05, "loss": 0.5666, "step": 645 }, { "epoch": 0.17914586799778148, "grad_norm": 0.29157590866088867, "learning_rate": 2.463951208718952e-05, "loss": 0.5841, "step": 646 }, { "epoch": 0.17942318358291737, "grad_norm": 0.20582380890846252, "learning_rate": 2.46381944069035e-05, "loss": 0.618, "step": 647 }, { "epoch": 0.17970049916805325, "grad_norm": 0.20922648906707764, "learning_rate": 2.4636874358134153e-05, "loss": 0.5831, "step": 648 }, { "epoch": 0.17997781475318914, "grad_norm": 0.19867978990077972, "learning_rate": 2.463555194113906e-05, "loss": 0.5734, "step": 649 }, { "epoch": 0.18025513033832502, "grad_norm": 0.19810400903224945, "learning_rate": 2.463422715617626e-05, "loss": 0.6086, "step": 650 }, { "epoch": 0.1805324459234609, "grad_norm": 0.22697949409484863, "learning_rate": 2.4632900003504246e-05, "loss": 0.5942, "step": 651 }, { "epoch": 0.1808097615085968, "grad_norm": 0.21418651938438416, "learning_rate": 2.4631570483381992e-05, "loss": 0.5793, "step": 652 }, { "epoch": 0.18108707709373267, "grad_norm": 0.22276844084262848, "learning_rate": 2.4630238596068914e-05, "loss": 0.5998, "step": 653 }, { "epoch": 0.18136439267886856, "grad_norm": 0.237818643450737, "learning_rate": 2.4628904341824898e-05, "loss": 0.5787, "step": 654 }, { "epoch": 0.18164170826400444, "grad_norm": 0.20757392048835754, "learning_rate": 2.46275677209103e-05, "loss": 0.5603, "step": 655 }, { "epoch": 0.18191902384914033, "grad_norm": 0.20871873199939728, "learning_rate": 2.4626228733585926e-05, "loss": 0.5689, "step": 656 }, { "epoch": 0.1821963394342762, "grad_norm": 0.2344467043876648, "learning_rate": 2.4624887380113048e-05, "loss": 0.5887, "step": 657 }, { "epoch": 0.1824736550194121, "grad_norm": 0.20889438688755035, "learning_rate": 2.4623543660753397e-05, "loss": 0.5699, "step": 658 }, { "epoch": 0.18275097060454798, "grad_norm": 0.23723891377449036, "learning_rate": 2.4622197575769173e-05, "loss": 0.5691, "step": 659 }, { "epoch": 0.18302828618968386, "grad_norm": 0.20685730874538422, "learning_rate": 2.462084912542303e-05, "loss": 0.585, "step": 660 }, { "epoch": 0.18330560177481975, "grad_norm": 0.18916456401348114, "learning_rate": 2.4619498309978085e-05, "loss": 0.5785, "step": 661 }, { "epoch": 0.18358291735995563, "grad_norm": 0.19421158730983734, "learning_rate": 2.4618145129697916e-05, "loss": 0.5742, "step": 662 }, { "epoch": 0.18386023294509152, "grad_norm": 0.19799606502056122, "learning_rate": 2.4616789584846575e-05, "loss": 0.5642, "step": 663 }, { "epoch": 0.1841375485302274, "grad_norm": 0.20754088461399078, "learning_rate": 2.4615431675688556e-05, "loss": 0.5793, "step": 664 }, { "epoch": 0.1844148641153633, "grad_norm": 0.20479615032672882, "learning_rate": 2.4614071402488822e-05, "loss": 0.6009, "step": 665 }, { "epoch": 0.18469217970049917, "grad_norm": 0.20695596933364868, "learning_rate": 2.4612708765512803e-05, "loss": 0.607, "step": 666 }, { "epoch": 0.18496949528563505, "grad_norm": 0.21166419982910156, "learning_rate": 2.4611343765026385e-05, "loss": 0.5889, "step": 667 }, { "epoch": 0.18524681087077094, "grad_norm": 0.19035880267620087, "learning_rate": 2.4609976401295914e-05, "loss": 0.5596, "step": 668 }, { "epoch": 0.18552412645590682, "grad_norm": 0.2189519852399826, "learning_rate": 2.4608606674588196e-05, "loss": 0.595, "step": 669 }, { "epoch": 0.1858014420410427, "grad_norm": 0.20535226166248322, "learning_rate": 2.4607234585170506e-05, "loss": 0.5785, "step": 670 }, { "epoch": 0.1860787576261786, "grad_norm": 0.20723526179790497, "learning_rate": 2.4605860133310577e-05, "loss": 0.6205, "step": 671 }, { "epoch": 0.18635607321131448, "grad_norm": 0.22765034437179565, "learning_rate": 2.4604483319276596e-05, "loss": 0.5739, "step": 672 }, { "epoch": 0.18663338879645036, "grad_norm": 0.19783975183963776, "learning_rate": 2.4603104143337212e-05, "loss": 0.6001, "step": 673 }, { "epoch": 0.18691070438158625, "grad_norm": 0.21098697185516357, "learning_rate": 2.4601722605761547e-05, "loss": 0.5636, "step": 674 }, { "epoch": 0.18718801996672213, "grad_norm": 0.20571714639663696, "learning_rate": 2.4600338706819175e-05, "loss": 0.6031, "step": 675 }, { "epoch": 0.18746533555185801, "grad_norm": 0.2101418673992157, "learning_rate": 2.4598952446780127e-05, "loss": 0.5854, "step": 676 }, { "epoch": 0.1877426511369939, "grad_norm": 0.20562447607517242, "learning_rate": 2.45975638259149e-05, "loss": 0.6007, "step": 677 }, { "epoch": 0.18801996672212978, "grad_norm": 0.20215946435928345, "learning_rate": 2.4596172844494454e-05, "loss": 0.601, "step": 678 }, { "epoch": 0.18829728230726567, "grad_norm": 0.2149016410112381, "learning_rate": 2.45947795027902e-05, "loss": 0.5965, "step": 679 }, { "epoch": 0.18857459789240155, "grad_norm": 0.1985412836074829, "learning_rate": 2.4593383801074025e-05, "loss": 0.563, "step": 680 }, { "epoch": 0.18885191347753744, "grad_norm": 0.20387370884418488, "learning_rate": 2.459198573961826e-05, "loss": 0.5816, "step": 681 }, { "epoch": 0.18912922906267332, "grad_norm": 0.19687046110630035, "learning_rate": 2.4590585318695703e-05, "loss": 0.5761, "step": 682 }, { "epoch": 0.1894065446478092, "grad_norm": 0.2053958773612976, "learning_rate": 2.458918253857962e-05, "loss": 0.5931, "step": 683 }, { "epoch": 0.1896838602329451, "grad_norm": 0.20455330610275269, "learning_rate": 2.4587777399543726e-05, "loss": 0.5739, "step": 684 }, { "epoch": 0.18996117581808097, "grad_norm": 0.2084437757730484, "learning_rate": 2.4586369901862204e-05, "loss": 0.5659, "step": 685 }, { "epoch": 0.19023849140321686, "grad_norm": 0.20842646062374115, "learning_rate": 2.4584960045809686e-05, "loss": 0.5863, "step": 686 }, { "epoch": 0.19051580698835274, "grad_norm": 0.19156675040721893, "learning_rate": 2.4583547831661283e-05, "loss": 0.5738, "step": 687 }, { "epoch": 0.19079312257348863, "grad_norm": 0.19893507659435272, "learning_rate": 2.4582133259692546e-05, "loss": 0.5739, "step": 688 }, { "epoch": 0.1910704381586245, "grad_norm": 0.20070527493953705, "learning_rate": 2.4580716330179505e-05, "loss": 0.5703, "step": 689 }, { "epoch": 0.1913477537437604, "grad_norm": 0.20454630255699158, "learning_rate": 2.4579297043398636e-05, "loss": 0.5735, "step": 690 }, { "epoch": 0.19162506932889628, "grad_norm": 0.21206796169281006, "learning_rate": 2.4577875399626877e-05, "loss": 0.5852, "step": 691 }, { "epoch": 0.19190238491403216, "grad_norm": 0.20151346921920776, "learning_rate": 2.4576451399141627e-05, "loss": 0.6033, "step": 692 }, { "epoch": 0.19217970049916805, "grad_norm": 0.1981053203344345, "learning_rate": 2.457502504222075e-05, "loss": 0.5874, "step": 693 }, { "epoch": 0.19245701608430393, "grad_norm": 0.197993203997612, "learning_rate": 2.457359632914257e-05, "loss": 0.5918, "step": 694 }, { "epoch": 0.19273433166943982, "grad_norm": 0.2022867351770401, "learning_rate": 2.4572165260185857e-05, "loss": 0.5786, "step": 695 }, { "epoch": 0.1930116472545757, "grad_norm": 0.20423941314220428, "learning_rate": 2.457073183562986e-05, "loss": 0.5781, "step": 696 }, { "epoch": 0.1932889628397116, "grad_norm": 0.2390977442264557, "learning_rate": 2.4569296055754275e-05, "loss": 0.5727, "step": 697 }, { "epoch": 0.19356627842484747, "grad_norm": 0.2169281542301178, "learning_rate": 2.4567857920839256e-05, "loss": 0.5638, "step": 698 }, { "epoch": 0.19384359400998336, "grad_norm": 0.18818534910678864, "learning_rate": 2.4566417431165427e-05, "loss": 0.5722, "step": 699 }, { "epoch": 0.19412090959511924, "grad_norm": 0.1905328780412674, "learning_rate": 2.456497458701386e-05, "loss": 0.5632, "step": 700 }, { "epoch": 0.19439822518025512, "grad_norm": 0.19498996436595917, "learning_rate": 2.45635293886661e-05, "loss": 0.5953, "step": 701 }, { "epoch": 0.194675540765391, "grad_norm": 0.19959990680217743, "learning_rate": 2.456208183640414e-05, "loss": 0.583, "step": 702 }, { "epoch": 0.1949528563505269, "grad_norm": 0.2034797966480255, "learning_rate": 2.456063193051043e-05, "loss": 0.5883, "step": 703 }, { "epoch": 0.19523017193566278, "grad_norm": 0.20221780240535736, "learning_rate": 2.455917967126789e-05, "loss": 0.5928, "step": 704 }, { "epoch": 0.19550748752079866, "grad_norm": 0.21273115277290344, "learning_rate": 2.4557725058959895e-05, "loss": 0.5719, "step": 705 }, { "epoch": 0.19578480310593455, "grad_norm": 0.1873910129070282, "learning_rate": 2.455626809387028e-05, "loss": 0.5642, "step": 706 }, { "epoch": 0.19606211869107043, "grad_norm": 0.19268092513084412, "learning_rate": 2.4554808776283334e-05, "loss": 0.5555, "step": 707 }, { "epoch": 0.19633943427620631, "grad_norm": 0.20540937781333923, "learning_rate": 2.4553347106483808e-05, "loss": 0.6076, "step": 708 }, { "epoch": 0.1966167498613422, "grad_norm": 0.19650278985500336, "learning_rate": 2.4551883084756917e-05, "loss": 0.5866, "step": 709 }, { "epoch": 0.19689406544647808, "grad_norm": 0.221206396818161, "learning_rate": 2.4550416711388327e-05, "loss": 0.581, "step": 710 }, { "epoch": 0.19717138103161397, "grad_norm": 0.19788506627082825, "learning_rate": 2.4548947986664167e-05, "loss": 0.5667, "step": 711 }, { "epoch": 0.19744869661674985, "grad_norm": 0.22713138163089752, "learning_rate": 2.454747691087102e-05, "loss": 0.5732, "step": 712 }, { "epoch": 0.19772601220188574, "grad_norm": 0.19035859405994415, "learning_rate": 2.454600348429594e-05, "loss": 0.6072, "step": 713 }, { "epoch": 0.19800332778702162, "grad_norm": 0.19724468886852264, "learning_rate": 2.4544527707226428e-05, "loss": 0.5958, "step": 714 }, { "epoch": 0.1982806433721575, "grad_norm": 0.20074382424354553, "learning_rate": 2.4543049579950445e-05, "loss": 0.6006, "step": 715 }, { "epoch": 0.1985579589572934, "grad_norm": 0.19603832066059113, "learning_rate": 2.4541569102756414e-05, "loss": 0.5901, "step": 716 }, { "epoch": 0.19883527454242927, "grad_norm": 0.1956927478313446, "learning_rate": 2.4540086275933215e-05, "loss": 0.5731, "step": 717 }, { "epoch": 0.19911259012756516, "grad_norm": 0.19697944819927216, "learning_rate": 2.4538601099770187e-05, "loss": 0.5778, "step": 718 }, { "epoch": 0.19938990571270104, "grad_norm": 0.19818982481956482, "learning_rate": 2.453711357455713e-05, "loss": 0.5588, "step": 719 }, { "epoch": 0.19966722129783693, "grad_norm": 0.18997104465961456, "learning_rate": 2.4535623700584297e-05, "loss": 0.5789, "step": 720 }, { "epoch": 0.1999445368829728, "grad_norm": 0.21167294681072235, "learning_rate": 2.4534131478142402e-05, "loss": 0.5804, "step": 721 }, { "epoch": 0.2002218524681087, "grad_norm": 0.21374750137329102, "learning_rate": 2.4532636907522617e-05, "loss": 0.5727, "step": 722 }, { "epoch": 0.20049916805324458, "grad_norm": 0.20970353484153748, "learning_rate": 2.453113998901657e-05, "loss": 0.5766, "step": 723 }, { "epoch": 0.20077648363838047, "grad_norm": 0.19419021904468536, "learning_rate": 2.4529640722916355e-05, "loss": 0.5755, "step": 724 }, { "epoch": 0.20105379922351635, "grad_norm": 0.20143002271652222, "learning_rate": 2.4528139109514513e-05, "loss": 0.5627, "step": 725 }, { "epoch": 0.20133111480865223, "grad_norm": 0.20216137170791626, "learning_rate": 2.4526635149104056e-05, "loss": 0.5771, "step": 726 }, { "epoch": 0.20160843039378812, "grad_norm": 0.20338623225688934, "learning_rate": 2.452512884197844e-05, "loss": 0.605, "step": 727 }, { "epoch": 0.201885745978924, "grad_norm": 0.20078277587890625, "learning_rate": 2.4523620188431585e-05, "loss": 0.5954, "step": 728 }, { "epoch": 0.2021630615640599, "grad_norm": 0.2046244889497757, "learning_rate": 2.4522109188757875e-05, "loss": 0.5945, "step": 729 }, { "epoch": 0.20244037714919577, "grad_norm": 0.20462092757225037, "learning_rate": 2.4520595843252138e-05, "loss": 0.5762, "step": 730 }, { "epoch": 0.20271769273433166, "grad_norm": 0.20749370753765106, "learning_rate": 2.4519080152209675e-05, "loss": 0.5435, "step": 731 }, { "epoch": 0.20299500831946754, "grad_norm": 0.21734094619750977, "learning_rate": 2.4517562115926233e-05, "loss": 0.5961, "step": 732 }, { "epoch": 0.20327232390460345, "grad_norm": 0.20754100382328033, "learning_rate": 2.4516041734698024e-05, "loss": 0.5548, "step": 733 }, { "epoch": 0.20354963948973934, "grad_norm": 0.20619900524616241, "learning_rate": 2.451451900882172e-05, "loss": 0.5897, "step": 734 }, { "epoch": 0.20382695507487522, "grad_norm": 0.28535279631614685, "learning_rate": 2.451299393859443e-05, "loss": 0.5815, "step": 735 }, { "epoch": 0.2041042706600111, "grad_norm": 0.19404634833335876, "learning_rate": 2.4511466524313748e-05, "loss": 0.5743, "step": 736 }, { "epoch": 0.204381586245147, "grad_norm": 0.18895412981510162, "learning_rate": 2.4509936766277706e-05, "loss": 0.5876, "step": 737 }, { "epoch": 0.20465890183028287, "grad_norm": 0.22120583057403564, "learning_rate": 2.4508404664784808e-05, "loss": 0.5873, "step": 738 }, { "epoch": 0.20493621741541876, "grad_norm": 0.21361954510211945, "learning_rate": 2.4506870220134e-05, "loss": 0.6002, "step": 739 }, { "epoch": 0.20521353300055464, "grad_norm": 0.19417433440685272, "learning_rate": 2.4505333432624694e-05, "loss": 0.5673, "step": 740 }, { "epoch": 0.20549084858569053, "grad_norm": 0.19543422758579254, "learning_rate": 2.4503794302556765e-05, "loss": 0.5628, "step": 741 }, { "epoch": 0.2057681641708264, "grad_norm": 0.1954490691423416, "learning_rate": 2.450225283023053e-05, "loss": 0.6246, "step": 742 }, { "epoch": 0.2060454797559623, "grad_norm": 0.1868993192911148, "learning_rate": 2.4500709015946776e-05, "loss": 0.5858, "step": 743 }, { "epoch": 0.20632279534109818, "grad_norm": 0.2035941481590271, "learning_rate": 2.449916286000674e-05, "loss": 0.5974, "step": 744 }, { "epoch": 0.20660011092623407, "grad_norm": 0.1918855458498001, "learning_rate": 2.4497614362712118e-05, "loss": 0.5843, "step": 745 }, { "epoch": 0.20687742651136995, "grad_norm": 0.19244706630706787, "learning_rate": 2.4496063524365063e-05, "loss": 0.5535, "step": 746 }, { "epoch": 0.20715474209650583, "grad_norm": 0.20424753427505493, "learning_rate": 2.4494510345268185e-05, "loss": 0.5835, "step": 747 }, { "epoch": 0.20743205768164172, "grad_norm": 0.19604821503162384, "learning_rate": 2.4492954825724544e-05, "loss": 0.5701, "step": 748 }, { "epoch": 0.2077093732667776, "grad_norm": 0.19546863436698914, "learning_rate": 2.4491396966037678e-05, "loss": 0.5898, "step": 749 }, { "epoch": 0.2079866888519135, "grad_norm": 0.19861635565757751, "learning_rate": 2.4489836766511555e-05, "loss": 0.587, "step": 750 }, { "epoch": 0.20826400443704937, "grad_norm": 0.19369752705097198, "learning_rate": 2.4488274227450613e-05, "loss": 0.6027, "step": 751 }, { "epoch": 0.20854132002218526, "grad_norm": 0.4158318340778351, "learning_rate": 2.448670934915975e-05, "loss": 0.602, "step": 752 }, { "epoch": 0.20881863560732114, "grad_norm": 0.20547251403331757, "learning_rate": 2.4485142131944306e-05, "loss": 0.5949, "step": 753 }, { "epoch": 0.20909595119245702, "grad_norm": 0.21317002177238464, "learning_rate": 2.4483572576110093e-05, "loss": 0.5862, "step": 754 }, { "epoch": 0.2093732667775929, "grad_norm": 0.19712896645069122, "learning_rate": 2.448200068196337e-05, "loss": 0.5983, "step": 755 }, { "epoch": 0.2096505823627288, "grad_norm": 0.2054811269044876, "learning_rate": 2.448042644981086e-05, "loss": 0.5983, "step": 756 }, { "epoch": 0.20992789794786468, "grad_norm": 0.19795221090316772, "learning_rate": 2.447884987995973e-05, "loss": 0.6208, "step": 757 }, { "epoch": 0.21020521353300056, "grad_norm": 0.21395504474639893, "learning_rate": 2.447727097271762e-05, "loss": 0.5983, "step": 758 }, { "epoch": 0.21048252911813645, "grad_norm": 0.19311439990997314, "learning_rate": 2.447568972839261e-05, "loss": 0.592, "step": 759 }, { "epoch": 0.21075984470327233, "grad_norm": 0.21382609009742737, "learning_rate": 2.4474106147293242e-05, "loss": 0.5752, "step": 760 }, { "epoch": 0.21103716028840822, "grad_norm": 0.19354097545146942, "learning_rate": 2.447252022972852e-05, "loss": 0.5911, "step": 761 }, { "epoch": 0.2113144758735441, "grad_norm": 0.19984754920005798, "learning_rate": 2.4470931976007894e-05, "loss": 0.5995, "step": 762 }, { "epoch": 0.21159179145867998, "grad_norm": 0.20404407382011414, "learning_rate": 2.4469341386441274e-05, "loss": 0.5551, "step": 763 }, { "epoch": 0.21186910704381587, "grad_norm": 0.2025006264448166, "learning_rate": 2.446774846133903e-05, "loss": 0.6105, "step": 764 }, { "epoch": 0.21214642262895175, "grad_norm": 0.20010975003242493, "learning_rate": 2.446615320101198e-05, "loss": 0.5788, "step": 765 }, { "epoch": 0.21242373821408764, "grad_norm": 0.20225434005260468, "learning_rate": 2.4464555605771404e-05, "loss": 0.5636, "step": 766 }, { "epoch": 0.21270105379922352, "grad_norm": 0.19845524430274963, "learning_rate": 2.4462955675929032e-05, "loss": 0.5758, "step": 767 }, { "epoch": 0.2129783693843594, "grad_norm": 0.19598202407360077, "learning_rate": 2.446135341179706e-05, "loss": 0.5456, "step": 768 }, { "epoch": 0.2132556849694953, "grad_norm": 0.2050497829914093, "learning_rate": 2.445974881368812e-05, "loss": 0.5912, "step": 769 }, { "epoch": 0.21353300055463117, "grad_norm": 0.19924525916576385, "learning_rate": 2.4458141881915324e-05, "loss": 0.5479, "step": 770 }, { "epoch": 0.21381031613976706, "grad_norm": 0.20329277217388153, "learning_rate": 2.445653261679222e-05, "loss": 0.6006, "step": 771 }, { "epoch": 0.21408763172490294, "grad_norm": 0.19327110052108765, "learning_rate": 2.4454921018632827e-05, "loss": 0.5739, "step": 772 }, { "epoch": 0.21436494731003883, "grad_norm": 0.19316452741622925, "learning_rate": 2.4453307087751594e-05, "loss": 0.5953, "step": 773 }, { "epoch": 0.2146422628951747, "grad_norm": 0.19617030024528503, "learning_rate": 2.4451690824463457e-05, "loss": 0.5686, "step": 774 }, { "epoch": 0.2149195784803106, "grad_norm": 0.20528316497802734, "learning_rate": 2.4450072229083786e-05, "loss": 0.5691, "step": 775 }, { "epoch": 0.21519689406544648, "grad_norm": 0.20544420182704926, "learning_rate": 2.4448451301928408e-05, "loss": 0.5776, "step": 776 }, { "epoch": 0.21547420965058237, "grad_norm": 0.21979959309101105, "learning_rate": 2.4446828043313614e-05, "loss": 0.5947, "step": 777 }, { "epoch": 0.21575152523571825, "grad_norm": 0.2081802487373352, "learning_rate": 2.4445202453556145e-05, "loss": 0.5752, "step": 778 }, { "epoch": 0.21602884082085413, "grad_norm": 0.2012367993593216, "learning_rate": 2.4443574532973195e-05, "loss": 0.5671, "step": 779 }, { "epoch": 0.21630615640599002, "grad_norm": 0.1998508721590042, "learning_rate": 2.4441944281882415e-05, "loss": 0.6154, "step": 780 }, { "epoch": 0.2165834719911259, "grad_norm": 0.20325055718421936, "learning_rate": 2.444031170060191e-05, "loss": 0.5743, "step": 781 }, { "epoch": 0.2168607875762618, "grad_norm": 0.20255804061889648, "learning_rate": 2.443867678945024e-05, "loss": 0.5748, "step": 782 }, { "epoch": 0.21713810316139767, "grad_norm": 0.1919908970594406, "learning_rate": 2.4437039548746415e-05, "loss": 0.5709, "step": 783 }, { "epoch": 0.21741541874653356, "grad_norm": 0.20014292001724243, "learning_rate": 2.443539997880991e-05, "loss": 0.5636, "step": 784 }, { "epoch": 0.21769273433166944, "grad_norm": 0.19818776845932007, "learning_rate": 2.4433758079960647e-05, "loss": 0.5649, "step": 785 }, { "epoch": 0.21797004991680533, "grad_norm": 0.18718703091144562, "learning_rate": 2.4432113852519005e-05, "loss": 0.5917, "step": 786 }, { "epoch": 0.2182473655019412, "grad_norm": 0.32280299067497253, "learning_rate": 2.4430467296805816e-05, "loss": 0.5864, "step": 787 }, { "epoch": 0.2185246810870771, "grad_norm": 0.20851466059684753, "learning_rate": 2.442881841314236e-05, "loss": 0.5837, "step": 788 }, { "epoch": 0.21880199667221298, "grad_norm": 0.1917923539876938, "learning_rate": 2.442716720185039e-05, "loss": 0.6032, "step": 789 }, { "epoch": 0.21907931225734886, "grad_norm": 0.25185203552246094, "learning_rate": 2.442551366325209e-05, "loss": 0.5873, "step": 790 }, { "epoch": 0.21935662784248475, "grad_norm": 0.1962638646364212, "learning_rate": 2.4423857797670118e-05, "loss": 0.577, "step": 791 }, { "epoch": 0.21963394342762063, "grad_norm": 0.19475746154785156, "learning_rate": 2.4422199605427572e-05, "loss": 0.5677, "step": 792 }, { "epoch": 0.21991125901275652, "grad_norm": 0.517663836479187, "learning_rate": 2.4420539086848007e-05, "loss": 0.5718, "step": 793 }, { "epoch": 0.2201885745978924, "grad_norm": 0.1952415555715561, "learning_rate": 2.441887624225544e-05, "loss": 0.5801, "step": 794 }, { "epoch": 0.22046589018302828, "grad_norm": 0.20376408100128174, "learning_rate": 2.441721107197433e-05, "loss": 0.6038, "step": 795 }, { "epoch": 0.22074320576816417, "grad_norm": 0.21492497622966766, "learning_rate": 2.4415543576329604e-05, "loss": 0.5626, "step": 796 }, { "epoch": 0.22102052135330005, "grad_norm": 0.21810825169086456, "learning_rate": 2.4413873755646627e-05, "loss": 0.566, "step": 797 }, { "epoch": 0.22129783693843594, "grad_norm": 0.2002691626548767, "learning_rate": 2.4412201610251232e-05, "loss": 0.5706, "step": 798 }, { "epoch": 0.22157515252357182, "grad_norm": 0.24929803609848022, "learning_rate": 2.441052714046969e-05, "loss": 0.5878, "step": 799 }, { "epoch": 0.2218524681087077, "grad_norm": 0.20125854015350342, "learning_rate": 2.440885034662874e-05, "loss": 0.5869, "step": 800 }, { "epoch": 0.2221297836938436, "grad_norm": 0.19206437468528748, "learning_rate": 2.4407171229055574e-05, "loss": 0.5911, "step": 801 }, { "epoch": 0.22240709927897948, "grad_norm": 0.22323836386203766, "learning_rate": 2.4405489788077823e-05, "loss": 0.5725, "step": 802 }, { "epoch": 0.22268441486411536, "grad_norm": 0.2044333517551422, "learning_rate": 2.4403806024023584e-05, "loss": 0.6243, "step": 803 }, { "epoch": 0.22296173044925124, "grad_norm": 0.20750725269317627, "learning_rate": 2.44021199372214e-05, "loss": 0.6002, "step": 804 }, { "epoch": 0.22323904603438713, "grad_norm": 0.2218470573425293, "learning_rate": 2.4400431528000284e-05, "loss": 0.5886, "step": 805 }, { "epoch": 0.223516361619523, "grad_norm": 0.20135878026485443, "learning_rate": 2.4398740796689676e-05, "loss": 0.5711, "step": 806 }, { "epoch": 0.2237936772046589, "grad_norm": 0.19776999950408936, "learning_rate": 2.439704774361949e-05, "loss": 0.5706, "step": 807 }, { "epoch": 0.22407099278979478, "grad_norm": 0.22162839770317078, "learning_rate": 2.4395352369120078e-05, "loss": 0.593, "step": 808 }, { "epoch": 0.22434830837493067, "grad_norm": 0.1974382847547531, "learning_rate": 2.4393654673522264e-05, "loss": 0.5657, "step": 809 }, { "epoch": 0.22462562396006655, "grad_norm": 0.2362552434206009, "learning_rate": 2.4391954657157302e-05, "loss": 0.5844, "step": 810 }, { "epoch": 0.22490293954520243, "grad_norm": 0.2056739628314972, "learning_rate": 2.4390252320356915e-05, "loss": 0.5959, "step": 811 }, { "epoch": 0.22518025513033832, "grad_norm": 0.19037006795406342, "learning_rate": 2.4388547663453275e-05, "loss": 0.5638, "step": 812 }, { "epoch": 0.2254575707154742, "grad_norm": 0.21129223704338074, "learning_rate": 2.4386840686779004e-05, "loss": 0.5954, "step": 813 }, { "epoch": 0.2257348863006101, "grad_norm": 0.2718643546104431, "learning_rate": 2.4385131390667184e-05, "loss": 0.5881, "step": 814 }, { "epoch": 0.22601220188574597, "grad_norm": 0.2181466519832611, "learning_rate": 2.4383419775451334e-05, "loss": 0.5858, "step": 815 }, { "epoch": 0.22628951747088186, "grad_norm": 0.22737659513950348, "learning_rate": 2.438170584146544e-05, "loss": 0.5732, "step": 816 }, { "epoch": 0.22656683305601774, "grad_norm": 0.20686711370944977, "learning_rate": 2.437998958904394e-05, "loss": 0.5524, "step": 817 }, { "epoch": 0.22684414864115363, "grad_norm": 0.2140977680683136, "learning_rate": 2.4378271018521714e-05, "loss": 0.5628, "step": 818 }, { "epoch": 0.2271214642262895, "grad_norm": 0.19793452322483063, "learning_rate": 2.4376550130234104e-05, "loss": 0.5796, "step": 819 }, { "epoch": 0.2273987798114254, "grad_norm": 0.20289914309978485, "learning_rate": 2.4374826924516903e-05, "loss": 0.5848, "step": 820 }, { "epoch": 0.22767609539656128, "grad_norm": 0.19081373512744904, "learning_rate": 2.437310140170635e-05, "loss": 0.6011, "step": 821 }, { "epoch": 0.22795341098169716, "grad_norm": 0.20546457171440125, "learning_rate": 2.437137356213914e-05, "loss": 0.5476, "step": 822 }, { "epoch": 0.22823072656683305, "grad_norm": 0.22531366348266602, "learning_rate": 2.4369643406152422e-05, "loss": 0.577, "step": 823 }, { "epoch": 0.22850804215196893, "grad_norm": 0.1964918076992035, "learning_rate": 2.4367910934083795e-05, "loss": 0.5733, "step": 824 }, { "epoch": 0.22878535773710482, "grad_norm": 0.1976742297410965, "learning_rate": 2.4366176146271313e-05, "loss": 0.576, "step": 825 }, { "epoch": 0.2290626733222407, "grad_norm": 0.20134706795215607, "learning_rate": 2.4364439043053475e-05, "loss": 0.5715, "step": 826 }, { "epoch": 0.22933998890737659, "grad_norm": 0.28538307547569275, "learning_rate": 2.4362699624769236e-05, "loss": 0.6009, "step": 827 }, { "epoch": 0.22961730449251247, "grad_norm": 0.21845568716526031, "learning_rate": 2.4360957891758006e-05, "loss": 0.5936, "step": 828 }, { "epoch": 0.22989462007764835, "grad_norm": 0.1977756768465042, "learning_rate": 2.435921384435964e-05, "loss": 0.5651, "step": 829 }, { "epoch": 0.23017193566278424, "grad_norm": 0.20483648777008057, "learning_rate": 2.4357467482914447e-05, "loss": 0.5861, "step": 830 }, { "epoch": 0.23044925124792012, "grad_norm": 0.191145658493042, "learning_rate": 2.4355718807763196e-05, "loss": 0.5951, "step": 831 }, { "epoch": 0.230726566833056, "grad_norm": 0.20611602067947388, "learning_rate": 2.4353967819247093e-05, "loss": 0.5762, "step": 832 }, { "epoch": 0.2310038824181919, "grad_norm": 0.34163740277290344, "learning_rate": 2.43522145177078e-05, "loss": 0.5573, "step": 833 }, { "epoch": 0.23128119800332778, "grad_norm": 0.1989511400461197, "learning_rate": 2.4350458903487438e-05, "loss": 0.5799, "step": 834 }, { "epoch": 0.23155851358846366, "grad_norm": 0.1951713114976883, "learning_rate": 2.434870097692857e-05, "loss": 0.5763, "step": 835 }, { "epoch": 0.23183582917359954, "grad_norm": 0.19492702186107635, "learning_rate": 2.4346940738374217e-05, "loss": 0.5751, "step": 836 }, { "epoch": 0.23211314475873543, "grad_norm": 0.20524460077285767, "learning_rate": 2.434517818816785e-05, "loss": 0.5959, "step": 837 }, { "epoch": 0.2323904603438713, "grad_norm": 0.19833968579769135, "learning_rate": 2.4343413326653384e-05, "loss": 0.5881, "step": 838 }, { "epoch": 0.2326677759290072, "grad_norm": 0.19568949937820435, "learning_rate": 2.4341646154175192e-05, "loss": 0.5449, "step": 839 }, { "epoch": 0.23294509151414308, "grad_norm": 0.19156986474990845, "learning_rate": 2.43398766710781e-05, "loss": 0.5703, "step": 840 }, { "epoch": 0.23322240709927897, "grad_norm": 0.20440654456615448, "learning_rate": 2.4338104877707372e-05, "loss": 0.5783, "step": 841 }, { "epoch": 0.23349972268441485, "grad_norm": 0.20396758615970612, "learning_rate": 2.4336330774408744e-05, "loss": 0.5911, "step": 842 }, { "epoch": 0.23377703826955074, "grad_norm": 0.19637946784496307, "learning_rate": 2.4334554361528376e-05, "loss": 0.5708, "step": 843 }, { "epoch": 0.23405435385468662, "grad_norm": 0.21237438917160034, "learning_rate": 2.433277563941291e-05, "loss": 0.6104, "step": 844 }, { "epoch": 0.2343316694398225, "grad_norm": 0.18440033495426178, "learning_rate": 2.433099460840941e-05, "loss": 0.5745, "step": 845 }, { "epoch": 0.23460898502495842, "grad_norm": 0.19301645457744598, "learning_rate": 2.4329211268865406e-05, "loss": 0.5621, "step": 846 }, { "epoch": 0.2348863006100943, "grad_norm": 0.2056163102388382, "learning_rate": 2.4327425621128873e-05, "loss": 0.5973, "step": 847 }, { "epoch": 0.23516361619523019, "grad_norm": 0.20398737490177155, "learning_rate": 2.432563766554824e-05, "loss": 0.5795, "step": 848 }, { "epoch": 0.23544093178036607, "grad_norm": 0.19015717506408691, "learning_rate": 2.432384740247239e-05, "loss": 0.5563, "step": 849 }, { "epoch": 0.23571824736550195, "grad_norm": 0.19576147198677063, "learning_rate": 2.4322054832250636e-05, "loss": 0.5757, "step": 850 }, { "epoch": 0.23599556295063784, "grad_norm": 0.1978127360343933, "learning_rate": 2.4320259955232773e-05, "loss": 0.5832, "step": 851 }, { "epoch": 0.23627287853577372, "grad_norm": 0.19191214442253113, "learning_rate": 2.4318462771769012e-05, "loss": 0.5812, "step": 852 }, { "epoch": 0.2365501941209096, "grad_norm": 0.18717870116233826, "learning_rate": 2.4316663282210046e-05, "loss": 0.5753, "step": 853 }, { "epoch": 0.2368275097060455, "grad_norm": 0.18739578127861023, "learning_rate": 2.4314861486906996e-05, "loss": 0.5665, "step": 854 }, { "epoch": 0.23710482529118138, "grad_norm": 0.1928299367427826, "learning_rate": 2.431305738621144e-05, "loss": 0.5836, "step": 855 }, { "epoch": 0.23738214087631726, "grad_norm": 0.20048747956752777, "learning_rate": 2.4311250980475408e-05, "loss": 0.5673, "step": 856 }, { "epoch": 0.23765945646145314, "grad_norm": 0.2141515463590622, "learning_rate": 2.4309442270051376e-05, "loss": 0.5426, "step": 857 }, { "epoch": 0.23793677204658903, "grad_norm": 0.19637706875801086, "learning_rate": 2.4307631255292273e-05, "loss": 0.5542, "step": 858 }, { "epoch": 0.2382140876317249, "grad_norm": 0.21245527267456055, "learning_rate": 2.4305817936551472e-05, "loss": 0.5867, "step": 859 }, { "epoch": 0.2384914032168608, "grad_norm": 0.19474704563617706, "learning_rate": 2.4304002314182804e-05, "loss": 0.5716, "step": 860 }, { "epoch": 0.23876871880199668, "grad_norm": 0.19590826332569122, "learning_rate": 2.4302184388540544e-05, "loss": 0.5746, "step": 861 }, { "epoch": 0.23904603438713257, "grad_norm": 0.21298062801361084, "learning_rate": 2.4300364159979418e-05, "loss": 0.5788, "step": 862 }, { "epoch": 0.23932334997226845, "grad_norm": 0.21597878634929657, "learning_rate": 2.4298541628854597e-05, "loss": 0.5868, "step": 863 }, { "epoch": 0.23960066555740434, "grad_norm": 0.2077784389257431, "learning_rate": 2.4296716795521707e-05, "loss": 0.5878, "step": 864 }, { "epoch": 0.23987798114254022, "grad_norm": 0.1982557773590088, "learning_rate": 2.4294889660336823e-05, "loss": 0.5734, "step": 865 }, { "epoch": 0.2401552967276761, "grad_norm": 0.19443267583847046, "learning_rate": 2.4293060223656465e-05, "loss": 0.5845, "step": 866 }, { "epoch": 0.240432612312812, "grad_norm": 0.2007235586643219, "learning_rate": 2.4291228485837613e-05, "loss": 0.5829, "step": 867 }, { "epoch": 0.24070992789794787, "grad_norm": 0.18996436893939972, "learning_rate": 2.4289394447237674e-05, "loss": 0.5918, "step": 868 }, { "epoch": 0.24098724348308376, "grad_norm": 0.19309580326080322, "learning_rate": 2.4287558108214527e-05, "loss": 0.5962, "step": 869 }, { "epoch": 0.24126455906821964, "grad_norm": 0.19195586442947388, "learning_rate": 2.428571946912649e-05, "loss": 0.5738, "step": 870 }, { "epoch": 0.24154187465335553, "grad_norm": 0.19671426713466644, "learning_rate": 2.4283878530332322e-05, "loss": 0.5551, "step": 871 }, { "epoch": 0.2418191902384914, "grad_norm": 0.19544430077075958, "learning_rate": 2.4282035292191247e-05, "loss": 0.5876, "step": 872 }, { "epoch": 0.2420965058236273, "grad_norm": 0.19258378446102142, "learning_rate": 2.4280189755062928e-05, "loss": 0.559, "step": 873 }, { "epoch": 0.24237382140876318, "grad_norm": 0.21361422538757324, "learning_rate": 2.427834191930748e-05, "loss": 0.5753, "step": 874 }, { "epoch": 0.24265113699389906, "grad_norm": 0.20279040932655334, "learning_rate": 2.4276491785285457e-05, "loss": 0.5724, "step": 875 }, { "epoch": 0.24292845257903495, "grad_norm": 0.20467157661914825, "learning_rate": 2.427463935335788e-05, "loss": 0.5989, "step": 876 }, { "epoch": 0.24320576816417083, "grad_norm": 0.20144453644752502, "learning_rate": 2.4272784623886195e-05, "loss": 0.5943, "step": 877 }, { "epoch": 0.24348308374930672, "grad_norm": 0.19686299562454224, "learning_rate": 2.4270927597232325e-05, "loss": 0.5692, "step": 878 }, { "epoch": 0.2437603993344426, "grad_norm": 0.22722774744033813, "learning_rate": 2.426906827375861e-05, "loss": 0.566, "step": 879 }, { "epoch": 0.24403771491957849, "grad_norm": 0.19703295826911926, "learning_rate": 2.4267206653827856e-05, "loss": 0.5627, "step": 880 }, { "epoch": 0.24431503050471437, "grad_norm": 0.2020971179008484, "learning_rate": 2.4265342737803327e-05, "loss": 0.5836, "step": 881 }, { "epoch": 0.24459234608985025, "grad_norm": 0.1921062171459198, "learning_rate": 2.4263476526048707e-05, "loss": 0.5651, "step": 882 }, { "epoch": 0.24486966167498614, "grad_norm": 0.20176348090171814, "learning_rate": 2.4261608018928147e-05, "loss": 0.5883, "step": 883 }, { "epoch": 0.24514697726012202, "grad_norm": 0.19450893998146057, "learning_rate": 2.425973721680625e-05, "loss": 0.5591, "step": 884 }, { "epoch": 0.2454242928452579, "grad_norm": 0.8580565452575684, "learning_rate": 2.425786412004805e-05, "loss": 0.5592, "step": 885 }, { "epoch": 0.2457016084303938, "grad_norm": 0.24260735511779785, "learning_rate": 2.4255988729019042e-05, "loss": 0.5902, "step": 886 }, { "epoch": 0.24597892401552968, "grad_norm": 0.19789321720600128, "learning_rate": 2.4254111044085163e-05, "loss": 0.5745, "step": 887 }, { "epoch": 0.24625623960066556, "grad_norm": 0.21979939937591553, "learning_rate": 2.4252231065612805e-05, "loss": 0.5551, "step": 888 }, { "epoch": 0.24653355518580145, "grad_norm": 0.20352937281131744, "learning_rate": 2.425034879396879e-05, "loss": 0.5591, "step": 889 }, { "epoch": 0.24681087077093733, "grad_norm": 0.2068743109703064, "learning_rate": 2.424846422952041e-05, "loss": 0.5848, "step": 890 }, { "epoch": 0.24708818635607321, "grad_norm": 0.19195155799388885, "learning_rate": 2.4246577372635387e-05, "loss": 0.5621, "step": 891 }, { "epoch": 0.2473655019412091, "grad_norm": 0.2014138102531433, "learning_rate": 2.42446882236819e-05, "loss": 0.5689, "step": 892 }, { "epoch": 0.24764281752634498, "grad_norm": 0.2104417085647583, "learning_rate": 2.4242796783028573e-05, "loss": 0.5898, "step": 893 }, { "epoch": 0.24792013311148087, "grad_norm": 0.22085507214069366, "learning_rate": 2.4240903051044474e-05, "loss": 0.5762, "step": 894 }, { "epoch": 0.24819744869661675, "grad_norm": 0.20655465126037598, "learning_rate": 2.4239007028099117e-05, "loss": 0.5654, "step": 895 }, { "epoch": 0.24847476428175264, "grad_norm": 0.2050492912530899, "learning_rate": 2.4237108714562474e-05, "loss": 0.5859, "step": 896 }, { "epoch": 0.24875207986688852, "grad_norm": 0.21355165541172028, "learning_rate": 2.4235208110804947e-05, "loss": 0.5695, "step": 897 }, { "epoch": 0.2490293954520244, "grad_norm": 0.20924112200737, "learning_rate": 2.42333052171974e-05, "loss": 0.5593, "step": 898 }, { "epoch": 0.2493067110371603, "grad_norm": 0.22572918236255646, "learning_rate": 2.423140003411114e-05, "loss": 0.6217, "step": 899 }, { "epoch": 0.24958402662229617, "grad_norm": 0.2063211351633072, "learning_rate": 2.4229492561917914e-05, "loss": 0.5765, "step": 900 }, { "epoch": 0.24986134220743206, "grad_norm": 0.21796129643917084, "learning_rate": 2.4227582800989923e-05, "loss": 0.5932, "step": 901 }, { "epoch": 0.25013865779256794, "grad_norm": 0.20169825851917267, "learning_rate": 2.4225670751699808e-05, "loss": 0.5858, "step": 902 }, { "epoch": 0.2504159733777038, "grad_norm": 0.2097984254360199, "learning_rate": 2.4223756414420668e-05, "loss": 0.5888, "step": 903 }, { "epoch": 0.2506932889628397, "grad_norm": 0.2516496479511261, "learning_rate": 2.4221839789526033e-05, "loss": 0.5812, "step": 904 }, { "epoch": 0.2509706045479756, "grad_norm": 0.2110574096441269, "learning_rate": 2.421992087738989e-05, "loss": 0.5859, "step": 905 }, { "epoch": 0.2512479201331115, "grad_norm": 0.1935090869665146, "learning_rate": 2.4217999678386673e-05, "loss": 0.5768, "step": 906 }, { "epoch": 0.25152523571824736, "grad_norm": 0.20760700106620789, "learning_rate": 2.4216076192891257e-05, "loss": 0.5856, "step": 907 }, { "epoch": 0.25180255130338325, "grad_norm": 0.20197226107120514, "learning_rate": 2.4214150421278964e-05, "loss": 0.6041, "step": 908 }, { "epoch": 0.25207986688851913, "grad_norm": 0.20894859731197357, "learning_rate": 2.4212222363925563e-05, "loss": 0.5821, "step": 909 }, { "epoch": 0.252357182473655, "grad_norm": 0.20673821866512299, "learning_rate": 2.4210292021207268e-05, "loss": 0.5813, "step": 910 }, { "epoch": 0.2526344980587909, "grad_norm": 0.23159608244895935, "learning_rate": 2.420835939350074e-05, "loss": 0.5803, "step": 911 }, { "epoch": 0.2529118136439268, "grad_norm": 0.19740775227546692, "learning_rate": 2.420642448118309e-05, "loss": 0.5721, "step": 912 }, { "epoch": 0.25318912922906267, "grad_norm": 0.20442472398281097, "learning_rate": 2.4204487284631866e-05, "loss": 0.5681, "step": 913 }, { "epoch": 0.25346644481419855, "grad_norm": 0.19745062291622162, "learning_rate": 2.420254780422507e-05, "loss": 0.573, "step": 914 }, { "epoch": 0.25374376039933444, "grad_norm": 0.19819265604019165, "learning_rate": 2.4200606040341147e-05, "loss": 0.6015, "step": 915 }, { "epoch": 0.2540210759844703, "grad_norm": 0.20158332586288452, "learning_rate": 2.4198661993358976e-05, "loss": 0.5557, "step": 916 }, { "epoch": 0.2542983915696062, "grad_norm": 0.19300688803195953, "learning_rate": 2.4196715663657903e-05, "loss": 0.5691, "step": 917 }, { "epoch": 0.2545757071547421, "grad_norm": 0.19178220629692078, "learning_rate": 2.4194767051617707e-05, "loss": 0.5877, "step": 918 }, { "epoch": 0.254853022739878, "grad_norm": 0.19551022350788116, "learning_rate": 2.4192816157618615e-05, "loss": 0.5719, "step": 919 }, { "epoch": 0.25513033832501386, "grad_norm": 0.2008076012134552, "learning_rate": 2.419086298204129e-05, "loss": 0.579, "step": 920 }, { "epoch": 0.25540765391014975, "grad_norm": 0.19526442885398865, "learning_rate": 2.4188907525266856e-05, "loss": 0.546, "step": 921 }, { "epoch": 0.25568496949528563, "grad_norm": 0.1844739466905594, "learning_rate": 2.418694978767687e-05, "loss": 0.5759, "step": 922 }, { "epoch": 0.2559622850804215, "grad_norm": 0.20165039598941803, "learning_rate": 2.4184989769653343e-05, "loss": 0.5618, "step": 923 }, { "epoch": 0.2562396006655574, "grad_norm": 0.19225285947322845, "learning_rate": 2.418302747157872e-05, "loss": 0.5627, "step": 924 }, { "epoch": 0.2565169162506933, "grad_norm": 0.19688788056373596, "learning_rate": 2.418106289383591e-05, "loss": 0.5714, "step": 925 }, { "epoch": 0.25679423183582917, "grad_norm": 0.2039179801940918, "learning_rate": 2.417909603680824e-05, "loss": 0.5975, "step": 926 }, { "epoch": 0.25707154742096505, "grad_norm": 0.21120522916316986, "learning_rate": 2.41771269008795e-05, "loss": 0.6026, "step": 927 }, { "epoch": 0.25734886300610094, "grad_norm": 0.21704575419425964, "learning_rate": 2.4175155486433927e-05, "loss": 0.5798, "step": 928 }, { "epoch": 0.2576261785912368, "grad_norm": 0.18682295083999634, "learning_rate": 2.4173181793856187e-05, "loss": 0.5872, "step": 929 }, { "epoch": 0.2579034941763727, "grad_norm": 0.200609490275383, "learning_rate": 2.4171205823531402e-05, "loss": 0.5568, "step": 930 }, { "epoch": 0.2581808097615086, "grad_norm": 0.1996905356645584, "learning_rate": 2.416922757584514e-05, "loss": 0.5965, "step": 931 }, { "epoch": 0.2584581253466445, "grad_norm": 0.1988278329372406, "learning_rate": 2.4167247051183412e-05, "loss": 0.5819, "step": 932 }, { "epoch": 0.25873544093178036, "grad_norm": 0.20553693175315857, "learning_rate": 2.4165264249932662e-05, "loss": 0.591, "step": 933 }, { "epoch": 0.25901275651691624, "grad_norm": 0.19097572565078735, "learning_rate": 2.416327917247979e-05, "loss": 0.5517, "step": 934 }, { "epoch": 0.2592900721020521, "grad_norm": 0.191832035779953, "learning_rate": 2.4161291819212144e-05, "loss": 0.5811, "step": 935 }, { "epoch": 0.259567387687188, "grad_norm": 0.19829009473323822, "learning_rate": 2.4159302190517496e-05, "loss": 0.5966, "step": 936 }, { "epoch": 0.2598447032723239, "grad_norm": 0.1965586543083191, "learning_rate": 2.415731028678409e-05, "loss": 0.5458, "step": 937 }, { "epoch": 0.2601220188574598, "grad_norm": 0.20748619735240936, "learning_rate": 2.4155316108400593e-05, "loss": 0.5843, "step": 938 }, { "epoch": 0.26039933444259566, "grad_norm": 0.20656223595142365, "learning_rate": 2.415331965575612e-05, "loss": 0.5749, "step": 939 }, { "epoch": 0.26067665002773155, "grad_norm": 0.186055988073349, "learning_rate": 2.4151320929240227e-05, "loss": 0.5749, "step": 940 }, { "epoch": 0.26095396561286743, "grad_norm": 0.21665525436401367, "learning_rate": 2.4149319929242934e-05, "loss": 0.5825, "step": 941 }, { "epoch": 0.2612312811980033, "grad_norm": 0.2007439136505127, "learning_rate": 2.4147316656154674e-05, "loss": 0.5904, "step": 942 }, { "epoch": 0.2615085967831392, "grad_norm": 0.20290826261043549, "learning_rate": 2.4145311110366347e-05, "loss": 0.6019, "step": 943 }, { "epoch": 0.2617859123682751, "grad_norm": 0.20062971115112305, "learning_rate": 2.4143303292269286e-05, "loss": 0.5571, "step": 944 }, { "epoch": 0.26206322795341097, "grad_norm": 0.2056947946548462, "learning_rate": 2.414129320225527e-05, "loss": 0.5683, "step": 945 }, { "epoch": 0.26234054353854686, "grad_norm": 0.18966248631477356, "learning_rate": 2.4139280840716517e-05, "loss": 0.5962, "step": 946 }, { "epoch": 0.26261785912368274, "grad_norm": 0.20114421844482422, "learning_rate": 2.41372662080457e-05, "loss": 0.5528, "step": 947 }, { "epoch": 0.2628951747088186, "grad_norm": 0.1974896341562271, "learning_rate": 2.4135249304635914e-05, "loss": 0.5802, "step": 948 }, { "epoch": 0.2631724902939545, "grad_norm": 0.1997508406639099, "learning_rate": 2.4133230130880726e-05, "loss": 0.6122, "step": 949 }, { "epoch": 0.2634498058790904, "grad_norm": 0.18641312420368195, "learning_rate": 2.413120868717412e-05, "loss": 0.5705, "step": 950 }, { "epoch": 0.2637271214642263, "grad_norm": 0.19226068258285522, "learning_rate": 2.4129184973910533e-05, "loss": 0.5737, "step": 951 }, { "epoch": 0.26400443704936216, "grad_norm": 0.18802867829799652, "learning_rate": 2.4127158991484855e-05, "loss": 0.562, "step": 952 }, { "epoch": 0.26428175263449805, "grad_norm": 0.19342157244682312, "learning_rate": 2.41251307402924e-05, "loss": 0.5483, "step": 953 }, { "epoch": 0.26455906821963393, "grad_norm": 0.1865961253643036, "learning_rate": 2.4123100220728935e-05, "loss": 0.5638, "step": 954 }, { "epoch": 0.2648363838047698, "grad_norm": 0.22456099092960358, "learning_rate": 2.4121067433190666e-05, "loss": 0.5918, "step": 955 }, { "epoch": 0.2651136993899057, "grad_norm": 0.24317651987075806, "learning_rate": 2.4119032378074245e-05, "loss": 0.5477, "step": 956 }, { "epoch": 0.2653910149750416, "grad_norm": 0.19641970098018646, "learning_rate": 2.411699505577677e-05, "loss": 0.5554, "step": 957 }, { "epoch": 0.26566833056017747, "grad_norm": 0.20111410319805145, "learning_rate": 2.4114955466695773e-05, "loss": 0.562, "step": 958 }, { "epoch": 0.26594564614531335, "grad_norm": 0.19789332151412964, "learning_rate": 2.411291361122923e-05, "loss": 0.5919, "step": 959 }, { "epoch": 0.26622296173044924, "grad_norm": 0.20293276011943817, "learning_rate": 2.4110869489775567e-05, "loss": 0.5839, "step": 960 }, { "epoch": 0.2665002773155851, "grad_norm": 0.18560869991779327, "learning_rate": 2.410882310273364e-05, "loss": 0.5476, "step": 961 }, { "epoch": 0.266777592900721, "grad_norm": 0.1932571828365326, "learning_rate": 2.410677445050276e-05, "loss": 0.5736, "step": 962 }, { "epoch": 0.2670549084858569, "grad_norm": 0.21673519909381866, "learning_rate": 2.4104723533482664e-05, "loss": 0.5558, "step": 963 }, { "epoch": 0.2673322240709928, "grad_norm": 0.21258150041103363, "learning_rate": 2.4102670352073548e-05, "loss": 0.591, "step": 964 }, { "epoch": 0.26760953965612866, "grad_norm": 0.20553314685821533, "learning_rate": 2.4100614906676036e-05, "loss": 0.5901, "step": 965 }, { "epoch": 0.26788685524126454, "grad_norm": 0.23216207325458527, "learning_rate": 2.4098557197691204e-05, "loss": 0.5677, "step": 966 }, { "epoch": 0.2681641708264004, "grad_norm": 0.20041371881961823, "learning_rate": 2.4096497225520564e-05, "loss": 0.566, "step": 967 }, { "epoch": 0.2684414864115363, "grad_norm": 0.20803777873516083, "learning_rate": 2.4094434990566076e-05, "loss": 0.5714, "step": 968 }, { "epoch": 0.2687188019966722, "grad_norm": 0.19973017275333405, "learning_rate": 2.409237049323013e-05, "loss": 0.5805, "step": 969 }, { "epoch": 0.2689961175818081, "grad_norm": 0.20199733972549438, "learning_rate": 2.4090303733915567e-05, "loss": 0.5475, "step": 970 }, { "epoch": 0.26927343316694397, "grad_norm": 0.2112300544977188, "learning_rate": 2.4088234713025664e-05, "loss": 0.5821, "step": 971 }, { "epoch": 0.26955074875207985, "grad_norm": 0.18687258660793304, "learning_rate": 2.408616343096415e-05, "loss": 0.5369, "step": 972 }, { "epoch": 0.26982806433721573, "grad_norm": 0.20027992129325867, "learning_rate": 2.4084089888135176e-05, "loss": 0.551, "step": 973 }, { "epoch": 0.2701053799223516, "grad_norm": 0.1915608048439026, "learning_rate": 2.408201408494335e-05, "loss": 0.5653, "step": 974 }, { "epoch": 0.2703826955074875, "grad_norm": 0.2044133096933365, "learning_rate": 2.407993602179372e-05, "loss": 0.597, "step": 975 }, { "epoch": 0.2706600110926234, "grad_norm": 0.20056426525115967, "learning_rate": 2.4077855699091764e-05, "loss": 0.5864, "step": 976 }, { "epoch": 0.27093732667775927, "grad_norm": 0.19527383148670197, "learning_rate": 2.407577311724341e-05, "loss": 0.565, "step": 977 }, { "epoch": 0.27121464226289516, "grad_norm": 0.2120949625968933, "learning_rate": 2.407368827665503e-05, "loss": 0.5621, "step": 978 }, { "epoch": 0.27149195784803104, "grad_norm": 0.18631702661514282, "learning_rate": 2.407160117773343e-05, "loss": 0.5657, "step": 979 }, { "epoch": 0.2717692734331669, "grad_norm": 0.19784550368785858, "learning_rate": 2.4069511820885854e-05, "loss": 0.5547, "step": 980 }, { "epoch": 0.2720465890183028, "grad_norm": 0.19714047014713287, "learning_rate": 2.4067420206519993e-05, "loss": 0.5793, "step": 981 }, { "epoch": 0.2723239046034387, "grad_norm": 0.2170424610376358, "learning_rate": 2.4065326335043976e-05, "loss": 0.5745, "step": 982 }, { "epoch": 0.2726012201885746, "grad_norm": 0.20346680283546448, "learning_rate": 2.4063230206866377e-05, "loss": 0.5828, "step": 983 }, { "epoch": 0.27287853577371046, "grad_norm": 0.2216998040676117, "learning_rate": 2.40611318223962e-05, "loss": 0.5762, "step": 984 }, { "epoch": 0.27315585135884635, "grad_norm": 0.20975996553897858, "learning_rate": 2.4059031182042897e-05, "loss": 0.5442, "step": 985 }, { "epoch": 0.27343316694398223, "grad_norm": 0.19896754622459412, "learning_rate": 2.405692828621636e-05, "loss": 0.5977, "step": 986 }, { "epoch": 0.2737104825291181, "grad_norm": 0.18813903629779816, "learning_rate": 2.4054823135326922e-05, "loss": 0.5641, "step": 987 }, { "epoch": 0.273987798114254, "grad_norm": 0.19095094501972198, "learning_rate": 2.4052715729785348e-05, "loss": 0.5427, "step": 988 }, { "epoch": 0.2742651136993899, "grad_norm": 0.19185671210289001, "learning_rate": 2.405060607000285e-05, "loss": 0.557, "step": 989 }, { "epoch": 0.27454242928452577, "grad_norm": 0.19244584441184998, "learning_rate": 2.4048494156391087e-05, "loss": 0.5558, "step": 990 }, { "epoch": 0.27481974486966165, "grad_norm": 0.20083992183208466, "learning_rate": 2.404637998936214e-05, "loss": 0.5635, "step": 991 }, { "epoch": 0.27509706045479754, "grad_norm": 0.19767695665359497, "learning_rate": 2.404426356932854e-05, "loss": 0.5814, "step": 992 }, { "epoch": 0.2753743760399334, "grad_norm": 0.19133426249027252, "learning_rate": 2.4042144896703256e-05, "loss": 0.5951, "step": 993 }, { "epoch": 0.2756516916250693, "grad_norm": 0.19364149868488312, "learning_rate": 2.40400239718997e-05, "loss": 0.5695, "step": 994 }, { "epoch": 0.2759290072102052, "grad_norm": 0.19669091701507568, "learning_rate": 2.4037900795331722e-05, "loss": 0.5801, "step": 995 }, { "epoch": 0.2762063227953411, "grad_norm": 0.2011607140302658, "learning_rate": 2.403577536741361e-05, "loss": 0.5736, "step": 996 }, { "epoch": 0.27648363838047696, "grad_norm": 0.19536298513412476, "learning_rate": 2.4033647688560084e-05, "loss": 0.5404, "step": 997 }, { "epoch": 0.27676095396561284, "grad_norm": 0.1903197318315506, "learning_rate": 2.403151775918632e-05, "loss": 0.5939, "step": 998 }, { "epoch": 0.27703826955074873, "grad_norm": 0.19172310829162598, "learning_rate": 2.4029385579707916e-05, "loss": 0.5688, "step": 999 }, { "epoch": 0.2773155851358846, "grad_norm": 0.22239771485328674, "learning_rate": 2.402725115054092e-05, "loss": 0.5754, "step": 1000 }, { "epoch": 0.2775929007210205, "grad_norm": 0.18384471535682678, "learning_rate": 2.402511447210182e-05, "loss": 0.5668, "step": 1001 }, { "epoch": 0.2778702163061564, "grad_norm": 0.2017565667629242, "learning_rate": 2.402297554480753e-05, "loss": 0.5711, "step": 1002 }, { "epoch": 0.27814753189129227, "grad_norm": 0.1890055239200592, "learning_rate": 2.402083436907542e-05, "loss": 0.5552, "step": 1003 }, { "epoch": 0.27842484747642815, "grad_norm": 0.1961050033569336, "learning_rate": 2.4018690945323284e-05, "loss": 0.5744, "step": 1004 }, { "epoch": 0.27870216306156403, "grad_norm": 0.2047930508852005, "learning_rate": 2.401654527396936e-05, "loss": 0.5794, "step": 1005 }, { "epoch": 0.2789794786466999, "grad_norm": 0.19990523159503937, "learning_rate": 2.4014397355432335e-05, "loss": 0.6046, "step": 1006 }, { "epoch": 0.2792567942318358, "grad_norm": 0.19778995215892792, "learning_rate": 2.401224719013131e-05, "loss": 0.5621, "step": 1007 }, { "epoch": 0.2795341098169717, "grad_norm": 0.1909160017967224, "learning_rate": 2.4010094778485846e-05, "loss": 0.5943, "step": 1008 }, { "epoch": 0.27981142540210757, "grad_norm": 0.20408110320568085, "learning_rate": 2.4007940120915946e-05, "loss": 0.5991, "step": 1009 }, { "epoch": 0.28008874098724346, "grad_norm": 0.20102624595165253, "learning_rate": 2.4005783217842024e-05, "loss": 0.6022, "step": 1010 }, { "epoch": 0.28036605657237934, "grad_norm": 0.1910308301448822, "learning_rate": 2.4003624069684957e-05, "loss": 0.5874, "step": 1011 }, { "epoch": 0.2806433721575152, "grad_norm": 0.189175084233284, "learning_rate": 2.4001462676866054e-05, "loss": 0.5698, "step": 1012 }, { "epoch": 0.2809206877426511, "grad_norm": 0.19003014266490936, "learning_rate": 2.3999299039807055e-05, "loss": 0.5819, "step": 1013 }, { "epoch": 0.281198003327787, "grad_norm": 0.2033187299966812, "learning_rate": 2.3997133158930145e-05, "loss": 0.5979, "step": 1014 }, { "epoch": 0.28147531891292293, "grad_norm": 0.1880473643541336, "learning_rate": 2.3994965034657946e-05, "loss": 0.5472, "step": 1015 }, { "epoch": 0.2817526344980588, "grad_norm": 0.19091346859931946, "learning_rate": 2.3992794667413514e-05, "loss": 0.5698, "step": 1016 }, { "epoch": 0.2820299500831947, "grad_norm": 0.19986368715763092, "learning_rate": 2.399062205762035e-05, "loss": 0.5956, "step": 1017 }, { "epoch": 0.2823072656683306, "grad_norm": 0.19207067787647247, "learning_rate": 2.398844720570238e-05, "loss": 0.549, "step": 1018 }, { "epoch": 0.28258458125346647, "grad_norm": 0.19408905506134033, "learning_rate": 2.398627011208398e-05, "loss": 0.5938, "step": 1019 }, { "epoch": 0.28286189683860236, "grad_norm": 0.20354047417640686, "learning_rate": 2.398409077718996e-05, "loss": 0.5488, "step": 1020 }, { "epoch": 0.28313921242373824, "grad_norm": 0.19870953261852264, "learning_rate": 2.3981909201445563e-05, "loss": 0.565, "step": 1021 }, { "epoch": 0.2834165280088741, "grad_norm": 0.19812439382076263, "learning_rate": 2.3979725385276475e-05, "loss": 0.5455, "step": 1022 }, { "epoch": 0.28369384359401, "grad_norm": 0.1886581927537918, "learning_rate": 2.3977539329108813e-05, "loss": 0.5458, "step": 1023 }, { "epoch": 0.2839711591791459, "grad_norm": 0.1987125426530838, "learning_rate": 2.3975351033369138e-05, "loss": 0.5669, "step": 1024 }, { "epoch": 0.2842484747642818, "grad_norm": 0.22528746724128723, "learning_rate": 2.397316049848444e-05, "loss": 0.579, "step": 1025 }, { "epoch": 0.28452579034941766, "grad_norm": 0.20103305578231812, "learning_rate": 2.3970967724882154e-05, "loss": 0.5843, "step": 1026 }, { "epoch": 0.28480310593455355, "grad_norm": 0.21639437973499298, "learning_rate": 2.396877271299015e-05, "loss": 0.5709, "step": 1027 }, { "epoch": 0.28508042151968943, "grad_norm": 0.24263976514339447, "learning_rate": 2.3966575463236725e-05, "loss": 0.5926, "step": 1028 }, { "epoch": 0.2853577371048253, "grad_norm": 0.19815106689929962, "learning_rate": 2.396437597605063e-05, "loss": 0.5722, "step": 1029 }, { "epoch": 0.2856350526899612, "grad_norm": 0.19260184466838837, "learning_rate": 2.396217425186104e-05, "loss": 0.5621, "step": 1030 }, { "epoch": 0.2859123682750971, "grad_norm": 0.20668724179267883, "learning_rate": 2.3959970291097566e-05, "loss": 0.5924, "step": 1031 }, { "epoch": 0.28618968386023297, "grad_norm": 0.2019844353199005, "learning_rate": 2.3957764094190265e-05, "loss": 0.5821, "step": 1032 }, { "epoch": 0.28646699944536885, "grad_norm": 0.1919315904378891, "learning_rate": 2.3955555661569617e-05, "loss": 0.5985, "step": 1033 }, { "epoch": 0.28674431503050474, "grad_norm": 0.19516989588737488, "learning_rate": 2.3953344993666555e-05, "loss": 0.5993, "step": 1034 }, { "epoch": 0.2870216306156406, "grad_norm": 0.18881164491176605, "learning_rate": 2.3951132090912432e-05, "loss": 0.5911, "step": 1035 }, { "epoch": 0.2872989462007765, "grad_norm": 0.20282314717769623, "learning_rate": 2.3948916953739045e-05, "loss": 0.5939, "step": 1036 }, { "epoch": 0.2875762617859124, "grad_norm": 0.19400392472743988, "learning_rate": 2.394669958257863e-05, "loss": 0.5993, "step": 1037 }, { "epoch": 0.2878535773710483, "grad_norm": 0.21767151355743408, "learning_rate": 2.3944479977863847e-05, "loss": 0.5796, "step": 1038 }, { "epoch": 0.28813089295618416, "grad_norm": 0.21109527349472046, "learning_rate": 2.3942258140027805e-05, "loss": 0.5678, "step": 1039 }, { "epoch": 0.28840820854132004, "grad_norm": 0.19805195927619934, "learning_rate": 2.3940034069504048e-05, "loss": 0.5645, "step": 1040 }, { "epoch": 0.28868552412645593, "grad_norm": 0.18621553480625153, "learning_rate": 2.3937807766726545e-05, "loss": 0.5639, "step": 1041 }, { "epoch": 0.2889628397115918, "grad_norm": 0.19867920875549316, "learning_rate": 2.3935579232129705e-05, "loss": 0.578, "step": 1042 }, { "epoch": 0.2892401552967277, "grad_norm": 0.1965349018573761, "learning_rate": 2.393334846614838e-05, "loss": 0.5895, "step": 1043 }, { "epoch": 0.2895174708818636, "grad_norm": 0.21346546709537506, "learning_rate": 2.3931115469217848e-05, "loss": 0.5592, "step": 1044 }, { "epoch": 0.28979478646699947, "grad_norm": 0.19675497710704803, "learning_rate": 2.392888024177382e-05, "loss": 0.5709, "step": 1045 }, { "epoch": 0.29007210205213535, "grad_norm": 0.18906576931476593, "learning_rate": 2.392664278425246e-05, "loss": 0.5721, "step": 1046 }, { "epoch": 0.29034941763727123, "grad_norm": 0.1987171620130539, "learning_rate": 2.3924403097090348e-05, "loss": 0.5796, "step": 1047 }, { "epoch": 0.2906267332224071, "grad_norm": 0.20017191767692566, "learning_rate": 2.392216118072451e-05, "loss": 0.5803, "step": 1048 }, { "epoch": 0.290904048807543, "grad_norm": 0.1904933750629425, "learning_rate": 2.39199170355924e-05, "loss": 0.5847, "step": 1049 }, { "epoch": 0.2911813643926789, "grad_norm": 0.20761634409427643, "learning_rate": 2.3917670662131914e-05, "loss": 0.5678, "step": 1050 }, { "epoch": 0.29145867997781477, "grad_norm": 0.1986733376979828, "learning_rate": 2.391542206078137e-05, "loss": 0.5886, "step": 1051 }, { "epoch": 0.29173599556295066, "grad_norm": 0.2128080427646637, "learning_rate": 2.3913171231979543e-05, "loss": 0.5649, "step": 1052 }, { "epoch": 0.29201331114808654, "grad_norm": 0.1908857524394989, "learning_rate": 2.391091817616562e-05, "loss": 0.5951, "step": 1053 }, { "epoch": 0.2922906267332224, "grad_norm": 0.19354763627052307, "learning_rate": 2.3908662893779228e-05, "loss": 0.5878, "step": 1054 }, { "epoch": 0.2925679423183583, "grad_norm": 0.19191789627075195, "learning_rate": 2.3906405385260443e-05, "loss": 0.5842, "step": 1055 }, { "epoch": 0.2928452579034942, "grad_norm": 0.19070293009281158, "learning_rate": 2.3904145651049764e-05, "loss": 0.5707, "step": 1056 }, { "epoch": 0.2931225734886301, "grad_norm": 0.20022979378700256, "learning_rate": 2.3901883691588116e-05, "loss": 0.5838, "step": 1057 }, { "epoch": 0.29339988907376596, "grad_norm": 0.18544460833072662, "learning_rate": 2.3899619507316878e-05, "loss": 0.5732, "step": 1058 }, { "epoch": 0.29367720465890185, "grad_norm": 0.19487732648849487, "learning_rate": 2.3897353098677845e-05, "loss": 0.5681, "step": 1059 }, { "epoch": 0.29395452024403773, "grad_norm": 0.19073253870010376, "learning_rate": 2.3895084466113253e-05, "loss": 0.5638, "step": 1060 }, { "epoch": 0.2942318358291736, "grad_norm": 0.19357110559940338, "learning_rate": 2.3892813610065778e-05, "loss": 0.5617, "step": 1061 }, { "epoch": 0.2945091514143095, "grad_norm": 0.19819645583629608, "learning_rate": 2.3890540530978518e-05, "loss": 0.5601, "step": 1062 }, { "epoch": 0.2947864669994454, "grad_norm": 0.20150421559810638, "learning_rate": 2.3888265229295014e-05, "loss": 0.5675, "step": 1063 }, { "epoch": 0.29506378258458127, "grad_norm": 0.19517284631729126, "learning_rate": 2.388598770545924e-05, "loss": 0.5516, "step": 1064 }, { "epoch": 0.29534109816971715, "grad_norm": 0.2073058784008026, "learning_rate": 2.3883707959915594e-05, "loss": 0.5729, "step": 1065 }, { "epoch": 0.29561841375485304, "grad_norm": 0.190653994679451, "learning_rate": 2.3881425993108922e-05, "loss": 0.5932, "step": 1066 }, { "epoch": 0.2958957293399889, "grad_norm": 0.19685585796833038, "learning_rate": 2.3879141805484492e-05, "loss": 0.5579, "step": 1067 }, { "epoch": 0.2961730449251248, "grad_norm": 0.19154155254364014, "learning_rate": 2.3876855397488014e-05, "loss": 0.5622, "step": 1068 }, { "epoch": 0.2964503605102607, "grad_norm": 0.23048245906829834, "learning_rate": 2.387456676956562e-05, "loss": 0.5951, "step": 1069 }, { "epoch": 0.2967276760953966, "grad_norm": 0.2001733034849167, "learning_rate": 2.387227592216389e-05, "loss": 0.5723, "step": 1070 }, { "epoch": 0.29700499168053246, "grad_norm": 0.20289377868175507, "learning_rate": 2.3869982855729822e-05, "loss": 0.6023, "step": 1071 }, { "epoch": 0.29728230726566834, "grad_norm": 0.18517981469631195, "learning_rate": 2.386768757071086e-05, "loss": 0.6075, "step": 1072 }, { "epoch": 0.29755962285080423, "grad_norm": 0.18674753606319427, "learning_rate": 2.3865390067554865e-05, "loss": 0.5725, "step": 1073 }, { "epoch": 0.2978369384359401, "grad_norm": 0.19436419010162354, "learning_rate": 2.3863090346710153e-05, "loss": 0.595, "step": 1074 }, { "epoch": 0.298114254021076, "grad_norm": 0.1953345537185669, "learning_rate": 2.3860788408625456e-05, "loss": 0.5845, "step": 1075 }, { "epoch": 0.2983915696062119, "grad_norm": 0.23673701286315918, "learning_rate": 2.385848425374994e-05, "loss": 0.5716, "step": 1076 }, { "epoch": 0.29866888519134777, "grad_norm": 0.20462685823440552, "learning_rate": 2.385617788253321e-05, "loss": 0.5486, "step": 1077 }, { "epoch": 0.29894620077648365, "grad_norm": 0.19857102632522583, "learning_rate": 2.3853869295425296e-05, "loss": 0.5744, "step": 1078 }, { "epoch": 0.29922351636161953, "grad_norm": 0.21950219571590424, "learning_rate": 2.385155849287667e-05, "loss": 0.5644, "step": 1079 }, { "epoch": 0.2995008319467554, "grad_norm": 0.25962188839912415, "learning_rate": 2.384924547533823e-05, "loss": 0.5826, "step": 1080 }, { "epoch": 0.2997781475318913, "grad_norm": 0.21206709742546082, "learning_rate": 2.3846930243261302e-05, "loss": 0.5726, "step": 1081 }, { "epoch": 0.3000554631170272, "grad_norm": 0.20144885778427124, "learning_rate": 2.384461279709765e-05, "loss": 0.5736, "step": 1082 }, { "epoch": 0.3003327787021631, "grad_norm": 0.19098758697509766, "learning_rate": 2.3842293137299475e-05, "loss": 0.57, "step": 1083 }, { "epoch": 0.30061009428729896, "grad_norm": 0.20457082986831665, "learning_rate": 2.38399712643194e-05, "loss": 0.5527, "step": 1084 }, { "epoch": 0.30088740987243484, "grad_norm": 0.22459912300109863, "learning_rate": 2.3837647178610482e-05, "loss": 0.5907, "step": 1085 }, { "epoch": 0.3011647254575707, "grad_norm": 0.2092091143131256, "learning_rate": 2.3835320880626216e-05, "loss": 0.5796, "step": 1086 }, { "epoch": 0.3014420410427066, "grad_norm": 0.18621404469013214, "learning_rate": 2.3832992370820523e-05, "loss": 0.6085, "step": 1087 }, { "epoch": 0.3017193566278425, "grad_norm": 0.1938057541847229, "learning_rate": 2.3830661649647757e-05, "loss": 0.5642, "step": 1088 }, { "epoch": 0.3019966722129784, "grad_norm": 0.2136821299791336, "learning_rate": 2.3828328717562704e-05, "loss": 0.5621, "step": 1089 }, { "epoch": 0.30227398779811426, "grad_norm": 0.200357124209404, "learning_rate": 2.3825993575020577e-05, "loss": 0.5728, "step": 1090 }, { "epoch": 0.30255130338325015, "grad_norm": 0.19953665137290955, "learning_rate": 2.382365622247703e-05, "loss": 0.5677, "step": 1091 }, { "epoch": 0.30282861896838603, "grad_norm": 0.19972378015518188, "learning_rate": 2.382131666038814e-05, "loss": 0.5623, "step": 1092 }, { "epoch": 0.3031059345535219, "grad_norm": 0.21744661033153534, "learning_rate": 2.381897488921041e-05, "loss": 0.5508, "step": 1093 }, { "epoch": 0.3033832501386578, "grad_norm": 0.18918602168560028, "learning_rate": 2.3816630909400793e-05, "loss": 0.5805, "step": 1094 }, { "epoch": 0.3036605657237937, "grad_norm": 0.1894434541463852, "learning_rate": 2.3814284721416656e-05, "loss": 0.563, "step": 1095 }, { "epoch": 0.30393788130892957, "grad_norm": 0.19141651690006256, "learning_rate": 2.3811936325715807e-05, "loss": 0.5839, "step": 1096 }, { "epoch": 0.30421519689406545, "grad_norm": 0.1895507425069809, "learning_rate": 2.3809585722756472e-05, "loss": 0.5864, "step": 1097 }, { "epoch": 0.30449251247920134, "grad_norm": 0.21037045121192932, "learning_rate": 2.3807232912997324e-05, "loss": 0.5806, "step": 1098 }, { "epoch": 0.3047698280643372, "grad_norm": 0.20355623960494995, "learning_rate": 2.3804877896897455e-05, "loss": 0.5677, "step": 1099 }, { "epoch": 0.3050471436494731, "grad_norm": 0.20115728676319122, "learning_rate": 2.380252067491639e-05, "loss": 0.5635, "step": 1100 }, { "epoch": 0.305324459234609, "grad_norm": 0.19460982084274292, "learning_rate": 2.3800161247514086e-05, "loss": 0.5774, "step": 1101 }, { "epoch": 0.3056017748197449, "grad_norm": 0.18929602205753326, "learning_rate": 2.3797799615150934e-05, "loss": 0.5708, "step": 1102 }, { "epoch": 0.30587909040488076, "grad_norm": 0.20288972556591034, "learning_rate": 2.3795435778287745e-05, "loss": 0.5852, "step": 1103 }, { "epoch": 0.30615640599001664, "grad_norm": 0.20964893698692322, "learning_rate": 2.379306973738577e-05, "loss": 0.5731, "step": 1104 }, { "epoch": 0.30643372157515253, "grad_norm": 0.2250620722770691, "learning_rate": 2.379070149290668e-05, "loss": 0.5741, "step": 1105 }, { "epoch": 0.3067110371602884, "grad_norm": 0.19434000551700592, "learning_rate": 2.3788331045312592e-05, "loss": 0.5949, "step": 1106 }, { "epoch": 0.3069883527454243, "grad_norm": 0.1980692446231842, "learning_rate": 2.3785958395066037e-05, "loss": 0.5591, "step": 1107 }, { "epoch": 0.3072656683305602, "grad_norm": 0.20279406011104584, "learning_rate": 2.3783583542629984e-05, "loss": 0.5925, "step": 1108 }, { "epoch": 0.30754298391569607, "grad_norm": 0.19802772998809814, "learning_rate": 2.378120648846783e-05, "loss": 0.5756, "step": 1109 }, { "epoch": 0.30782029950083195, "grad_norm": 0.19455523788928986, "learning_rate": 2.37788272330434e-05, "loss": 0.5846, "step": 1110 }, { "epoch": 0.30809761508596784, "grad_norm": 0.19493591785430908, "learning_rate": 2.3776445776820948e-05, "loss": 0.5788, "step": 1111 }, { "epoch": 0.3083749306711037, "grad_norm": 0.19869014620780945, "learning_rate": 2.3774062120265163e-05, "loss": 0.5836, "step": 1112 }, { "epoch": 0.3086522462562396, "grad_norm": 0.19301962852478027, "learning_rate": 2.3771676263841157e-05, "loss": 0.5689, "step": 1113 }, { "epoch": 0.3089295618413755, "grad_norm": 0.2125353217124939, "learning_rate": 2.3769288208014473e-05, "loss": 0.579, "step": 1114 }, { "epoch": 0.3092068774265114, "grad_norm": 0.20093408226966858, "learning_rate": 2.376689795325109e-05, "loss": 0.5489, "step": 1115 }, { "epoch": 0.30948419301164726, "grad_norm": 0.20457975566387177, "learning_rate": 2.37645055000174e-05, "loss": 0.5734, "step": 1116 }, { "epoch": 0.30976150859678314, "grad_norm": 0.2070866972208023, "learning_rate": 2.376211084878024e-05, "loss": 0.5711, "step": 1117 }, { "epoch": 0.310038824181919, "grad_norm": 0.19698989391326904, "learning_rate": 2.375971400000687e-05, "loss": 0.5781, "step": 1118 }, { "epoch": 0.3103161397670549, "grad_norm": 0.19639791548252106, "learning_rate": 2.3757314954164982e-05, "loss": 0.5496, "step": 1119 }, { "epoch": 0.3105934553521908, "grad_norm": 0.19890196621418, "learning_rate": 2.3754913711722687e-05, "loss": 0.5658, "step": 1120 }, { "epoch": 0.3108707709373267, "grad_norm": 0.2007942795753479, "learning_rate": 2.3752510273148533e-05, "loss": 0.5679, "step": 1121 }, { "epoch": 0.31114808652246256, "grad_norm": 0.21742717921733856, "learning_rate": 2.3750104638911493e-05, "loss": 0.5603, "step": 1122 }, { "epoch": 0.31142540210759845, "grad_norm": 0.19708824157714844, "learning_rate": 2.3747696809480974e-05, "loss": 0.561, "step": 1123 }, { "epoch": 0.31170271769273433, "grad_norm": 0.20064733922481537, "learning_rate": 2.374528678532681e-05, "loss": 0.5535, "step": 1124 }, { "epoch": 0.3119800332778702, "grad_norm": 0.19142742455005646, "learning_rate": 2.3742874566919248e-05, "loss": 0.5621, "step": 1125 }, { "epoch": 0.3122573488630061, "grad_norm": 0.20165249705314636, "learning_rate": 2.3740460154728987e-05, "loss": 0.5862, "step": 1126 }, { "epoch": 0.312534664448142, "grad_norm": 0.19451411068439484, "learning_rate": 2.373804354922714e-05, "loss": 0.5611, "step": 1127 }, { "epoch": 0.31281198003327787, "grad_norm": 0.20111672580242157, "learning_rate": 2.373562475088525e-05, "loss": 0.5629, "step": 1128 }, { "epoch": 0.31308929561841375, "grad_norm": 0.21810267865657806, "learning_rate": 2.3733203760175292e-05, "loss": 0.5746, "step": 1129 }, { "epoch": 0.31336661120354964, "grad_norm": 0.20461341738700867, "learning_rate": 2.3730780577569654e-05, "loss": 0.5429, "step": 1130 }, { "epoch": 0.3136439267886855, "grad_norm": 0.18515266478061676, "learning_rate": 2.3728355203541182e-05, "loss": 0.5547, "step": 1131 }, { "epoch": 0.3139212423738214, "grad_norm": 0.1941545456647873, "learning_rate": 2.3725927638563112e-05, "loss": 0.5566, "step": 1132 }, { "epoch": 0.3141985579589573, "grad_norm": 0.20392583310604095, "learning_rate": 2.3723497883109137e-05, "loss": 0.5506, "step": 1133 }, { "epoch": 0.3144758735440932, "grad_norm": 0.19422155618667603, "learning_rate": 2.3721065937653363e-05, "loss": 0.5826, "step": 1134 }, { "epoch": 0.31475318912922906, "grad_norm": 0.19887231290340424, "learning_rate": 2.3718631802670334e-05, "loss": 0.5651, "step": 1135 }, { "epoch": 0.31503050471436495, "grad_norm": 0.20396895706653595, "learning_rate": 2.3716195478635e-05, "loss": 0.5396, "step": 1136 }, { "epoch": 0.31530782029950083, "grad_norm": 0.2106340378522873, "learning_rate": 2.3713756966022766e-05, "loss": 0.552, "step": 1137 }, { "epoch": 0.3155851358846367, "grad_norm": 0.20762008428573608, "learning_rate": 2.371131626530944e-05, "loss": 0.5942, "step": 1138 }, { "epoch": 0.3158624514697726, "grad_norm": 0.19259460270404816, "learning_rate": 2.3708873376971277e-05, "loss": 0.5364, "step": 1139 }, { "epoch": 0.3161397670549085, "grad_norm": 0.19473238289356232, "learning_rate": 2.3706428301484946e-05, "loss": 0.5862, "step": 1140 }, { "epoch": 0.31641708264004437, "grad_norm": 0.20146487653255463, "learning_rate": 2.370398103932754e-05, "loss": 0.5924, "step": 1141 }, { "epoch": 0.31669439822518025, "grad_norm": 0.19463180005550385, "learning_rate": 2.370153159097659e-05, "loss": 0.554, "step": 1142 }, { "epoch": 0.31697171381031614, "grad_norm": 0.22867369651794434, "learning_rate": 2.3699079956910052e-05, "loss": 0.58, "step": 1143 }, { "epoch": 0.317249029395452, "grad_norm": 0.1938031017780304, "learning_rate": 2.3696626137606297e-05, "loss": 0.5998, "step": 1144 }, { "epoch": 0.3175263449805879, "grad_norm": 0.1907264143228531, "learning_rate": 2.369417013354413e-05, "loss": 0.5601, "step": 1145 }, { "epoch": 0.3178036605657238, "grad_norm": 0.21385140717029572, "learning_rate": 2.369171194520279e-05, "loss": 0.5919, "step": 1146 }, { "epoch": 0.3180809761508597, "grad_norm": 0.23154176771640778, "learning_rate": 2.3689251573061932e-05, "loss": 0.5901, "step": 1147 }, { "epoch": 0.31835829173599556, "grad_norm": 0.19392211735248566, "learning_rate": 2.3686789017601634e-05, "loss": 0.5514, "step": 1148 }, { "epoch": 0.31863560732113144, "grad_norm": 0.19515374302864075, "learning_rate": 2.3684324279302418e-05, "loss": 0.558, "step": 1149 }, { "epoch": 0.3189129229062673, "grad_norm": 0.18750956654548645, "learning_rate": 2.3681857358645205e-05, "loss": 0.5544, "step": 1150 }, { "epoch": 0.3191902384914032, "grad_norm": 0.20247051119804382, "learning_rate": 2.3679388256111368e-05, "loss": 0.5724, "step": 1151 }, { "epoch": 0.3194675540765391, "grad_norm": 0.19525887072086334, "learning_rate": 2.3676916972182686e-05, "loss": 0.583, "step": 1152 }, { "epoch": 0.319744869661675, "grad_norm": 0.20457209646701813, "learning_rate": 2.3674443507341377e-05, "loss": 0.5592, "step": 1153 }, { "epoch": 0.32002218524681086, "grad_norm": 0.19369947910308838, "learning_rate": 2.367196786207008e-05, "loss": 0.5889, "step": 1154 }, { "epoch": 0.32029950083194675, "grad_norm": 0.18851204216480255, "learning_rate": 2.3669490036851856e-05, "loss": 0.5804, "step": 1155 }, { "epoch": 0.32057681641708263, "grad_norm": 0.1838391125202179, "learning_rate": 2.3667010032170196e-05, "loss": 0.5251, "step": 1156 }, { "epoch": 0.3208541320022185, "grad_norm": 0.1949407011270523, "learning_rate": 2.3664527848509015e-05, "loss": 0.5719, "step": 1157 }, { "epoch": 0.3211314475873544, "grad_norm": 0.2245536744594574, "learning_rate": 2.3662043486352653e-05, "loss": 0.5699, "step": 1158 }, { "epoch": 0.3214087631724903, "grad_norm": 0.18889588117599487, "learning_rate": 2.3659556946185875e-05, "loss": 0.5449, "step": 1159 }, { "epoch": 0.32168607875762617, "grad_norm": 0.18529140949249268, "learning_rate": 2.3657068228493863e-05, "loss": 0.5469, "step": 1160 }, { "epoch": 0.32196339434276205, "grad_norm": 0.19042176008224487, "learning_rate": 2.3654577333762246e-05, "loss": 0.5662, "step": 1161 }, { "epoch": 0.32224070992789794, "grad_norm": 0.22052356600761414, "learning_rate": 2.3652084262477055e-05, "loss": 0.5603, "step": 1162 }, { "epoch": 0.3225180255130338, "grad_norm": 0.19664537906646729, "learning_rate": 2.364958901512475e-05, "loss": 0.5837, "step": 1163 }, { "epoch": 0.3227953410981697, "grad_norm": 0.1836601048707962, "learning_rate": 2.3647091592192234e-05, "loss": 0.5626, "step": 1164 }, { "epoch": 0.3230726566833056, "grad_norm": 0.19754944741725922, "learning_rate": 2.3644591994166805e-05, "loss": 0.5867, "step": 1165 }, { "epoch": 0.3233499722684415, "grad_norm": 0.18801425397396088, "learning_rate": 2.364209022153621e-05, "loss": 0.5447, "step": 1166 }, { "epoch": 0.32362728785357736, "grad_norm": 0.19709810614585876, "learning_rate": 2.363958627478861e-05, "loss": 0.5953, "step": 1167 }, { "epoch": 0.32390460343871325, "grad_norm": 0.19867432117462158, "learning_rate": 2.3637080154412588e-05, "loss": 0.6041, "step": 1168 }, { "epoch": 0.32418191902384913, "grad_norm": 0.19217659533023834, "learning_rate": 2.363457186089716e-05, "loss": 0.5693, "step": 1169 }, { "epoch": 0.324459234608985, "grad_norm": 0.20366674661636353, "learning_rate": 2.3632061394731753e-05, "loss": 0.5957, "step": 1170 }, { "epoch": 0.3247365501941209, "grad_norm": 0.19761385023593903, "learning_rate": 2.362954875640623e-05, "loss": 0.5722, "step": 1171 }, { "epoch": 0.3250138657792568, "grad_norm": 0.20664581656455994, "learning_rate": 2.362703394641087e-05, "loss": 0.5763, "step": 1172 }, { "epoch": 0.32529118136439267, "grad_norm": 0.18691927194595337, "learning_rate": 2.3624516965236386e-05, "loss": 0.5707, "step": 1173 }, { "epoch": 0.32556849694952855, "grad_norm": 0.18426918983459473, "learning_rate": 2.36219978133739e-05, "loss": 0.5337, "step": 1174 }, { "epoch": 0.32584581253466444, "grad_norm": 0.19540777802467346, "learning_rate": 2.3619476491314977e-05, "loss": 0.5549, "step": 1175 }, { "epoch": 0.3261231281198003, "grad_norm": 0.2000686079263687, "learning_rate": 2.3616952999551576e-05, "loss": 0.5765, "step": 1176 }, { "epoch": 0.3264004437049362, "grad_norm": 0.19267399609088898, "learning_rate": 2.3614427338576114e-05, "loss": 0.5585, "step": 1177 }, { "epoch": 0.3266777592900721, "grad_norm": 0.21964752674102783, "learning_rate": 2.3611899508881403e-05, "loss": 0.5742, "step": 1178 }, { "epoch": 0.326955074875208, "grad_norm": 0.21694490313529968, "learning_rate": 2.3609369510960696e-05, "loss": 0.5586, "step": 1179 }, { "epoch": 0.32723239046034386, "grad_norm": 0.2035355418920517, "learning_rate": 2.360683734530766e-05, "loss": 0.5712, "step": 1180 }, { "epoch": 0.32750970604547974, "grad_norm": 0.1984809935092926, "learning_rate": 2.3604303012416383e-05, "loss": 0.5846, "step": 1181 }, { "epoch": 0.3277870216306156, "grad_norm": 0.19246408343315125, "learning_rate": 2.360176651278139e-05, "loss": 0.5931, "step": 1182 }, { "epoch": 0.3280643372157515, "grad_norm": 0.18938469886779785, "learning_rate": 2.3599227846897615e-05, "loss": 0.5722, "step": 1183 }, { "epoch": 0.3283416528008874, "grad_norm": 0.1882466822862625, "learning_rate": 2.359668701526042e-05, "loss": 0.5693, "step": 1184 }, { "epoch": 0.3286189683860233, "grad_norm": 0.18115603923797607, "learning_rate": 2.3594144018365584e-05, "loss": 0.5694, "step": 1185 }, { "epoch": 0.32889628397115916, "grad_norm": 0.21563617885112762, "learning_rate": 2.3591598856709317e-05, "loss": 0.5541, "step": 1186 }, { "epoch": 0.32917359955629505, "grad_norm": 0.19328701496124268, "learning_rate": 2.3589051530788246e-05, "loss": 0.5469, "step": 1187 }, { "epoch": 0.32945091514143093, "grad_norm": 0.19828177988529205, "learning_rate": 2.358650204109942e-05, "loss": 0.5785, "step": 1188 }, { "epoch": 0.3297282307265668, "grad_norm": 0.1821170300245285, "learning_rate": 2.358395038814032e-05, "loss": 0.5727, "step": 1189 }, { "epoch": 0.3300055463117027, "grad_norm": 0.19181084632873535, "learning_rate": 2.3581396572408833e-05, "loss": 0.5699, "step": 1190 }, { "epoch": 0.3302828618968386, "grad_norm": 0.18721237778663635, "learning_rate": 2.3578840594403275e-05, "loss": 0.5786, "step": 1191 }, { "epoch": 0.33056017748197447, "grad_norm": 0.2080426812171936, "learning_rate": 2.3576282454622394e-05, "loss": 0.6082, "step": 1192 }, { "epoch": 0.33083749306711036, "grad_norm": 0.19874081015586853, "learning_rate": 2.3573722153565343e-05, "loss": 0.5802, "step": 1193 }, { "epoch": 0.33111480865224624, "grad_norm": 0.20292651653289795, "learning_rate": 2.357115969173171e-05, "loss": 0.5562, "step": 1194 }, { "epoch": 0.3313921242373821, "grad_norm": 0.1992533951997757, "learning_rate": 2.356859506962149e-05, "loss": 0.5491, "step": 1195 }, { "epoch": 0.331669439822518, "grad_norm": 0.19271281361579895, "learning_rate": 2.356602828773512e-05, "loss": 0.5713, "step": 1196 }, { "epoch": 0.3319467554076539, "grad_norm": 0.19548431038856506, "learning_rate": 2.356345934657344e-05, "loss": 0.5455, "step": 1197 }, { "epoch": 0.3322240709927898, "grad_norm": 0.19355355203151703, "learning_rate": 2.3560888246637726e-05, "loss": 0.576, "step": 1198 }, { "epoch": 0.33250138657792566, "grad_norm": 0.19047684967517853, "learning_rate": 2.3558314988429657e-05, "loss": 0.5569, "step": 1199 }, { "epoch": 0.33277870216306155, "grad_norm": 0.19171833992004395, "learning_rate": 2.3555739572451353e-05, "loss": 0.5974, "step": 1200 }, { "epoch": 0.33305601774819743, "grad_norm": 0.18777206540107727, "learning_rate": 2.3553161999205337e-05, "loss": 0.5614, "step": 1201 }, { "epoch": 0.3333333333333333, "grad_norm": 0.1937248855829239, "learning_rate": 2.355058226919457e-05, "loss": 0.5725, "step": 1202 }, { "epoch": 0.3336106489184692, "grad_norm": 0.19912956655025482, "learning_rate": 2.3548000382922422e-05, "loss": 0.5595, "step": 1203 }, { "epoch": 0.3338879645036051, "grad_norm": 0.19780032336711884, "learning_rate": 2.354541634089269e-05, "loss": 0.5766, "step": 1204 }, { "epoch": 0.33416528008874097, "grad_norm": 0.19286835193634033, "learning_rate": 2.3542830143609584e-05, "loss": 0.5529, "step": 1205 }, { "epoch": 0.33444259567387685, "grad_norm": 0.19875440001487732, "learning_rate": 2.3540241791577745e-05, "loss": 0.5616, "step": 1206 }, { "epoch": 0.33471991125901274, "grad_norm": 0.20458170771598816, "learning_rate": 2.3537651285302224e-05, "loss": 0.5754, "step": 1207 }, { "epoch": 0.3349972268441486, "grad_norm": 0.19896847009658813, "learning_rate": 2.3535058625288503e-05, "loss": 0.586, "step": 1208 }, { "epoch": 0.3352745424292845, "grad_norm": 0.2005191296339035, "learning_rate": 2.3532463812042478e-05, "loss": 0.57, "step": 1209 }, { "epoch": 0.3355518580144204, "grad_norm": 0.19629941880702972, "learning_rate": 2.3529866846070457e-05, "loss": 0.5632, "step": 1210 }, { "epoch": 0.3358291735995563, "grad_norm": 0.2009962797164917, "learning_rate": 2.3527267727879187e-05, "loss": 0.5926, "step": 1211 }, { "epoch": 0.33610648918469216, "grad_norm": 0.18961815536022186, "learning_rate": 2.3524666457975826e-05, "loss": 0.566, "step": 1212 }, { "epoch": 0.33638380476982804, "grad_norm": 0.19457431137561798, "learning_rate": 2.3522063036867938e-05, "loss": 0.5302, "step": 1213 }, { "epoch": 0.3366611203549639, "grad_norm": 0.20213200151920319, "learning_rate": 2.351945746506353e-05, "loss": 0.5726, "step": 1214 }, { "epoch": 0.3369384359400998, "grad_norm": 0.1976713389158249, "learning_rate": 2.351684974307102e-05, "loss": 0.5425, "step": 1215 }, { "epoch": 0.3372157515252357, "grad_norm": 0.19058452546596527, "learning_rate": 2.3514239871399235e-05, "loss": 0.5695, "step": 1216 }, { "epoch": 0.3374930671103716, "grad_norm": 0.18397174775600433, "learning_rate": 2.3511627850557432e-05, "loss": 0.535, "step": 1217 }, { "epoch": 0.33777038269550747, "grad_norm": 0.2004639059305191, "learning_rate": 2.3509013681055293e-05, "loss": 0.5698, "step": 1218 }, { "epoch": 0.33804769828064335, "grad_norm": 0.1970200389623642, "learning_rate": 2.3506397363402905e-05, "loss": 0.5831, "step": 1219 }, { "epoch": 0.33832501386577923, "grad_norm": 0.24607634544372559, "learning_rate": 2.3503778898110782e-05, "loss": 0.6011, "step": 1220 }, { "epoch": 0.3386023294509151, "grad_norm": 0.1974974125623703, "learning_rate": 2.3501158285689857e-05, "loss": 0.5402, "step": 1221 }, { "epoch": 0.338879645036051, "grad_norm": 0.20804527401924133, "learning_rate": 2.3498535526651478e-05, "loss": 0.5804, "step": 1222 }, { "epoch": 0.3391569606211869, "grad_norm": 0.21283218264579773, "learning_rate": 2.349591062150742e-05, "loss": 0.5498, "step": 1223 }, { "epoch": 0.33943427620632277, "grad_norm": 0.20082467794418335, "learning_rate": 2.3493283570769863e-05, "loss": 0.5589, "step": 1224 }, { "epoch": 0.33971159179145866, "grad_norm": 0.1998477727174759, "learning_rate": 2.3490654374951426e-05, "loss": 0.5496, "step": 1225 }, { "epoch": 0.33998890737659454, "grad_norm": 0.18160581588745117, "learning_rate": 2.3488023034565127e-05, "loss": 0.569, "step": 1226 }, { "epoch": 0.3402662229617304, "grad_norm": 0.18750114738941193, "learning_rate": 2.3485389550124413e-05, "loss": 0.5768, "step": 1227 }, { "epoch": 0.3405435385468663, "grad_norm": 0.19347476959228516, "learning_rate": 2.3482753922143143e-05, "loss": 0.5354, "step": 1228 }, { "epoch": 0.3408208541320022, "grad_norm": 0.19742092490196228, "learning_rate": 2.34801161511356e-05, "loss": 0.5656, "step": 1229 }, { "epoch": 0.3410981697171381, "grad_norm": 0.19319242238998413, "learning_rate": 2.3477476237616487e-05, "loss": 0.5654, "step": 1230 }, { "epoch": 0.34137548530227396, "grad_norm": 0.18535441160202026, "learning_rate": 2.3474834182100914e-05, "loss": 0.5419, "step": 1231 }, { "epoch": 0.34165280088740985, "grad_norm": 0.2077123373746872, "learning_rate": 2.347218998510442e-05, "loss": 0.5719, "step": 1232 }, { "epoch": 0.34193011647254573, "grad_norm": 0.1924944818019867, "learning_rate": 2.3469543647142954e-05, "loss": 0.5493, "step": 1233 }, { "epoch": 0.3422074320576816, "grad_norm": 0.1943088173866272, "learning_rate": 2.3466895168732894e-05, "loss": 0.5418, "step": 1234 }, { "epoch": 0.3424847476428175, "grad_norm": 0.19199106097221375, "learning_rate": 2.3464244550391023e-05, "loss": 0.5463, "step": 1235 }, { "epoch": 0.3427620632279534, "grad_norm": 0.19151833653450012, "learning_rate": 2.3461591792634548e-05, "loss": 0.5595, "step": 1236 }, { "epoch": 0.34303937881308927, "grad_norm": 0.20270255208015442, "learning_rate": 2.3458936895981093e-05, "loss": 0.5963, "step": 1237 }, { "epoch": 0.34331669439822515, "grad_norm": 0.1867821365594864, "learning_rate": 2.3456279860948696e-05, "loss": 0.5716, "step": 1238 }, { "epoch": 0.34359400998336104, "grad_norm": 0.20384635031223297, "learning_rate": 2.3453620688055817e-05, "loss": 0.5726, "step": 1239 }, { "epoch": 0.343871325568497, "grad_norm": 0.19507844746112823, "learning_rate": 2.3450959377821334e-05, "loss": 0.5461, "step": 1240 }, { "epoch": 0.34414864115363286, "grad_norm": 0.18854713439941406, "learning_rate": 2.3448295930764536e-05, "loss": 0.5617, "step": 1241 }, { "epoch": 0.34442595673876875, "grad_norm": 0.1924966722726822, "learning_rate": 2.344563034740513e-05, "loss": 0.5716, "step": 1242 }, { "epoch": 0.34470327232390463, "grad_norm": 0.1878724843263626, "learning_rate": 2.3442962628263245e-05, "loss": 0.5588, "step": 1243 }, { "epoch": 0.3449805879090405, "grad_norm": 0.20984706282615662, "learning_rate": 2.3440292773859422e-05, "loss": 0.5623, "step": 1244 }, { "epoch": 0.3452579034941764, "grad_norm": 0.19068847596645355, "learning_rate": 2.343762078471462e-05, "loss": 0.5573, "step": 1245 }, { "epoch": 0.3455352190793123, "grad_norm": 0.1986820548772812, "learning_rate": 2.343494666135022e-05, "loss": 0.5799, "step": 1246 }, { "epoch": 0.34581253466444817, "grad_norm": 0.21772533655166626, "learning_rate": 2.343227040428801e-05, "loss": 0.5674, "step": 1247 }, { "epoch": 0.34608985024958405, "grad_norm": 0.20012469589710236, "learning_rate": 2.3429592014050198e-05, "loss": 0.5713, "step": 1248 }, { "epoch": 0.34636716583471994, "grad_norm": 0.21233108639717102, "learning_rate": 2.3426911491159408e-05, "loss": 0.5232, "step": 1249 }, { "epoch": 0.3466444814198558, "grad_norm": 0.19460223615169525, "learning_rate": 2.3424228836138686e-05, "loss": 0.5547, "step": 1250 }, { "epoch": 0.3469217970049917, "grad_norm": 0.18516409397125244, "learning_rate": 2.3421544049511484e-05, "loss": 0.5445, "step": 1251 }, { "epoch": 0.3471991125901276, "grad_norm": 0.20658938586711884, "learning_rate": 2.341885713180168e-05, "loss": 0.5551, "step": 1252 }, { "epoch": 0.3474764281752635, "grad_norm": 0.19541315734386444, "learning_rate": 2.3416168083533556e-05, "loss": 0.5608, "step": 1253 }, { "epoch": 0.34775374376039936, "grad_norm": 0.22885319590568542, "learning_rate": 2.3413476905231825e-05, "loss": 0.5836, "step": 1254 }, { "epoch": 0.34803105934553524, "grad_norm": 0.2285439521074295, "learning_rate": 2.3410783597421597e-05, "loss": 0.5444, "step": 1255 }, { "epoch": 0.3483083749306711, "grad_norm": 0.200783833861351, "learning_rate": 2.3408088160628422e-05, "loss": 0.5601, "step": 1256 }, { "epoch": 0.348585690515807, "grad_norm": 0.19225940108299255, "learning_rate": 2.3405390595378236e-05, "loss": 0.5602, "step": 1257 }, { "epoch": 0.3488630061009429, "grad_norm": 0.19936451315879822, "learning_rate": 2.340269090219741e-05, "loss": 0.5593, "step": 1258 }, { "epoch": 0.3491403216860788, "grad_norm": 0.20055221021175385, "learning_rate": 2.3399989081612732e-05, "loss": 0.568, "step": 1259 }, { "epoch": 0.34941763727121466, "grad_norm": 0.1941661238670349, "learning_rate": 2.3397285134151394e-05, "loss": 0.5688, "step": 1260 }, { "epoch": 0.34969495285635055, "grad_norm": 0.19209027290344238, "learning_rate": 2.3394579060341008e-05, "loss": 0.5635, "step": 1261 }, { "epoch": 0.34997226844148643, "grad_norm": 0.20127955079078674, "learning_rate": 2.33918708607096e-05, "loss": 0.5833, "step": 1262 }, { "epoch": 0.3502495840266223, "grad_norm": 0.20718038082122803, "learning_rate": 2.3389160535785612e-05, "loss": 0.5883, "step": 1263 }, { "epoch": 0.3505268996117582, "grad_norm": 0.19489611685276031, "learning_rate": 2.3386448086097902e-05, "loss": 0.5618, "step": 1264 }, { "epoch": 0.3508042151968941, "grad_norm": 0.19366618990898132, "learning_rate": 2.338373351217574e-05, "loss": 0.5813, "step": 1265 }, { "epoch": 0.35108153078202997, "grad_norm": 0.19742882251739502, "learning_rate": 2.3381016814548806e-05, "loss": 0.5866, "step": 1266 }, { "epoch": 0.35135884636716586, "grad_norm": 0.2014235556125641, "learning_rate": 2.337829799374721e-05, "loss": 0.5503, "step": 1267 }, { "epoch": 0.35163616195230174, "grad_norm": 0.19284577667713165, "learning_rate": 2.337557705030146e-05, "loss": 0.5683, "step": 1268 }, { "epoch": 0.3519134775374376, "grad_norm": 0.19096529483795166, "learning_rate": 2.3372853984742482e-05, "loss": 0.5535, "step": 1269 }, { "epoch": 0.3521907931225735, "grad_norm": 0.19602730870246887, "learning_rate": 2.337012879760162e-05, "loss": 0.5717, "step": 1270 }, { "epoch": 0.3524681087077094, "grad_norm": 0.19658134877681732, "learning_rate": 2.3367401489410635e-05, "loss": 0.5945, "step": 1271 }, { "epoch": 0.3527454242928453, "grad_norm": 0.20233921706676483, "learning_rate": 2.3364672060701688e-05, "loss": 0.5887, "step": 1272 }, { "epoch": 0.35302273987798116, "grad_norm": 0.20138201117515564, "learning_rate": 2.3361940512007368e-05, "loss": 0.5424, "step": 1273 }, { "epoch": 0.35330005546311705, "grad_norm": 0.1879206895828247, "learning_rate": 2.3359206843860675e-05, "loss": 0.546, "step": 1274 }, { "epoch": 0.35357737104825293, "grad_norm": 0.19074855744838715, "learning_rate": 2.335647105679502e-05, "loss": 0.5486, "step": 1275 }, { "epoch": 0.3538546866333888, "grad_norm": 0.2010781466960907, "learning_rate": 2.335373315134422e-05, "loss": 0.6002, "step": 1276 }, { "epoch": 0.3541320022185247, "grad_norm": 0.19462116062641144, "learning_rate": 2.3350993128042523e-05, "loss": 0.5937, "step": 1277 }, { "epoch": 0.3544093178036606, "grad_norm": 0.21550050377845764, "learning_rate": 2.3348250987424573e-05, "loss": 0.5783, "step": 1278 }, { "epoch": 0.35468663338879647, "grad_norm": 0.20210538804531097, "learning_rate": 2.3345506730025434e-05, "loss": 0.5784, "step": 1279 }, { "epoch": 0.35496394897393235, "grad_norm": 0.19225560128688812, "learning_rate": 2.3342760356380588e-05, "loss": 0.566, "step": 1280 }, { "epoch": 0.35524126455906824, "grad_norm": 0.2126346081495285, "learning_rate": 2.3340011867025924e-05, "loss": 0.5369, "step": 1281 }, { "epoch": 0.3555185801442041, "grad_norm": 0.19172579050064087, "learning_rate": 2.333726126249774e-05, "loss": 0.5736, "step": 1282 }, { "epoch": 0.35579589572934, "grad_norm": 0.20066869258880615, "learning_rate": 2.333450854333276e-05, "loss": 0.5538, "step": 1283 }, { "epoch": 0.3560732113144759, "grad_norm": 0.2043704241514206, "learning_rate": 2.3331753710068106e-05, "loss": 0.5682, "step": 1284 }, { "epoch": 0.3563505268996118, "grad_norm": 0.1963237076997757, "learning_rate": 2.3328996763241323e-05, "loss": 0.5342, "step": 1285 }, { "epoch": 0.35662784248474766, "grad_norm": 0.18448245525360107, "learning_rate": 2.332623770339036e-05, "loss": 0.5717, "step": 1286 }, { "epoch": 0.35690515806988354, "grad_norm": 0.1940702348947525, "learning_rate": 2.3323476531053587e-05, "loss": 0.5491, "step": 1287 }, { "epoch": 0.35718247365501943, "grad_norm": 0.1945616453886032, "learning_rate": 2.3320713246769782e-05, "loss": 0.5464, "step": 1288 }, { "epoch": 0.3574597892401553, "grad_norm": 0.19436432421207428, "learning_rate": 2.331794785107813e-05, "loss": 0.5475, "step": 1289 }, { "epoch": 0.3577371048252912, "grad_norm": 0.19309785962104797, "learning_rate": 2.3315180344518236e-05, "loss": 0.5413, "step": 1290 }, { "epoch": 0.3580144204104271, "grad_norm": 0.19348269701004028, "learning_rate": 2.331241072763012e-05, "loss": 0.5713, "step": 1291 }, { "epoch": 0.35829173599556297, "grad_norm": 0.18745654821395874, "learning_rate": 2.330963900095419e-05, "loss": 0.5661, "step": 1292 }, { "epoch": 0.35856905158069885, "grad_norm": 0.20293056964874268, "learning_rate": 2.3306865165031305e-05, "loss": 0.5855, "step": 1293 }, { "epoch": 0.35884636716583473, "grad_norm": 0.202471524477005, "learning_rate": 2.3304089220402702e-05, "loss": 0.5905, "step": 1294 }, { "epoch": 0.3591236827509706, "grad_norm": 0.18649210035800934, "learning_rate": 2.330131116761004e-05, "loss": 0.5768, "step": 1295 }, { "epoch": 0.3594009983361065, "grad_norm": 0.18877407908439636, "learning_rate": 2.3298531007195398e-05, "loss": 0.576, "step": 1296 }, { "epoch": 0.3596783139212424, "grad_norm": 0.18760234117507935, "learning_rate": 2.329574873970125e-05, "loss": 0.54, "step": 1297 }, { "epoch": 0.35995562950637827, "grad_norm": 0.2042498141527176, "learning_rate": 2.32929643656705e-05, "loss": 0.5595, "step": 1298 }, { "epoch": 0.36023294509151416, "grad_norm": 0.19519071280956268, "learning_rate": 2.3290177885646448e-05, "loss": 0.5446, "step": 1299 }, { "epoch": 0.36051026067665004, "grad_norm": 0.19533094763755798, "learning_rate": 2.3287389300172806e-05, "loss": 0.619, "step": 1300 }, { "epoch": 0.3607875762617859, "grad_norm": 0.18442007899284363, "learning_rate": 2.3284598609793705e-05, "loss": 0.5651, "step": 1301 }, { "epoch": 0.3610648918469218, "grad_norm": 0.1934802085161209, "learning_rate": 2.3281805815053688e-05, "loss": 0.5894, "step": 1302 }, { "epoch": 0.3613422074320577, "grad_norm": 0.20552469789981842, "learning_rate": 2.327901091649769e-05, "loss": 0.5943, "step": 1303 }, { "epoch": 0.3616195230171936, "grad_norm": 0.19476066529750824, "learning_rate": 2.3276213914671084e-05, "loss": 0.5729, "step": 1304 }, { "epoch": 0.36189683860232946, "grad_norm": 0.19621802866458893, "learning_rate": 2.3273414810119632e-05, "loss": 0.5616, "step": 1305 }, { "epoch": 0.36217415418746535, "grad_norm": 0.192935511469841, "learning_rate": 2.3270613603389513e-05, "loss": 0.5507, "step": 1306 }, { "epoch": 0.36245146977260123, "grad_norm": 0.18750174343585968, "learning_rate": 2.3267810295027317e-05, "loss": 0.5866, "step": 1307 }, { "epoch": 0.3627287853577371, "grad_norm": 0.18611189723014832, "learning_rate": 2.3265004885580047e-05, "loss": 0.5638, "step": 1308 }, { "epoch": 0.363006100942873, "grad_norm": 0.205692857503891, "learning_rate": 2.3262197375595108e-05, "loss": 0.5628, "step": 1309 }, { "epoch": 0.3632834165280089, "grad_norm": 0.19303978979587555, "learning_rate": 2.3259387765620322e-05, "loss": 0.5785, "step": 1310 }, { "epoch": 0.36356073211314477, "grad_norm": 0.19016319513320923, "learning_rate": 2.325657605620392e-05, "loss": 0.5637, "step": 1311 }, { "epoch": 0.36383804769828065, "grad_norm": 0.18235303461551666, "learning_rate": 2.325376224789454e-05, "loss": 0.5518, "step": 1312 }, { "epoch": 0.36411536328341654, "grad_norm": 0.1889420747756958, "learning_rate": 2.325094634124123e-05, "loss": 0.5539, "step": 1313 }, { "epoch": 0.3643926788685524, "grad_norm": 0.1905851662158966, "learning_rate": 2.3248128336793444e-05, "loss": 0.5892, "step": 1314 }, { "epoch": 0.3646699944536883, "grad_norm": 0.18385791778564453, "learning_rate": 2.324530823510106e-05, "loss": 0.5746, "step": 1315 }, { "epoch": 0.3649473100388242, "grad_norm": 0.18778762221336365, "learning_rate": 2.3242486036714343e-05, "loss": 0.5578, "step": 1316 }, { "epoch": 0.3652246256239601, "grad_norm": 0.18907684087753296, "learning_rate": 2.3239661742183984e-05, "loss": 0.5912, "step": 1317 }, { "epoch": 0.36550194120909596, "grad_norm": 0.19764885306358337, "learning_rate": 2.3236835352061076e-05, "loss": 0.5719, "step": 1318 }, { "epoch": 0.36577925679423184, "grad_norm": 0.1823858916759491, "learning_rate": 2.3234006866897125e-05, "loss": 0.5619, "step": 1319 }, { "epoch": 0.36605657237936773, "grad_norm": 0.31975796818733215, "learning_rate": 2.3231176287244044e-05, "loss": 0.5935, "step": 1320 }, { "epoch": 0.3663338879645036, "grad_norm": 0.17738473415374756, "learning_rate": 2.322834361365415e-05, "loss": 0.56, "step": 1321 }, { "epoch": 0.3666112035496395, "grad_norm": 0.19397957623004913, "learning_rate": 2.3225508846680173e-05, "loss": 0.5767, "step": 1322 }, { "epoch": 0.3668885191347754, "grad_norm": 0.1985812485218048, "learning_rate": 2.3222671986875255e-05, "loss": 0.5322, "step": 1323 }, { "epoch": 0.36716583471991127, "grad_norm": 0.19608476758003235, "learning_rate": 2.3219833034792943e-05, "loss": 0.5758, "step": 1324 }, { "epoch": 0.36744315030504715, "grad_norm": 0.196056067943573, "learning_rate": 2.3216991990987186e-05, "loss": 0.5658, "step": 1325 }, { "epoch": 0.36772046589018303, "grad_norm": 0.18968339264392853, "learning_rate": 2.3214148856012354e-05, "loss": 0.5472, "step": 1326 }, { "epoch": 0.3679977814753189, "grad_norm": 0.19283819198608398, "learning_rate": 2.3211303630423208e-05, "loss": 0.5554, "step": 1327 }, { "epoch": 0.3682750970604548, "grad_norm": 0.19521844387054443, "learning_rate": 2.320845631477494e-05, "loss": 0.5252, "step": 1328 }, { "epoch": 0.3685524126455907, "grad_norm": 0.19548355042934418, "learning_rate": 2.3205606909623122e-05, "loss": 0.569, "step": 1329 }, { "epoch": 0.3688297282307266, "grad_norm": 0.18432483077049255, "learning_rate": 2.3202755415523763e-05, "loss": 0.5612, "step": 1330 }, { "epoch": 0.36910704381586246, "grad_norm": 0.19396983087062836, "learning_rate": 2.3199901833033255e-05, "loss": 0.586, "step": 1331 }, { "epoch": 0.36938435940099834, "grad_norm": 0.18808341026306152, "learning_rate": 2.3197046162708413e-05, "loss": 0.566, "step": 1332 }, { "epoch": 0.3696616749861342, "grad_norm": 0.19177477061748505, "learning_rate": 2.3194188405106453e-05, "loss": 0.5673, "step": 1333 }, { "epoch": 0.3699389905712701, "grad_norm": 0.19362227618694305, "learning_rate": 2.3191328560784992e-05, "loss": 0.5367, "step": 1334 }, { "epoch": 0.370216306156406, "grad_norm": 0.1922491490840912, "learning_rate": 2.3188466630302072e-05, "loss": 0.5466, "step": 1335 }, { "epoch": 0.3704936217415419, "grad_norm": 0.19545325636863708, "learning_rate": 2.3185602614216125e-05, "loss": 0.5861, "step": 1336 }, { "epoch": 0.37077093732667776, "grad_norm": 0.1878175288438797, "learning_rate": 2.3182736513086002e-05, "loss": 0.5429, "step": 1337 }, { "epoch": 0.37104825291181365, "grad_norm": 0.20909211039543152, "learning_rate": 2.3179868327470948e-05, "loss": 0.579, "step": 1338 }, { "epoch": 0.37132556849694953, "grad_norm": 0.2049614042043686, "learning_rate": 2.3176998057930626e-05, "loss": 0.5877, "step": 1339 }, { "epoch": 0.3716028840820854, "grad_norm": 0.20033860206604004, "learning_rate": 2.3174125705025103e-05, "loss": 0.5576, "step": 1340 }, { "epoch": 0.3718801996672213, "grad_norm": 0.19559535384178162, "learning_rate": 2.3171251269314846e-05, "loss": 0.581, "step": 1341 }, { "epoch": 0.3721575152523572, "grad_norm": 0.19223865866661072, "learning_rate": 2.3168374751360737e-05, "loss": 0.5742, "step": 1342 }, { "epoch": 0.37243483083749307, "grad_norm": 0.18235072493553162, "learning_rate": 2.316549615172406e-05, "loss": 0.5839, "step": 1343 }, { "epoch": 0.37271214642262895, "grad_norm": 0.1987343281507492, "learning_rate": 2.3162615470966512e-05, "loss": 0.5531, "step": 1344 }, { "epoch": 0.37298946200776484, "grad_norm": 0.20500993728637695, "learning_rate": 2.3159732709650182e-05, "loss": 0.5849, "step": 1345 }, { "epoch": 0.3732667775929007, "grad_norm": 0.19410103559494019, "learning_rate": 2.3156847868337574e-05, "loss": 0.602, "step": 1346 }, { "epoch": 0.3735440931780366, "grad_norm": 0.19419234991073608, "learning_rate": 2.31539609475916e-05, "loss": 0.5639, "step": 1347 }, { "epoch": 0.3738214087631725, "grad_norm": 0.19289056956768036, "learning_rate": 2.3151071947975578e-05, "loss": 0.5833, "step": 1348 }, { "epoch": 0.3740987243483084, "grad_norm": 0.1862688809633255, "learning_rate": 2.314818087005322e-05, "loss": 0.5587, "step": 1349 }, { "epoch": 0.37437603993344426, "grad_norm": 0.1773182898759842, "learning_rate": 2.314528771438866e-05, "loss": 0.5297, "step": 1350 }, { "epoch": 0.37465335551858014, "grad_norm": 0.18808980286121368, "learning_rate": 2.314239248154642e-05, "loss": 0.5478, "step": 1351 }, { "epoch": 0.37493067110371603, "grad_norm": 0.1850731521844864, "learning_rate": 2.3139495172091447e-05, "loss": 0.5631, "step": 1352 }, { "epoch": 0.3752079866888519, "grad_norm": 0.233089417219162, "learning_rate": 2.313659578658907e-05, "loss": 0.5688, "step": 1353 }, { "epoch": 0.3754853022739878, "grad_norm": 0.19679111242294312, "learning_rate": 2.313369432560505e-05, "loss": 0.5713, "step": 1354 }, { "epoch": 0.3757626178591237, "grad_norm": 0.22906848788261414, "learning_rate": 2.3130790789705535e-05, "loss": 0.5727, "step": 1355 }, { "epoch": 0.37603993344425957, "grad_norm": 0.20593827962875366, "learning_rate": 2.3127885179457077e-05, "loss": 0.5382, "step": 1356 }, { "epoch": 0.37631724902939545, "grad_norm": 0.18781960010528564, "learning_rate": 2.3124977495426637e-05, "loss": 0.5746, "step": 1357 }, { "epoch": 0.37659456461453134, "grad_norm": 0.2294550985097885, "learning_rate": 2.3122067738181587e-05, "loss": 0.5783, "step": 1358 }, { "epoch": 0.3768718801996672, "grad_norm": 0.2007582187652588, "learning_rate": 2.311915590828969e-05, "loss": 0.5586, "step": 1359 }, { "epoch": 0.3771491957848031, "grad_norm": 0.20126573741436005, "learning_rate": 2.3116242006319132e-05, "loss": 0.5621, "step": 1360 }, { "epoch": 0.377426511369939, "grad_norm": 0.19607853889465332, "learning_rate": 2.3113326032838487e-05, "loss": 0.5305, "step": 1361 }, { "epoch": 0.3777038269550749, "grad_norm": 0.193894624710083, "learning_rate": 2.3110407988416736e-05, "loss": 0.5578, "step": 1362 }, { "epoch": 0.37798114254021076, "grad_norm": 0.1813306212425232, "learning_rate": 2.310748787362327e-05, "loss": 0.5787, "step": 1363 }, { "epoch": 0.37825845812534664, "grad_norm": 0.19636552035808563, "learning_rate": 2.3104565689027875e-05, "loss": 0.5615, "step": 1364 }, { "epoch": 0.3785357737104825, "grad_norm": 0.1957857310771942, "learning_rate": 2.3101641435200756e-05, "loss": 0.5821, "step": 1365 }, { "epoch": 0.3788130892956184, "grad_norm": 0.18413352966308594, "learning_rate": 2.3098715112712507e-05, "loss": 0.5388, "step": 1366 }, { "epoch": 0.3790904048807543, "grad_norm": 0.20153377950191498, "learning_rate": 2.3095786722134133e-05, "loss": 0.5748, "step": 1367 }, { "epoch": 0.3793677204658902, "grad_norm": 0.22044618427753448, "learning_rate": 2.309285626403704e-05, "loss": 0.5668, "step": 1368 }, { "epoch": 0.37964503605102606, "grad_norm": 0.19130001962184906, "learning_rate": 2.3089923738993034e-05, "loss": 0.5752, "step": 1369 }, { "epoch": 0.37992235163616195, "grad_norm": 0.18104785680770874, "learning_rate": 2.3086989147574333e-05, "loss": 0.5577, "step": 1370 }, { "epoch": 0.38019966722129783, "grad_norm": 0.18427520990371704, "learning_rate": 2.3084052490353553e-05, "loss": 0.5537, "step": 1371 }, { "epoch": 0.3804769828064337, "grad_norm": 0.1886986941099167, "learning_rate": 2.3081113767903713e-05, "loss": 0.5646, "step": 1372 }, { "epoch": 0.3807542983915696, "grad_norm": 0.19212138652801514, "learning_rate": 2.3078172980798236e-05, "loss": 0.5415, "step": 1373 }, { "epoch": 0.3810316139767055, "grad_norm": 0.18980136513710022, "learning_rate": 2.3075230129610946e-05, "loss": 0.5725, "step": 1374 }, { "epoch": 0.38130892956184137, "grad_norm": 0.1848769187927246, "learning_rate": 2.3072285214916072e-05, "loss": 0.536, "step": 1375 }, { "epoch": 0.38158624514697725, "grad_norm": 0.18508492410182953, "learning_rate": 2.3069338237288247e-05, "loss": 0.5753, "step": 1376 }, { "epoch": 0.38186356073211314, "grad_norm": 0.1909710019826889, "learning_rate": 2.30663891973025e-05, "loss": 0.563, "step": 1377 }, { "epoch": 0.382140876317249, "grad_norm": 0.20639832317829132, "learning_rate": 2.3063438095534272e-05, "loss": 0.5713, "step": 1378 }, { "epoch": 0.3824181919023849, "grad_norm": 0.1900801658630371, "learning_rate": 2.3060484932559395e-05, "loss": 0.5579, "step": 1379 }, { "epoch": 0.3826955074875208, "grad_norm": 0.19208571314811707, "learning_rate": 2.305752970895412e-05, "loss": 0.5571, "step": 1380 }, { "epoch": 0.3829728230726567, "grad_norm": 0.1912533938884735, "learning_rate": 2.3054572425295075e-05, "loss": 0.5452, "step": 1381 }, { "epoch": 0.38325013865779256, "grad_norm": 0.19920608401298523, "learning_rate": 2.3051613082159313e-05, "loss": 0.5799, "step": 1382 }, { "epoch": 0.38352745424292845, "grad_norm": 0.19880720973014832, "learning_rate": 2.3048651680124283e-05, "loss": 0.5504, "step": 1383 }, { "epoch": 0.38380476982806433, "grad_norm": 0.1889009028673172, "learning_rate": 2.3045688219767824e-05, "loss": 0.5751, "step": 1384 }, { "epoch": 0.3840820854132002, "grad_norm": 0.20132310688495636, "learning_rate": 2.3042722701668194e-05, "loss": 0.5723, "step": 1385 }, { "epoch": 0.3843594009983361, "grad_norm": 0.20646221935749054, "learning_rate": 2.3039755126404037e-05, "loss": 0.581, "step": 1386 }, { "epoch": 0.384636716583472, "grad_norm": 0.19051332771778107, "learning_rate": 2.3036785494554415e-05, "loss": 0.5609, "step": 1387 }, { "epoch": 0.38491403216860787, "grad_norm": 0.1971728354692459, "learning_rate": 2.303381380669877e-05, "loss": 0.563, "step": 1388 }, { "epoch": 0.38519134775374375, "grad_norm": 0.19891361892223358, "learning_rate": 2.303084006341697e-05, "loss": 0.571, "step": 1389 }, { "epoch": 0.38546866333887964, "grad_norm": 0.18801699578762054, "learning_rate": 2.302786426528926e-05, "loss": 0.5772, "step": 1390 }, { "epoch": 0.3857459789240155, "grad_norm": 0.1934468299150467, "learning_rate": 2.3024886412896302e-05, "loss": 0.5426, "step": 1391 }, { "epoch": 0.3860232945091514, "grad_norm": 0.20585696399211884, "learning_rate": 2.3021906506819152e-05, "loss": 0.5521, "step": 1392 }, { "epoch": 0.3863006100942873, "grad_norm": 0.19114629924297333, "learning_rate": 2.3018924547639272e-05, "loss": 0.597, "step": 1393 }, { "epoch": 0.3865779256794232, "grad_norm": 0.18134154379367828, "learning_rate": 2.301594053593852e-05, "loss": 0.5473, "step": 1394 }, { "epoch": 0.38685524126455906, "grad_norm": 0.18706971406936646, "learning_rate": 2.301295447229915e-05, "loss": 0.5843, "step": 1395 }, { "epoch": 0.38713255684969494, "grad_norm": 0.183674156665802, "learning_rate": 2.300996635730383e-05, "loss": 0.5268, "step": 1396 }, { "epoch": 0.3874098724348308, "grad_norm": 0.18946042656898499, "learning_rate": 2.3006976191535616e-05, "loss": 0.5529, "step": 1397 }, { "epoch": 0.3876871880199667, "grad_norm": 0.20170167088508606, "learning_rate": 2.3003983975577975e-05, "loss": 0.5781, "step": 1398 }, { "epoch": 0.3879645036051026, "grad_norm": 0.1854201704263687, "learning_rate": 2.300098971001476e-05, "loss": 0.536, "step": 1399 }, { "epoch": 0.3882418191902385, "grad_norm": 0.19159002602100372, "learning_rate": 2.299799339543023e-05, "loss": 0.584, "step": 1400 }, { "epoch": 0.38851913477537436, "grad_norm": 0.18878300487995148, "learning_rate": 2.299499503240905e-05, "loss": 0.5613, "step": 1401 }, { "epoch": 0.38879645036051025, "grad_norm": 0.2083284854888916, "learning_rate": 2.2991994621536283e-05, "loss": 0.5607, "step": 1402 }, { "epoch": 0.38907376594564613, "grad_norm": 0.19140368700027466, "learning_rate": 2.2988992163397386e-05, "loss": 0.5723, "step": 1403 }, { "epoch": 0.389351081530782, "grad_norm": 0.19407185912132263, "learning_rate": 2.2985987658578217e-05, "loss": 0.5592, "step": 1404 }, { "epoch": 0.3896283971159179, "grad_norm": 0.202662855386734, "learning_rate": 2.298298110766503e-05, "loss": 0.5682, "step": 1405 }, { "epoch": 0.3899057127010538, "grad_norm": 0.1902836710214615, "learning_rate": 2.2979972511244493e-05, "loss": 0.5626, "step": 1406 }, { "epoch": 0.39018302828618967, "grad_norm": 0.19113852083683014, "learning_rate": 2.2976961869903657e-05, "loss": 0.5713, "step": 1407 }, { "epoch": 0.39046034387132555, "grad_norm": 0.19765017926692963, "learning_rate": 2.2973949184229975e-05, "loss": 0.5811, "step": 1408 }, { "epoch": 0.39073765945646144, "grad_norm": 0.1867290586233139, "learning_rate": 2.2970934454811306e-05, "loss": 0.5094, "step": 1409 }, { "epoch": 0.3910149750415973, "grad_norm": 0.19188763201236725, "learning_rate": 2.2967917682235905e-05, "loss": 0.5736, "step": 1410 }, { "epoch": 0.3912922906267332, "grad_norm": 0.19784973561763763, "learning_rate": 2.296489886709242e-05, "loss": 0.5739, "step": 1411 }, { "epoch": 0.3915696062118691, "grad_norm": 0.19761349260807037, "learning_rate": 2.2961878009969904e-05, "loss": 0.5702, "step": 1412 }, { "epoch": 0.391846921797005, "grad_norm": 0.19187502562999725, "learning_rate": 2.2958855111457804e-05, "loss": 0.5575, "step": 1413 }, { "epoch": 0.39212423738214086, "grad_norm": 0.19723129272460938, "learning_rate": 2.2955830172145975e-05, "loss": 0.5757, "step": 1414 }, { "epoch": 0.39240155296727675, "grad_norm": 0.1946565806865692, "learning_rate": 2.2952803192624653e-05, "loss": 0.5696, "step": 1415 }, { "epoch": 0.39267886855241263, "grad_norm": 0.1714453399181366, "learning_rate": 2.2949774173484488e-05, "loss": 0.5783, "step": 1416 }, { "epoch": 0.3929561841375485, "grad_norm": 0.1907140165567398, "learning_rate": 2.2946743115316518e-05, "loss": 0.5642, "step": 1417 }, { "epoch": 0.3932334997226844, "grad_norm": 0.18922662734985352, "learning_rate": 2.294371001871219e-05, "loss": 0.5403, "step": 1418 }, { "epoch": 0.3935108153078203, "grad_norm": 0.18016541004180908, "learning_rate": 2.294067488426333e-05, "loss": 0.5428, "step": 1419 }, { "epoch": 0.39378813089295617, "grad_norm": 0.18581193685531616, "learning_rate": 2.293763771256218e-05, "loss": 0.5671, "step": 1420 }, { "epoch": 0.39406544647809205, "grad_norm": 0.19479569792747498, "learning_rate": 2.293459850420138e-05, "loss": 0.5471, "step": 1421 }, { "epoch": 0.39434276206322794, "grad_norm": 0.19193242490291595, "learning_rate": 2.2931557259773944e-05, "loss": 0.5422, "step": 1422 }, { "epoch": 0.3946200776483638, "grad_norm": 0.1958051174879074, "learning_rate": 2.2928513979873312e-05, "loss": 0.5605, "step": 1423 }, { "epoch": 0.3948973932334997, "grad_norm": 0.1831037700176239, "learning_rate": 2.2925468665093304e-05, "loss": 0.5632, "step": 1424 }, { "epoch": 0.3951747088186356, "grad_norm": 0.19089363515377045, "learning_rate": 2.2922421316028142e-05, "loss": 0.5534, "step": 1425 }, { "epoch": 0.3954520244037715, "grad_norm": 0.18920765817165375, "learning_rate": 2.2919371933272445e-05, "loss": 0.5708, "step": 1426 }, { "epoch": 0.39572933998890736, "grad_norm": 0.1790907382965088, "learning_rate": 2.2916320517421224e-05, "loss": 0.54, "step": 1427 }, { "epoch": 0.39600665557404324, "grad_norm": 0.19448032975196838, "learning_rate": 2.29132670690699e-05, "loss": 0.5713, "step": 1428 }, { "epoch": 0.3962839711591791, "grad_norm": 0.18597151339054108, "learning_rate": 2.2910211588814272e-05, "loss": 0.5619, "step": 1429 }, { "epoch": 0.396561286744315, "grad_norm": 0.19088095426559448, "learning_rate": 2.2907154077250554e-05, "loss": 0.5567, "step": 1430 }, { "epoch": 0.3968386023294509, "grad_norm": 0.2470846325159073, "learning_rate": 2.290409453497534e-05, "loss": 0.5567, "step": 1431 }, { "epoch": 0.3971159179145868, "grad_norm": 0.189782977104187, "learning_rate": 2.2901032962585633e-05, "loss": 0.5689, "step": 1432 }, { "epoch": 0.39739323349972266, "grad_norm": 0.18323121964931488, "learning_rate": 2.289796936067882e-05, "loss": 0.5695, "step": 1433 }, { "epoch": 0.39767054908485855, "grad_norm": 0.19183968007564545, "learning_rate": 2.28949037298527e-05, "loss": 0.5527, "step": 1434 }, { "epoch": 0.39794786466999443, "grad_norm": 0.2490936517715454, "learning_rate": 2.2891836070705454e-05, "loss": 0.5705, "step": 1435 }, { "epoch": 0.3982251802551303, "grad_norm": 0.18432928621768951, "learning_rate": 2.2888766383835664e-05, "loss": 0.5542, "step": 1436 }, { "epoch": 0.3985024958402662, "grad_norm": 0.18222655355930328, "learning_rate": 2.2885694669842305e-05, "loss": 0.5693, "step": 1437 }, { "epoch": 0.3987798114254021, "grad_norm": 0.18472431600093842, "learning_rate": 2.2882620929324758e-05, "loss": 0.5664, "step": 1438 }, { "epoch": 0.39905712701053797, "grad_norm": 0.19013690948486328, "learning_rate": 2.2879545162882782e-05, "loss": 0.534, "step": 1439 }, { "epoch": 0.39933444259567386, "grad_norm": 0.1890943944454193, "learning_rate": 2.2876467371116546e-05, "loss": 0.5537, "step": 1440 }, { "epoch": 0.39961175818080974, "grad_norm": 0.19494980573654175, "learning_rate": 2.28733875546266e-05, "loss": 0.5502, "step": 1441 }, { "epoch": 0.3998890737659456, "grad_norm": 0.215170755982399, "learning_rate": 2.2870305714013908e-05, "loss": 0.57, "step": 1442 }, { "epoch": 0.4001663893510815, "grad_norm": 0.20500238239765167, "learning_rate": 2.2867221849879816e-05, "loss": 0.5648, "step": 1443 }, { "epoch": 0.4004437049362174, "grad_norm": 0.1898437738418579, "learning_rate": 2.2864135962826067e-05, "loss": 0.5685, "step": 1444 }, { "epoch": 0.4007210205213533, "grad_norm": 0.1926756501197815, "learning_rate": 2.2861048053454797e-05, "loss": 0.5514, "step": 1445 }, { "epoch": 0.40099833610648916, "grad_norm": 0.19258753955364227, "learning_rate": 2.2857958122368545e-05, "loss": 0.5397, "step": 1446 }, { "epoch": 0.40127565169162505, "grad_norm": 0.18636788427829742, "learning_rate": 2.285486617017023e-05, "loss": 0.5421, "step": 1447 }, { "epoch": 0.40155296727676093, "grad_norm": 0.20043087005615234, "learning_rate": 2.2851772197463184e-05, "loss": 0.5241, "step": 1448 }, { "epoch": 0.4018302828618968, "grad_norm": 0.2062230408191681, "learning_rate": 2.284867620485111e-05, "loss": 0.5702, "step": 1449 }, { "epoch": 0.4021075984470327, "grad_norm": 0.1930972784757614, "learning_rate": 2.284557819293813e-05, "loss": 0.5533, "step": 1450 }, { "epoch": 0.4023849140321686, "grad_norm": 0.20658205449581146, "learning_rate": 2.284247816232874e-05, "loss": 0.5613, "step": 1451 }, { "epoch": 0.40266222961730447, "grad_norm": 0.2103574126958847, "learning_rate": 2.2839376113627848e-05, "loss": 0.5668, "step": 1452 }, { "epoch": 0.40293954520244035, "grad_norm": 0.19861139357089996, "learning_rate": 2.2836272047440733e-05, "loss": 0.5549, "step": 1453 }, { "epoch": 0.40321686078757624, "grad_norm": 0.1885387897491455, "learning_rate": 2.2833165964373093e-05, "loss": 0.565, "step": 1454 }, { "epoch": 0.4034941763727121, "grad_norm": 0.18594177067279816, "learning_rate": 2.2830057865030997e-05, "loss": 0.5129, "step": 1455 }, { "epoch": 0.403771491957848, "grad_norm": 0.19150151312351227, "learning_rate": 2.282694775002092e-05, "loss": 0.5591, "step": 1456 }, { "epoch": 0.4040488075429839, "grad_norm": 0.1973596066236496, "learning_rate": 2.2823835619949735e-05, "loss": 0.5795, "step": 1457 }, { "epoch": 0.4043261231281198, "grad_norm": 0.21641992032527924, "learning_rate": 2.2820721475424693e-05, "loss": 0.5286, "step": 1458 }, { "epoch": 0.40460343871325566, "grad_norm": 0.20730352401733398, "learning_rate": 2.281760531705345e-05, "loss": 0.578, "step": 1459 }, { "epoch": 0.40488075429839154, "grad_norm": 0.19733300805091858, "learning_rate": 2.281448714544405e-05, "loss": 0.6048, "step": 1460 }, { "epoch": 0.4051580698835274, "grad_norm": 0.2007417231798172, "learning_rate": 2.281136696120493e-05, "loss": 0.588, "step": 1461 }, { "epoch": 0.4054353854686633, "grad_norm": 0.1785038709640503, "learning_rate": 2.280824476494492e-05, "loss": 0.5505, "step": 1462 }, { "epoch": 0.4057127010537992, "grad_norm": 0.18670029938220978, "learning_rate": 2.2805120557273246e-05, "loss": 0.5533, "step": 1463 }, { "epoch": 0.4059900166389351, "grad_norm": 0.18775032460689545, "learning_rate": 2.2801994338799525e-05, "loss": 0.5457, "step": 1464 }, { "epoch": 0.406267332224071, "grad_norm": 0.18645282089710236, "learning_rate": 2.2798866110133758e-05, "loss": 0.5595, "step": 1465 }, { "epoch": 0.4065446478092069, "grad_norm": 0.2067873477935791, "learning_rate": 2.279573587188635e-05, "loss": 0.5438, "step": 1466 }, { "epoch": 0.4068219633943428, "grad_norm": 0.18860718607902527, "learning_rate": 2.2792603624668097e-05, "loss": 0.541, "step": 1467 }, { "epoch": 0.4070992789794787, "grad_norm": 0.18750540912151337, "learning_rate": 2.2789469369090173e-05, "loss": 0.5567, "step": 1468 }, { "epoch": 0.40737659456461456, "grad_norm": 0.18958985805511475, "learning_rate": 2.2786333105764162e-05, "loss": 0.5413, "step": 1469 }, { "epoch": 0.40765391014975044, "grad_norm": 0.27570971846580505, "learning_rate": 2.2783194835302035e-05, "loss": 0.5548, "step": 1470 }, { "epoch": 0.4079312257348863, "grad_norm": 0.1889680027961731, "learning_rate": 2.2780054558316146e-05, "loss": 0.5507, "step": 1471 }, { "epoch": 0.4082085413200222, "grad_norm": 0.18410347402095795, "learning_rate": 2.277691227541925e-05, "loss": 0.5689, "step": 1472 }, { "epoch": 0.4084858569051581, "grad_norm": 0.18935447931289673, "learning_rate": 2.277376798722448e-05, "loss": 0.5631, "step": 1473 }, { "epoch": 0.408763172490294, "grad_norm": 0.19542036950588226, "learning_rate": 2.2770621694345385e-05, "loss": 0.5614, "step": 1474 }, { "epoch": 0.40904048807542986, "grad_norm": 0.2004593461751938, "learning_rate": 2.2767473397395876e-05, "loss": 0.5707, "step": 1475 }, { "epoch": 0.40931780366056575, "grad_norm": 0.19551704823970795, "learning_rate": 2.276432309699028e-05, "loss": 0.5656, "step": 1476 }, { "epoch": 0.40959511924570163, "grad_norm": 0.18432258069515228, "learning_rate": 2.27611707937433e-05, "loss": 0.5696, "step": 1477 }, { "epoch": 0.4098724348308375, "grad_norm": 0.20155014097690582, "learning_rate": 2.2758016488270033e-05, "loss": 0.581, "step": 1478 }, { "epoch": 0.4101497504159734, "grad_norm": 0.1944228559732437, "learning_rate": 2.2754860181185967e-05, "loss": 0.565, "step": 1479 }, { "epoch": 0.4104270660011093, "grad_norm": 0.20183973014354706, "learning_rate": 2.2751701873106983e-05, "loss": 0.5743, "step": 1480 }, { "epoch": 0.41070438158624517, "grad_norm": 0.18106213212013245, "learning_rate": 2.274854156464935e-05, "loss": 0.5726, "step": 1481 }, { "epoch": 0.41098169717138106, "grad_norm": 0.19121475517749786, "learning_rate": 2.2745379256429728e-05, "loss": 0.5856, "step": 1482 }, { "epoch": 0.41125901275651694, "grad_norm": 0.1916564702987671, "learning_rate": 2.2742214949065166e-05, "loss": 0.5494, "step": 1483 }, { "epoch": 0.4115363283416528, "grad_norm": 0.2309359312057495, "learning_rate": 2.2739048643173105e-05, "loss": 0.575, "step": 1484 }, { "epoch": 0.4118136439267887, "grad_norm": 0.3025970757007599, "learning_rate": 2.2735880339371373e-05, "loss": 0.5717, "step": 1485 }, { "epoch": 0.4120909595119246, "grad_norm": 0.19141024351119995, "learning_rate": 2.27327100382782e-05, "loss": 0.5743, "step": 1486 }, { "epoch": 0.4123682750970605, "grad_norm": 0.19072705507278442, "learning_rate": 2.272953774051218e-05, "loss": 0.5498, "step": 1487 }, { "epoch": 0.41264559068219636, "grad_norm": 0.23282243311405182, "learning_rate": 2.2726363446692324e-05, "loss": 0.5653, "step": 1488 }, { "epoch": 0.41292290626733225, "grad_norm": 0.1904718577861786, "learning_rate": 2.2723187157438015e-05, "loss": 0.5557, "step": 1489 }, { "epoch": 0.41320022185246813, "grad_norm": 0.18210548162460327, "learning_rate": 2.2720008873369036e-05, "loss": 0.5648, "step": 1490 }, { "epoch": 0.413477537437604, "grad_norm": 0.1940220296382904, "learning_rate": 2.271682859510555e-05, "loss": 0.5824, "step": 1491 }, { "epoch": 0.4137548530227399, "grad_norm": 0.18632103502750397, "learning_rate": 2.2713646323268113e-05, "loss": 0.5694, "step": 1492 }, { "epoch": 0.4140321686078758, "grad_norm": 0.19417639076709747, "learning_rate": 2.2710462058477676e-05, "loss": 0.5385, "step": 1493 }, { "epoch": 0.41430948419301167, "grad_norm": 0.1876698136329651, "learning_rate": 2.270727580135557e-05, "loss": 0.5441, "step": 1494 }, { "epoch": 0.41458679977814755, "grad_norm": 0.19295859336853027, "learning_rate": 2.270408755252352e-05, "loss": 0.5587, "step": 1495 }, { "epoch": 0.41486411536328344, "grad_norm": 0.1876155138015747, "learning_rate": 2.2700897312603635e-05, "loss": 0.5597, "step": 1496 }, { "epoch": 0.4151414309484193, "grad_norm": 0.18888817727565765, "learning_rate": 2.2697705082218417e-05, "loss": 0.58, "step": 1497 }, { "epoch": 0.4154187465335552, "grad_norm": 0.1931556612253189, "learning_rate": 2.2694510861990755e-05, "loss": 0.5195, "step": 1498 }, { "epoch": 0.4156960621186911, "grad_norm": 0.20737329125404358, "learning_rate": 2.2691314652543922e-05, "loss": 0.5742, "step": 1499 }, { "epoch": 0.415973377703827, "grad_norm": 0.18405389785766602, "learning_rate": 2.268811645450159e-05, "loss": 0.5572, "step": 1500 }, { "epoch": 0.41625069328896286, "grad_norm": 0.2047012746334076, "learning_rate": 2.2684916268487805e-05, "loss": 0.5682, "step": 1501 }, { "epoch": 0.41652800887409874, "grad_norm": 0.19159288704395294, "learning_rate": 2.2681714095127016e-05, "loss": 0.5642, "step": 1502 }, { "epoch": 0.4168053244592346, "grad_norm": 0.19872340559959412, "learning_rate": 2.2678509935044046e-05, "loss": 0.5801, "step": 1503 }, { "epoch": 0.4170826400443705, "grad_norm": 0.22284448146820068, "learning_rate": 2.267530378886411e-05, "loss": 0.5468, "step": 1504 }, { "epoch": 0.4173599556295064, "grad_norm": 0.18655410408973694, "learning_rate": 2.2672095657212822e-05, "loss": 0.5557, "step": 1505 }, { "epoch": 0.4176372712146423, "grad_norm": 0.1884729266166687, "learning_rate": 2.266888554071616e-05, "loss": 0.5641, "step": 1506 }, { "epoch": 0.41791458679977816, "grad_norm": 0.1828029453754425, "learning_rate": 2.2665673440000512e-05, "loss": 0.5295, "step": 1507 }, { "epoch": 0.41819190238491405, "grad_norm": 0.19014927744865417, "learning_rate": 2.2662459355692645e-05, "loss": 0.5385, "step": 1508 }, { "epoch": 0.41846921797004993, "grad_norm": 0.20038922131061554, "learning_rate": 2.26592432884197e-05, "loss": 0.5763, "step": 1509 }, { "epoch": 0.4187465335551858, "grad_norm": 0.19587905704975128, "learning_rate": 2.2656025238809233e-05, "loss": 0.5642, "step": 1510 }, { "epoch": 0.4190238491403217, "grad_norm": 0.18529456853866577, "learning_rate": 2.265280520748916e-05, "loss": 0.5467, "step": 1511 }, { "epoch": 0.4193011647254576, "grad_norm": 0.19396060705184937, "learning_rate": 2.26495831950878e-05, "loss": 0.5756, "step": 1512 }, { "epoch": 0.41957848031059347, "grad_norm": 0.19712162017822266, "learning_rate": 2.2646359202233848e-05, "loss": 0.5856, "step": 1513 }, { "epoch": 0.41985579589572936, "grad_norm": 0.1929578334093094, "learning_rate": 2.264313322955639e-05, "loss": 0.5677, "step": 1514 }, { "epoch": 0.42013311148086524, "grad_norm": 0.19393184781074524, "learning_rate": 2.263990527768491e-05, "loss": 0.5409, "step": 1515 }, { "epoch": 0.4204104270660011, "grad_norm": 0.19869175553321838, "learning_rate": 2.2636675347249252e-05, "loss": 0.5344, "step": 1516 }, { "epoch": 0.420687742651137, "grad_norm": 0.19600443542003632, "learning_rate": 2.263344343887967e-05, "loss": 0.5627, "step": 1517 }, { "epoch": 0.4209650582362729, "grad_norm": 0.19098101556301117, "learning_rate": 2.263020955320679e-05, "loss": 0.5704, "step": 1518 }, { "epoch": 0.4212423738214088, "grad_norm": 0.20919351279735565, "learning_rate": 2.2626973690861635e-05, "loss": 0.55, "step": 1519 }, { "epoch": 0.42151968940654466, "grad_norm": 0.21177563071250916, "learning_rate": 2.2623735852475602e-05, "loss": 0.5723, "step": 1520 }, { "epoch": 0.42179700499168055, "grad_norm": 0.19463296234607697, "learning_rate": 2.262049603868048e-05, "loss": 0.5601, "step": 1521 }, { "epoch": 0.42207432057681643, "grad_norm": 0.20798444747924805, "learning_rate": 2.2617254250108445e-05, "loss": 0.5606, "step": 1522 }, { "epoch": 0.4223516361619523, "grad_norm": 0.20048515498638153, "learning_rate": 2.2614010487392053e-05, "loss": 0.5628, "step": 1523 }, { "epoch": 0.4226289517470882, "grad_norm": 0.1931045949459076, "learning_rate": 2.2610764751164253e-05, "loss": 0.5662, "step": 1524 }, { "epoch": 0.4229062673322241, "grad_norm": 0.19520334899425507, "learning_rate": 2.2607517042058367e-05, "loss": 0.5552, "step": 1525 }, { "epoch": 0.42318358291735997, "grad_norm": 0.1984749138355255, "learning_rate": 2.2604267360708113e-05, "loss": 0.5672, "step": 1526 }, { "epoch": 0.42346089850249585, "grad_norm": 0.19392019510269165, "learning_rate": 2.2601015707747585e-05, "loss": 0.5689, "step": 1527 }, { "epoch": 0.42373821408763174, "grad_norm": 0.1880037933588028, "learning_rate": 2.2597762083811276e-05, "loss": 0.5606, "step": 1528 }, { "epoch": 0.4240155296727676, "grad_norm": 0.19556698203086853, "learning_rate": 2.259450648953405e-05, "loss": 0.5626, "step": 1529 }, { "epoch": 0.4242928452579035, "grad_norm": 0.18473385274410248, "learning_rate": 2.2591248925551156e-05, "loss": 0.541, "step": 1530 }, { "epoch": 0.4245701608430394, "grad_norm": 0.2530158460140228, "learning_rate": 2.2587989392498237e-05, "loss": 0.5429, "step": 1531 }, { "epoch": 0.4248474764281753, "grad_norm": 0.22607286274433136, "learning_rate": 2.258472789101131e-05, "loss": 0.5578, "step": 1532 }, { "epoch": 0.42512479201331116, "grad_norm": 0.19067604839801788, "learning_rate": 2.258146442172678e-05, "loss": 0.5474, "step": 1533 }, { "epoch": 0.42540210759844704, "grad_norm": 0.18085655570030212, "learning_rate": 2.257819898528144e-05, "loss": 0.5506, "step": 1534 }, { "epoch": 0.42567942318358293, "grad_norm": 0.18807153403759003, "learning_rate": 2.257493158231246e-05, "loss": 0.5461, "step": 1535 }, { "epoch": 0.4259567387687188, "grad_norm": 0.18786919116973877, "learning_rate": 2.25716622134574e-05, "loss": 0.5781, "step": 1536 }, { "epoch": 0.4262340543538547, "grad_norm": 0.19066756963729858, "learning_rate": 2.2568390879354195e-05, "loss": 0.5378, "step": 1537 }, { "epoch": 0.4265113699389906, "grad_norm": 0.1895960569381714, "learning_rate": 2.2565117580641175e-05, "loss": 0.5661, "step": 1538 }, { "epoch": 0.42678868552412647, "grad_norm": 0.21506737172603607, "learning_rate": 2.2561842317957045e-05, "loss": 0.661, "step": 1539 }, { "epoch": 0.42706600110926235, "grad_norm": 0.1904166042804718, "learning_rate": 2.2558565091940895e-05, "loss": 0.5643, "step": 1540 }, { "epoch": 0.42734331669439823, "grad_norm": 0.18517723679542542, "learning_rate": 2.2555285903232197e-05, "loss": 0.5509, "step": 1541 }, { "epoch": 0.4276206322795341, "grad_norm": 0.2003047913312912, "learning_rate": 2.2552004752470814e-05, "loss": 0.5487, "step": 1542 }, { "epoch": 0.42789794786467, "grad_norm": 0.19767479598522186, "learning_rate": 2.2548721640296976e-05, "loss": 0.5534, "step": 1543 }, { "epoch": 0.4281752634498059, "grad_norm": 0.1890031397342682, "learning_rate": 2.2545436567351312e-05, "loss": 0.5762, "step": 1544 }, { "epoch": 0.42845257903494177, "grad_norm": 0.22215117514133453, "learning_rate": 2.2542149534274827e-05, "loss": 0.5433, "step": 1545 }, { "epoch": 0.42872989462007766, "grad_norm": 0.17825603485107422, "learning_rate": 2.2538860541708902e-05, "loss": 0.5724, "step": 1546 }, { "epoch": 0.42900721020521354, "grad_norm": 0.1967187076807022, "learning_rate": 2.2535569590295313e-05, "loss": 0.5632, "step": 1547 }, { "epoch": 0.4292845257903494, "grad_norm": 0.18680186569690704, "learning_rate": 2.253227668067621e-05, "loss": 0.5704, "step": 1548 }, { "epoch": 0.4295618413754853, "grad_norm": 0.17610576748847961, "learning_rate": 2.2528981813494127e-05, "loss": 0.5295, "step": 1549 }, { "epoch": 0.4298391569606212, "grad_norm": 0.19055548310279846, "learning_rate": 2.2525684989391975e-05, "loss": 0.5651, "step": 1550 }, { "epoch": 0.4301164725457571, "grad_norm": 0.19191017746925354, "learning_rate": 2.2522386209013062e-05, "loss": 0.5366, "step": 1551 }, { "epoch": 0.43039378813089296, "grad_norm": 0.20109041035175323, "learning_rate": 2.2519085473001055e-05, "loss": 0.5508, "step": 1552 }, { "epoch": 0.43067110371602885, "grad_norm": 0.18693628907203674, "learning_rate": 2.2515782782000027e-05, "loss": 0.5603, "step": 1553 }, { "epoch": 0.43094841930116473, "grad_norm": 0.18805643916130066, "learning_rate": 2.2512478136654412e-05, "loss": 0.5197, "step": 1554 }, { "epoch": 0.4312257348863006, "grad_norm": 0.19016428291797638, "learning_rate": 2.2509171537609042e-05, "loss": 0.5719, "step": 1555 }, { "epoch": 0.4315030504714365, "grad_norm": 0.18083550035953522, "learning_rate": 2.2505862985509112e-05, "loss": 0.5502, "step": 1556 }, { "epoch": 0.4317803660565724, "grad_norm": 0.18382135033607483, "learning_rate": 2.2502552481000218e-05, "loss": 0.5437, "step": 1557 }, { "epoch": 0.43205768164170827, "grad_norm": 0.20305176079273224, "learning_rate": 2.2499240024728316e-05, "loss": 0.5559, "step": 1558 }, { "epoch": 0.43233499722684415, "grad_norm": 0.19593636691570282, "learning_rate": 2.2495925617339765e-05, "loss": 0.5695, "step": 1559 }, { "epoch": 0.43261231281198004, "grad_norm": 0.18708528578281403, "learning_rate": 2.2492609259481283e-05, "loss": 0.5828, "step": 1560 }, { "epoch": 0.4328896283971159, "grad_norm": 0.20516738295555115, "learning_rate": 2.248929095179999e-05, "loss": 0.5691, "step": 1561 }, { "epoch": 0.4331669439822518, "grad_norm": 0.19292466342449188, "learning_rate": 2.248597069494337e-05, "loss": 0.5567, "step": 1562 }, { "epoch": 0.4334442595673877, "grad_norm": 0.18612238764762878, "learning_rate": 2.2482648489559296e-05, "loss": 0.5497, "step": 1563 }, { "epoch": 0.4337215751525236, "grad_norm": 0.18284805119037628, "learning_rate": 2.2479324336296016e-05, "loss": 0.5305, "step": 1564 }, { "epoch": 0.43399889073765946, "grad_norm": 0.19658204913139343, "learning_rate": 2.247599823580216e-05, "loss": 0.5631, "step": 1565 }, { "epoch": 0.43427620632279534, "grad_norm": 0.19752554595470428, "learning_rate": 2.2472670188726737e-05, "loss": 0.5647, "step": 1566 }, { "epoch": 0.43455352190793123, "grad_norm": 0.18531207740306854, "learning_rate": 2.246934019571914e-05, "loss": 0.5667, "step": 1567 }, { "epoch": 0.4348308374930671, "grad_norm": 0.18967878818511963, "learning_rate": 2.2466008257429142e-05, "loss": 0.5572, "step": 1568 }, { "epoch": 0.435108153078203, "grad_norm": 0.20551453530788422, "learning_rate": 2.2462674374506886e-05, "loss": 0.5554, "step": 1569 }, { "epoch": 0.4353854686633389, "grad_norm": 0.1888595074415207, "learning_rate": 2.2459338547602905e-05, "loss": 0.5986, "step": 1570 }, { "epoch": 0.43566278424847477, "grad_norm": 0.1824692189693451, "learning_rate": 2.2456000777368102e-05, "loss": 0.5223, "step": 1571 }, { "epoch": 0.43594009983361065, "grad_norm": 0.189620703458786, "learning_rate": 2.245266106445377e-05, "loss": 0.5465, "step": 1572 }, { "epoch": 0.43621741541874653, "grad_norm": 0.20219853520393372, "learning_rate": 2.2449319409511574e-05, "loss": 0.5655, "step": 1573 }, { "epoch": 0.4364947310038824, "grad_norm": 0.18667539954185486, "learning_rate": 2.244597581319356e-05, "loss": 0.5307, "step": 1574 }, { "epoch": 0.4367720465890183, "grad_norm": 0.19490988552570343, "learning_rate": 2.2442630276152148e-05, "loss": 0.5666, "step": 1575 }, { "epoch": 0.4370493621741542, "grad_norm": 0.18786631524562836, "learning_rate": 2.2439282799040146e-05, "loss": 0.535, "step": 1576 }, { "epoch": 0.4373266777592901, "grad_norm": 0.1902604103088379, "learning_rate": 2.2435933382510735e-05, "loss": 0.5362, "step": 1577 }, { "epoch": 0.43760399334442596, "grad_norm": 0.17974701523780823, "learning_rate": 2.2432582027217473e-05, "loss": 0.5538, "step": 1578 }, { "epoch": 0.43788130892956184, "grad_norm": 0.1953812539577484, "learning_rate": 2.2429228733814294e-05, "loss": 0.5584, "step": 1579 }, { "epoch": 0.4381586245146977, "grad_norm": 0.17894363403320312, "learning_rate": 2.2425873502955524e-05, "loss": 0.5667, "step": 1580 }, { "epoch": 0.4384359400998336, "grad_norm": 0.2096855342388153, "learning_rate": 2.2422516335295852e-05, "loss": 0.5634, "step": 1581 }, { "epoch": 0.4387132556849695, "grad_norm": 0.1870642900466919, "learning_rate": 2.241915723149035e-05, "loss": 0.5691, "step": 1582 }, { "epoch": 0.4389905712701054, "grad_norm": 0.2048119157552719, "learning_rate": 2.241579619219447e-05, "loss": 0.5645, "step": 1583 }, { "epoch": 0.43926788685524126, "grad_norm": 0.19508272409439087, "learning_rate": 2.2412433218064037e-05, "loss": 0.5543, "step": 1584 }, { "epoch": 0.43954520244037715, "grad_norm": 0.1895490139722824, "learning_rate": 2.240906830975526e-05, "loss": 0.5522, "step": 1585 }, { "epoch": 0.43982251802551303, "grad_norm": 0.18243685364723206, "learning_rate": 2.240570146792472e-05, "loss": 0.5695, "step": 1586 }, { "epoch": 0.4400998336106489, "grad_norm": 0.18209905922412872, "learning_rate": 2.2402332693229377e-05, "loss": 0.5447, "step": 1587 }, { "epoch": 0.4403771491957848, "grad_norm": 0.1850498467683792, "learning_rate": 2.2398961986326567e-05, "loss": 0.5289, "step": 1588 }, { "epoch": 0.4406544647809207, "grad_norm": 0.19210520386695862, "learning_rate": 2.2395589347874005e-05, "loss": 0.5792, "step": 1589 }, { "epoch": 0.44093178036605657, "grad_norm": 0.19306769967079163, "learning_rate": 2.239221477852978e-05, "loss": 0.5771, "step": 1590 }, { "epoch": 0.44120909595119245, "grad_norm": 0.18809685111045837, "learning_rate": 2.2388838278952367e-05, "loss": 0.5648, "step": 1591 }, { "epoch": 0.44148641153632834, "grad_norm": 0.19211184978485107, "learning_rate": 2.2385459849800606e-05, "loss": 0.5867, "step": 1592 }, { "epoch": 0.4417637271214642, "grad_norm": 0.2126152366399765, "learning_rate": 2.2382079491733715e-05, "loss": 0.5705, "step": 1593 }, { "epoch": 0.4420410427066001, "grad_norm": 0.18234777450561523, "learning_rate": 2.23786972054113e-05, "loss": 0.5373, "step": 1594 }, { "epoch": 0.442318358291736, "grad_norm": 0.18619130551815033, "learning_rate": 2.2375312991493324e-05, "loss": 0.5525, "step": 1595 }, { "epoch": 0.4425956738768719, "grad_norm": 0.19054090976715088, "learning_rate": 2.237192685064014e-05, "loss": 0.5606, "step": 1596 }, { "epoch": 0.44287298946200776, "grad_norm": 0.19798876345157623, "learning_rate": 2.236853878351248e-05, "loss": 0.5389, "step": 1597 }, { "epoch": 0.44315030504714364, "grad_norm": 0.20198500156402588, "learning_rate": 2.2365148790771442e-05, "loss": 0.577, "step": 1598 }, { "epoch": 0.44342762063227953, "grad_norm": 0.22898751497268677, "learning_rate": 2.2361756873078502e-05, "loss": 0.5516, "step": 1599 }, { "epoch": 0.4437049362174154, "grad_norm": 0.1942586600780487, "learning_rate": 2.2358363031095513e-05, "loss": 0.5583, "step": 1600 }, { "epoch": 0.4439822518025513, "grad_norm": 0.18665020167827606, "learning_rate": 2.23549672654847e-05, "loss": 0.562, "step": 1601 }, { "epoch": 0.4442595673876872, "grad_norm": 0.20378893613815308, "learning_rate": 2.2351569576908675e-05, "loss": 0.5783, "step": 1602 }, { "epoch": 0.44453688297282307, "grad_norm": 0.21167699992656708, "learning_rate": 2.2348169966030416e-05, "loss": 0.5453, "step": 1603 }, { "epoch": 0.44481419855795895, "grad_norm": 0.19410103559494019, "learning_rate": 2.234476843351327e-05, "loss": 0.5527, "step": 1604 }, { "epoch": 0.44509151414309484, "grad_norm": 0.17932045459747314, "learning_rate": 2.2341364980020973e-05, "loss": 0.568, "step": 1605 }, { "epoch": 0.4453688297282307, "grad_norm": 0.2007569819688797, "learning_rate": 2.2337959606217624e-05, "loss": 0.5699, "step": 1606 }, { "epoch": 0.4456461453133666, "grad_norm": 0.18869513273239136, "learning_rate": 2.2334552312767705e-05, "loss": 0.5506, "step": 1607 }, { "epoch": 0.4459234608985025, "grad_norm": 0.1867865025997162, "learning_rate": 2.2331143100336072e-05, "loss": 0.5758, "step": 1608 }, { "epoch": 0.4462007764836384, "grad_norm": 0.19275221228599548, "learning_rate": 2.2327731969587947e-05, "loss": 0.5408, "step": 1609 }, { "epoch": 0.44647809206877426, "grad_norm": 0.19611743092536926, "learning_rate": 2.2324318921188932e-05, "loss": 0.5974, "step": 1610 }, { "epoch": 0.44675540765391014, "grad_norm": 0.20041632652282715, "learning_rate": 2.2320903955805e-05, "loss": 0.5598, "step": 1611 }, { "epoch": 0.447032723239046, "grad_norm": 0.18512395024299622, "learning_rate": 2.2317487074102514e-05, "loss": 0.5661, "step": 1612 }, { "epoch": 0.4473100388241819, "grad_norm": 0.1952909678220749, "learning_rate": 2.2314068276748188e-05, "loss": 0.5635, "step": 1613 }, { "epoch": 0.4475873544093178, "grad_norm": 0.1981881707906723, "learning_rate": 2.231064756440912e-05, "loss": 0.5601, "step": 1614 }, { "epoch": 0.4478646699944537, "grad_norm": 0.1989242285490036, "learning_rate": 2.230722493775279e-05, "loss": 0.5635, "step": 1615 }, { "epoch": 0.44814198557958956, "grad_norm": 0.18769480288028717, "learning_rate": 2.2303800397447034e-05, "loss": 0.5589, "step": 1616 }, { "epoch": 0.44841930116472545, "grad_norm": 0.19833675026893616, "learning_rate": 2.230037394416007e-05, "loss": 0.5622, "step": 1617 }, { "epoch": 0.44869661674986133, "grad_norm": 0.19801415503025055, "learning_rate": 2.2296945578560498e-05, "loss": 0.5862, "step": 1618 }, { "epoch": 0.4489739323349972, "grad_norm": 0.1936810463666916, "learning_rate": 2.2293515301317274e-05, "loss": 0.5452, "step": 1619 }, { "epoch": 0.4492512479201331, "grad_norm": 0.20558279752731323, "learning_rate": 2.2290083113099748e-05, "loss": 0.5573, "step": 1620 }, { "epoch": 0.449528563505269, "grad_norm": 0.2063780575990677, "learning_rate": 2.2286649014577615e-05, "loss": 0.6017, "step": 1621 }, { "epoch": 0.44980587909040487, "grad_norm": 0.18592418730258942, "learning_rate": 2.2283213006420973e-05, "loss": 0.5421, "step": 1622 }, { "epoch": 0.45008319467554075, "grad_norm": 0.19346946477890015, "learning_rate": 2.227977508930027e-05, "loss": 0.53, "step": 1623 }, { "epoch": 0.45036051026067664, "grad_norm": 0.2029414027929306, "learning_rate": 2.2276335263886336e-05, "loss": 0.5808, "step": 1624 }, { "epoch": 0.4506378258458125, "grad_norm": 0.21214988827705383, "learning_rate": 2.2272893530850373e-05, "loss": 0.564, "step": 1625 }, { "epoch": 0.4509151414309484, "grad_norm": 0.19748808443546295, "learning_rate": 2.2269449890863956e-05, "loss": 0.5562, "step": 1626 }, { "epoch": 0.4511924570160843, "grad_norm": 0.19283835589885712, "learning_rate": 2.2266004344599028e-05, "loss": 0.5511, "step": 1627 }, { "epoch": 0.4514697726012202, "grad_norm": 0.1917610466480255, "learning_rate": 2.2262556892727904e-05, "loss": 0.5744, "step": 1628 }, { "epoch": 0.45174708818635606, "grad_norm": 0.20705603063106537, "learning_rate": 2.225910753592328e-05, "loss": 0.5786, "step": 1629 }, { "epoch": 0.45202440377149194, "grad_norm": 0.19348299503326416, "learning_rate": 2.225565627485821e-05, "loss": 0.5899, "step": 1630 }, { "epoch": 0.45230171935662783, "grad_norm": 0.19649489223957062, "learning_rate": 2.2252203110206134e-05, "loss": 0.5317, "step": 1631 }, { "epoch": 0.4525790349417637, "grad_norm": 0.18169252574443817, "learning_rate": 2.224874804264085e-05, "loss": 0.5243, "step": 1632 }, { "epoch": 0.4528563505268996, "grad_norm": 0.188733771443367, "learning_rate": 2.224529107283653e-05, "loss": 0.5376, "step": 1633 }, { "epoch": 0.4531336661120355, "grad_norm": 0.18059036135673523, "learning_rate": 2.2241832201467727e-05, "loss": 0.5603, "step": 1634 }, { "epoch": 0.45341098169717137, "grad_norm": 0.19139643013477325, "learning_rate": 2.223837142920936e-05, "loss": 0.5603, "step": 1635 }, { "epoch": 0.45368829728230725, "grad_norm": 0.19182981550693512, "learning_rate": 2.2234908756736712e-05, "loss": 0.5805, "step": 1636 }, { "epoch": 0.45396561286744314, "grad_norm": 0.187539204955101, "learning_rate": 2.223144418472544e-05, "loss": 0.5546, "step": 1637 }, { "epoch": 0.454242928452579, "grad_norm": 0.19611942768096924, "learning_rate": 2.2227977713851587e-05, "loss": 0.5349, "step": 1638 }, { "epoch": 0.4545202440377149, "grad_norm": 0.19672465324401855, "learning_rate": 2.2224509344791536e-05, "loss": 0.5342, "step": 1639 }, { "epoch": 0.4547975596228508, "grad_norm": 0.17679478228092194, "learning_rate": 2.222103907822207e-05, "loss": 0.5473, "step": 1640 }, { "epoch": 0.4550748752079867, "grad_norm": 0.1936340481042862, "learning_rate": 2.2217566914820322e-05, "loss": 0.5543, "step": 1641 }, { "epoch": 0.45535219079312256, "grad_norm": 0.18610352277755737, "learning_rate": 2.2214092855263813e-05, "loss": 0.5412, "step": 1642 }, { "epoch": 0.45562950637825844, "grad_norm": 0.18969598412513733, "learning_rate": 2.2210616900230412e-05, "loss": 0.5707, "step": 1643 }, { "epoch": 0.4559068219633943, "grad_norm": 0.18808507919311523, "learning_rate": 2.220713905039838e-05, "loss": 0.5288, "step": 1644 }, { "epoch": 0.4561841375485302, "grad_norm": 0.19103705883026123, "learning_rate": 2.220365930644633e-05, "loss": 0.5925, "step": 1645 }, { "epoch": 0.4564614531336661, "grad_norm": 0.1837342530488968, "learning_rate": 2.2200177669053258e-05, "loss": 0.5893, "step": 1646 }, { "epoch": 0.456738768718802, "grad_norm": 0.1928233504295349, "learning_rate": 2.2196694138898517e-05, "loss": 0.5445, "step": 1647 }, { "epoch": 0.45701608430393786, "grad_norm": 0.195438414812088, "learning_rate": 2.2193208716661846e-05, "loss": 0.5561, "step": 1648 }, { "epoch": 0.45729339988907375, "grad_norm": 0.18883578479290009, "learning_rate": 2.2189721403023334e-05, "loss": 0.5463, "step": 1649 }, { "epoch": 0.45757071547420963, "grad_norm": 0.19664356112480164, "learning_rate": 2.2186232198663455e-05, "loss": 0.576, "step": 1650 }, { "epoch": 0.4578480310593455, "grad_norm": 0.5387895703315735, "learning_rate": 2.218274110426304e-05, "loss": 0.562, "step": 1651 }, { "epoch": 0.4581253466444814, "grad_norm": 0.18329556286334991, "learning_rate": 2.21792481205033e-05, "loss": 0.5737, "step": 1652 }, { "epoch": 0.4584026622296173, "grad_norm": 0.20161500573158264, "learning_rate": 2.21757532480658e-05, "loss": 0.5711, "step": 1653 }, { "epoch": 0.45867997781475317, "grad_norm": 0.20053423941135406, "learning_rate": 2.2172256487632488e-05, "loss": 0.5842, "step": 1654 }, { "epoch": 0.45895729339988905, "grad_norm": 0.1900150030851364, "learning_rate": 2.2168757839885672e-05, "loss": 0.5702, "step": 1655 }, { "epoch": 0.45923460898502494, "grad_norm": 0.19815103709697723, "learning_rate": 2.2165257305508035e-05, "loss": 0.5661, "step": 1656 }, { "epoch": 0.4595119245701608, "grad_norm": 0.18877148628234863, "learning_rate": 2.2161754885182623e-05, "loss": 0.5278, "step": 1657 }, { "epoch": 0.4597892401552967, "grad_norm": 0.1913890838623047, "learning_rate": 2.215825057959285e-05, "loss": 0.5342, "step": 1658 }, { "epoch": 0.4600665557404326, "grad_norm": 0.18911883234977722, "learning_rate": 2.2154744389422493e-05, "loss": 0.5473, "step": 1659 }, { "epoch": 0.4603438713255685, "grad_norm": 0.2087319940328598, "learning_rate": 2.2151236315355714e-05, "loss": 0.5839, "step": 1660 }, { "epoch": 0.46062118691070436, "grad_norm": 0.19037386775016785, "learning_rate": 2.214772635807702e-05, "loss": 0.5518, "step": 1661 }, { "epoch": 0.46089850249584025, "grad_norm": 0.1889956146478653, "learning_rate": 2.2144214518271307e-05, "loss": 0.5527, "step": 1662 }, { "epoch": 0.46117581808097613, "grad_norm": 0.24889707565307617, "learning_rate": 2.214070079662382e-05, "loss": 0.5766, "step": 1663 }, { "epoch": 0.461453133666112, "grad_norm": 0.19265350699424744, "learning_rate": 2.213718519382018e-05, "loss": 0.5721, "step": 1664 }, { "epoch": 0.4617304492512479, "grad_norm": 0.19039899110794067, "learning_rate": 2.213366771054638e-05, "loss": 0.5678, "step": 1665 }, { "epoch": 0.4620077648363838, "grad_norm": 0.203168585896492, "learning_rate": 2.2130148347488773e-05, "loss": 0.5486, "step": 1666 }, { "epoch": 0.46228508042151967, "grad_norm": 0.19051045179367065, "learning_rate": 2.2126627105334073e-05, "loss": 0.544, "step": 1667 }, { "epoch": 0.46256239600665555, "grad_norm": 0.18888403475284576, "learning_rate": 2.212310398476937e-05, "loss": 0.5436, "step": 1668 }, { "epoch": 0.46283971159179144, "grad_norm": 0.1800607144832611, "learning_rate": 2.2119578986482127e-05, "loss": 0.5659, "step": 1669 }, { "epoch": 0.4631170271769273, "grad_norm": 0.19722715020179749, "learning_rate": 2.211605211116015e-05, "loss": 0.5582, "step": 1670 }, { "epoch": 0.4633943427620632, "grad_norm": 0.19088581204414368, "learning_rate": 2.2112523359491637e-05, "loss": 0.5471, "step": 1671 }, { "epoch": 0.4636716583471991, "grad_norm": 0.19284148514270782, "learning_rate": 2.210899273216514e-05, "loss": 0.5408, "step": 1672 }, { "epoch": 0.463948973932335, "grad_norm": 0.1880495250225067, "learning_rate": 2.2105460229869574e-05, "loss": 0.561, "step": 1673 }, { "epoch": 0.46422628951747086, "grad_norm": 0.186056986451149, "learning_rate": 2.2101925853294226e-05, "loss": 0.5811, "step": 1674 }, { "epoch": 0.46450360510260674, "grad_norm": 0.1980651617050171, "learning_rate": 2.2098389603128744e-05, "loss": 0.5456, "step": 1675 }, { "epoch": 0.4647809206877426, "grad_norm": 0.2039021998643875, "learning_rate": 2.2094851480063143e-05, "loss": 0.6018, "step": 1676 }, { "epoch": 0.4650582362728785, "grad_norm": 0.19567370414733887, "learning_rate": 2.2091311484787815e-05, "loss": 0.5499, "step": 1677 }, { "epoch": 0.4653355518580144, "grad_norm": 0.19234254956245422, "learning_rate": 2.208776961799349e-05, "loss": 0.5522, "step": 1678 }, { "epoch": 0.4656128674431503, "grad_norm": 0.1954323649406433, "learning_rate": 2.20842258803713e-05, "loss": 0.5702, "step": 1679 }, { "epoch": 0.46589018302828616, "grad_norm": 0.1974237710237503, "learning_rate": 2.20806802726127e-05, "loss": 0.5604, "step": 1680 }, { "epoch": 0.46616749861342205, "grad_norm": 0.19580432772636414, "learning_rate": 2.2077132795409552e-05, "loss": 0.5184, "step": 1681 }, { "epoch": 0.46644481419855793, "grad_norm": 0.19890138506889343, "learning_rate": 2.207358344945405e-05, "loss": 0.5767, "step": 1682 }, { "epoch": 0.4667221297836938, "grad_norm": 0.18458129465579987, "learning_rate": 2.2070032235438776e-05, "loss": 0.5316, "step": 1683 }, { "epoch": 0.4669994453688297, "grad_norm": 0.1938208043575287, "learning_rate": 2.206647915405665e-05, "loss": 0.5809, "step": 1684 }, { "epoch": 0.4672767609539656, "grad_norm": 0.20221275091171265, "learning_rate": 2.206292420600099e-05, "loss": 0.5731, "step": 1685 }, { "epoch": 0.46755407653910147, "grad_norm": 0.194106787443161, "learning_rate": 2.205936739196545e-05, "loss": 0.5624, "step": 1686 }, { "epoch": 0.46783139212423736, "grad_norm": 0.1918519288301468, "learning_rate": 2.205580871264406e-05, "loss": 0.535, "step": 1687 }, { "epoch": 0.46810870770937324, "grad_norm": 0.19345992803573608, "learning_rate": 2.2052248168731216e-05, "loss": 0.5686, "step": 1688 }, { "epoch": 0.4683860232945091, "grad_norm": 0.20070746541023254, "learning_rate": 2.2048685760921674e-05, "loss": 0.567, "step": 1689 }, { "epoch": 0.468663338879645, "grad_norm": 0.1972619593143463, "learning_rate": 2.204512148991055e-05, "loss": 0.555, "step": 1690 }, { "epoch": 0.46894065446478095, "grad_norm": 0.19085273146629333, "learning_rate": 2.2041555356393327e-05, "loss": 0.5985, "step": 1691 }, { "epoch": 0.46921797004991683, "grad_norm": 0.20294739305973053, "learning_rate": 2.2037987361065855e-05, "loss": 0.5762, "step": 1692 }, { "epoch": 0.4694952856350527, "grad_norm": 0.1896994560956955, "learning_rate": 2.203441750462435e-05, "loss": 0.5857, "step": 1693 }, { "epoch": 0.4697726012201886, "grad_norm": 0.19331398606300354, "learning_rate": 2.2030845787765377e-05, "loss": 0.5654, "step": 1694 }, { "epoch": 0.4700499168053245, "grad_norm": 0.20668423175811768, "learning_rate": 2.2027272211185875e-05, "loss": 0.5812, "step": 1695 }, { "epoch": 0.47032723239046037, "grad_norm": 0.1910814493894577, "learning_rate": 2.2023696775583146e-05, "loss": 0.5479, "step": 1696 }, { "epoch": 0.47060454797559625, "grad_norm": 0.1902536004781723, "learning_rate": 2.2020119481654848e-05, "loss": 0.5647, "step": 1697 }, { "epoch": 0.47088186356073214, "grad_norm": 0.1955711394548416, "learning_rate": 2.201654033009901e-05, "loss": 0.581, "step": 1698 }, { "epoch": 0.471159179145868, "grad_norm": 0.1895892173051834, "learning_rate": 2.2012959321614018e-05, "loss": 0.5658, "step": 1699 }, { "epoch": 0.4714364947310039, "grad_norm": 0.1803213357925415, "learning_rate": 2.2009376456898622e-05, "loss": 0.558, "step": 1700 }, { "epoch": 0.4717138103161398, "grad_norm": 0.19255508482456207, "learning_rate": 2.200579173665193e-05, "loss": 0.5649, "step": 1701 }, { "epoch": 0.4719911259012757, "grad_norm": 0.20155562460422516, "learning_rate": 2.2002205161573426e-05, "loss": 0.5592, "step": 1702 }, { "epoch": 0.47226844148641156, "grad_norm": 0.20530745387077332, "learning_rate": 2.1998616732362935e-05, "loss": 0.5677, "step": 1703 }, { "epoch": 0.47254575707154745, "grad_norm": 0.2158524990081787, "learning_rate": 2.1995026449720657e-05, "loss": 0.5476, "step": 1704 }, { "epoch": 0.47282307265668333, "grad_norm": 0.2383381724357605, "learning_rate": 2.1991434314347155e-05, "loss": 0.5413, "step": 1705 }, { "epoch": 0.4731003882418192, "grad_norm": 0.19787725806236267, "learning_rate": 2.1987840326943343e-05, "loss": 0.5637, "step": 1706 }, { "epoch": 0.4733777038269551, "grad_norm": 0.18500499427318573, "learning_rate": 2.1984244488210508e-05, "loss": 0.5333, "step": 1707 }, { "epoch": 0.473655019412091, "grad_norm": 0.19429421424865723, "learning_rate": 2.1980646798850295e-05, "loss": 0.5611, "step": 1708 }, { "epoch": 0.47393233499722687, "grad_norm": 0.18553559482097626, "learning_rate": 2.197704725956471e-05, "loss": 0.5516, "step": 1709 }, { "epoch": 0.47420965058236275, "grad_norm": 0.1934727132320404, "learning_rate": 2.197344587105611e-05, "loss": 0.5464, "step": 1710 }, { "epoch": 0.47448696616749864, "grad_norm": 0.20638912916183472, "learning_rate": 2.1969842634027233e-05, "loss": 0.5664, "step": 1711 }, { "epoch": 0.4747642817526345, "grad_norm": 0.19581542909145355, "learning_rate": 2.196623754918115e-05, "loss": 0.5597, "step": 1712 }, { "epoch": 0.4750415973377704, "grad_norm": 0.19786013662815094, "learning_rate": 2.1962630617221325e-05, "loss": 0.5729, "step": 1713 }, { "epoch": 0.4753189129229063, "grad_norm": 0.19448676705360413, "learning_rate": 2.1959021838851556e-05, "loss": 0.5573, "step": 1714 }, { "epoch": 0.4755962285080422, "grad_norm": 0.2131812423467636, "learning_rate": 2.1955411214776015e-05, "loss": 0.5862, "step": 1715 }, { "epoch": 0.47587354409317806, "grad_norm": 0.2251943200826645, "learning_rate": 2.195179874569923e-05, "loss": 0.5847, "step": 1716 }, { "epoch": 0.47615085967831394, "grad_norm": 0.20042872428894043, "learning_rate": 2.1948184432326084e-05, "loss": 0.5742, "step": 1717 }, { "epoch": 0.4764281752634498, "grad_norm": 0.193080872297287, "learning_rate": 2.1944568275361838e-05, "loss": 0.5441, "step": 1718 }, { "epoch": 0.4767054908485857, "grad_norm": 0.20040108263492584, "learning_rate": 2.194095027551209e-05, "loss": 0.5635, "step": 1719 }, { "epoch": 0.4769828064337216, "grad_norm": 0.19599542021751404, "learning_rate": 2.193733043348281e-05, "loss": 0.5852, "step": 1720 }, { "epoch": 0.4772601220188575, "grad_norm": 0.1938834935426712, "learning_rate": 2.1933708749980324e-05, "loss": 0.5644, "step": 1721 }, { "epoch": 0.47753743760399336, "grad_norm": 0.19517837464809418, "learning_rate": 2.1930085225711317e-05, "loss": 0.5568, "step": 1722 }, { "epoch": 0.47781475318912925, "grad_norm": 0.1954992413520813, "learning_rate": 2.1926459861382843e-05, "loss": 0.5459, "step": 1723 }, { "epoch": 0.47809206877426513, "grad_norm": 0.17570015788078308, "learning_rate": 2.1922832657702297e-05, "loss": 0.539, "step": 1724 }, { "epoch": 0.478369384359401, "grad_norm": 0.24834416806697845, "learning_rate": 2.1919203615377442e-05, "loss": 0.5401, "step": 1725 }, { "epoch": 0.4786466999445369, "grad_norm": 0.19633722305297852, "learning_rate": 2.1915572735116413e-05, "loss": 0.5469, "step": 1726 }, { "epoch": 0.4789240155296728, "grad_norm": 0.19104620814323425, "learning_rate": 2.1911940017627676e-05, "loss": 0.5472, "step": 1727 }, { "epoch": 0.47920133111480867, "grad_norm": 0.18845802545547485, "learning_rate": 2.1908305463620084e-05, "loss": 0.5742, "step": 1728 }, { "epoch": 0.47947864669994456, "grad_norm": 0.20180946588516235, "learning_rate": 2.190466907380282e-05, "loss": 0.5402, "step": 1729 }, { "epoch": 0.47975596228508044, "grad_norm": 0.19500130414962769, "learning_rate": 2.190103084888545e-05, "loss": 0.5476, "step": 1730 }, { "epoch": 0.4800332778702163, "grad_norm": 0.1933142989873886, "learning_rate": 2.1897390789577887e-05, "loss": 0.5426, "step": 1731 }, { "epoch": 0.4803105934553522, "grad_norm": 0.1977783739566803, "learning_rate": 2.1893748896590404e-05, "loss": 0.5614, "step": 1732 }, { "epoch": 0.4805879090404881, "grad_norm": 0.2301134616136551, "learning_rate": 2.1890105170633624e-05, "loss": 0.564, "step": 1733 }, { "epoch": 0.480865224625624, "grad_norm": 0.219647616147995, "learning_rate": 2.1886459612418542e-05, "loss": 0.5289, "step": 1734 }, { "epoch": 0.48114254021075986, "grad_norm": 0.19821031391620636, "learning_rate": 2.18828122226565e-05, "loss": 0.5903, "step": 1735 }, { "epoch": 0.48141985579589575, "grad_norm": 0.24226263165473938, "learning_rate": 2.18791630020592e-05, "loss": 0.5795, "step": 1736 }, { "epoch": 0.48169717138103163, "grad_norm": 0.200203076004982, "learning_rate": 2.18755119513387e-05, "loss": 0.568, "step": 1737 }, { "epoch": 0.4819744869661675, "grad_norm": 0.19729411602020264, "learning_rate": 2.1871859071207425e-05, "loss": 0.5633, "step": 1738 }, { "epoch": 0.4822518025513034, "grad_norm": 0.19362856447696686, "learning_rate": 2.1868204362378136e-05, "loss": 0.5709, "step": 1739 }, { "epoch": 0.4825291181364393, "grad_norm": 0.21311257779598236, "learning_rate": 2.1864547825563968e-05, "loss": 0.5481, "step": 1740 }, { "epoch": 0.48280643372157517, "grad_norm": 0.1957651972770691, "learning_rate": 2.1860889461478416e-05, "loss": 0.5481, "step": 1741 }, { "epoch": 0.48308374930671105, "grad_norm": 0.20088225603103638, "learning_rate": 2.1857229270835316e-05, "loss": 0.5774, "step": 1742 }, { "epoch": 0.48336106489184694, "grad_norm": 0.19715815782546997, "learning_rate": 2.1853567254348873e-05, "loss": 0.5737, "step": 1743 }, { "epoch": 0.4836383804769828, "grad_norm": 0.2071049064397812, "learning_rate": 2.184990341273364e-05, "loss": 0.5745, "step": 1744 }, { "epoch": 0.4839156960621187, "grad_norm": 0.19320468604564667, "learning_rate": 2.1846237746704526e-05, "loss": 0.5662, "step": 1745 }, { "epoch": 0.4841930116472546, "grad_norm": 0.18520797789096832, "learning_rate": 2.1842570256976807e-05, "loss": 0.5808, "step": 1746 }, { "epoch": 0.4844703272323905, "grad_norm": 0.20254836976528168, "learning_rate": 2.18389009442661e-05, "loss": 0.5558, "step": 1747 }, { "epoch": 0.48474764281752636, "grad_norm": 0.2153664231300354, "learning_rate": 2.1835229809288393e-05, "loss": 0.5661, "step": 1748 }, { "epoch": 0.48502495840266224, "grad_norm": 0.1884908676147461, "learning_rate": 2.183155685276002e-05, "loss": 0.5577, "step": 1749 }, { "epoch": 0.4853022739877981, "grad_norm": 0.19069699943065643, "learning_rate": 2.1827882075397664e-05, "loss": 0.5417, "step": 1750 }, { "epoch": 0.485579589572934, "grad_norm": 0.2055320143699646, "learning_rate": 2.182420547791838e-05, "loss": 0.5887, "step": 1751 }, { "epoch": 0.4858569051580699, "grad_norm": 0.19550320506095886, "learning_rate": 2.182052706103957e-05, "loss": 0.5348, "step": 1752 }, { "epoch": 0.4861342207432058, "grad_norm": 0.19508466124534607, "learning_rate": 2.1816846825478988e-05, "loss": 0.5506, "step": 1753 }, { "epoch": 0.48641153632834166, "grad_norm": 0.18409934639930725, "learning_rate": 2.181316477195474e-05, "loss": 0.5629, "step": 1754 }, { "epoch": 0.48668885191347755, "grad_norm": 0.18185651302337646, "learning_rate": 2.1809480901185302e-05, "loss": 0.5471, "step": 1755 }, { "epoch": 0.48696616749861343, "grad_norm": 0.18371707201004028, "learning_rate": 2.180579521388949e-05, "loss": 0.5747, "step": 1756 }, { "epoch": 0.4872434830837493, "grad_norm": 0.1873805671930313, "learning_rate": 2.1802107710786476e-05, "loss": 0.5606, "step": 1757 }, { "epoch": 0.4875207986688852, "grad_norm": 0.19243435561656952, "learning_rate": 2.1798418392595794e-05, "loss": 0.5638, "step": 1758 }, { "epoch": 0.4877981142540211, "grad_norm": 0.184648796916008, "learning_rate": 2.179472726003733e-05, "loss": 0.5584, "step": 1759 }, { "epoch": 0.48807542983915697, "grad_norm": 0.17338646948337555, "learning_rate": 2.1791034313831316e-05, "loss": 0.556, "step": 1760 }, { "epoch": 0.48835274542429286, "grad_norm": 0.18127377331256866, "learning_rate": 2.1787339554698344e-05, "loss": 0.5631, "step": 1761 }, { "epoch": 0.48863006100942874, "grad_norm": 0.1888059824705124, "learning_rate": 2.1783642983359364e-05, "loss": 0.5611, "step": 1762 }, { "epoch": 0.4889073765945646, "grad_norm": 0.17712418735027313, "learning_rate": 2.1779944600535672e-05, "loss": 0.5462, "step": 1763 }, { "epoch": 0.4891846921797005, "grad_norm": 0.19322241842746735, "learning_rate": 2.177624440694892e-05, "loss": 0.5511, "step": 1764 }, { "epoch": 0.4894620077648364, "grad_norm": 0.18574179708957672, "learning_rate": 2.1772542403321118e-05, "loss": 0.5531, "step": 1765 }, { "epoch": 0.4897393233499723, "grad_norm": 0.18718282878398895, "learning_rate": 2.1768838590374617e-05, "loss": 0.5683, "step": 1766 }, { "epoch": 0.49001663893510816, "grad_norm": 0.18598803877830505, "learning_rate": 2.1765132968832135e-05, "loss": 0.5488, "step": 1767 }, { "epoch": 0.49029395452024405, "grad_norm": 0.18899311125278473, "learning_rate": 2.1761425539416737e-05, "loss": 0.5449, "step": 1768 }, { "epoch": 0.49057127010537993, "grad_norm": 0.1895790696144104, "learning_rate": 2.175771630285184e-05, "loss": 0.58, "step": 1769 }, { "epoch": 0.4908485856905158, "grad_norm": 0.17997822165489197, "learning_rate": 2.1754005259861217e-05, "loss": 0.5734, "step": 1770 }, { "epoch": 0.4911259012756517, "grad_norm": 0.19107869267463684, "learning_rate": 2.175029241116898e-05, "loss": 0.5707, "step": 1771 }, { "epoch": 0.4914032168607876, "grad_norm": 0.22478626668453217, "learning_rate": 2.1746577757499613e-05, "loss": 0.5667, "step": 1772 }, { "epoch": 0.49168053244592347, "grad_norm": 0.2036799043416977, "learning_rate": 2.1742861299577947e-05, "loss": 0.5505, "step": 1773 }, { "epoch": 0.49195784803105935, "grad_norm": 0.1856662929058075, "learning_rate": 2.1739143038129152e-05, "loss": 0.538, "step": 1774 }, { "epoch": 0.49223516361619524, "grad_norm": 0.18573297560214996, "learning_rate": 2.1735422973878766e-05, "loss": 0.5507, "step": 1775 }, { "epoch": 0.4925124792013311, "grad_norm": 0.19560420513153076, "learning_rate": 2.1731701107552673e-05, "loss": 0.5395, "step": 1776 }, { "epoch": 0.492789794786467, "grad_norm": 0.18675924837589264, "learning_rate": 2.1727977439877094e-05, "loss": 0.5523, "step": 1777 }, { "epoch": 0.4930671103716029, "grad_norm": 0.19126398861408234, "learning_rate": 2.1724251971578636e-05, "loss": 0.5736, "step": 1778 }, { "epoch": 0.4933444259567388, "grad_norm": 0.17955103516578674, "learning_rate": 2.1720524703384222e-05, "loss": 0.5398, "step": 1779 }, { "epoch": 0.49362174154187466, "grad_norm": 0.1961311250925064, "learning_rate": 2.1716795636021148e-05, "loss": 0.5565, "step": 1780 }, { "epoch": 0.49389905712701054, "grad_norm": 0.1914500594139099, "learning_rate": 2.171306477021705e-05, "loss": 0.5296, "step": 1781 }, { "epoch": 0.49417637271214643, "grad_norm": 0.2172551453113556, "learning_rate": 2.170933210669992e-05, "loss": 0.5711, "step": 1782 }, { "epoch": 0.4944536882972823, "grad_norm": 0.18967878818511963, "learning_rate": 2.1705597646198098e-05, "loss": 0.5719, "step": 1783 }, { "epoch": 0.4947310038824182, "grad_norm": 0.19041703641414642, "learning_rate": 2.1701861389440277e-05, "loss": 0.5431, "step": 1784 }, { "epoch": 0.4950083194675541, "grad_norm": 0.19202065467834473, "learning_rate": 2.1698123337155503e-05, "loss": 0.5392, "step": 1785 }, { "epoch": 0.49528563505268997, "grad_norm": 0.1918521374464035, "learning_rate": 2.1694383490073162e-05, "loss": 0.5268, "step": 1786 }, { "epoch": 0.49556295063782585, "grad_norm": 0.19112561643123627, "learning_rate": 2.1690641848923004e-05, "loss": 0.5741, "step": 1787 }, { "epoch": 0.49584026622296173, "grad_norm": 0.28441137075424194, "learning_rate": 2.168689841443512e-05, "loss": 0.5628, "step": 1788 }, { "epoch": 0.4961175818080976, "grad_norm": 0.1971031278371811, "learning_rate": 2.1683153187339955e-05, "loss": 0.5336, "step": 1789 }, { "epoch": 0.4963948973932335, "grad_norm": 0.18874448537826538, "learning_rate": 2.16794061683683e-05, "loss": 0.5717, "step": 1790 }, { "epoch": 0.4966722129783694, "grad_norm": 0.19406016170978546, "learning_rate": 2.1675657358251293e-05, "loss": 0.5641, "step": 1791 }, { "epoch": 0.49694952856350527, "grad_norm": 0.19491691887378693, "learning_rate": 2.1671906757720433e-05, "loss": 0.5598, "step": 1792 }, { "epoch": 0.49722684414864116, "grad_norm": 0.1898011416196823, "learning_rate": 2.166815436750756e-05, "loss": 0.5748, "step": 1793 }, { "epoch": 0.49750415973377704, "grad_norm": 0.18792784214019775, "learning_rate": 2.1664400188344863e-05, "loss": 0.5383, "step": 1794 }, { "epoch": 0.4977814753189129, "grad_norm": 0.1921299546957016, "learning_rate": 2.1660644220964886e-05, "loss": 0.5649, "step": 1795 }, { "epoch": 0.4980587909040488, "grad_norm": 0.1881396770477295, "learning_rate": 2.1656886466100514e-05, "loss": 0.5525, "step": 1796 }, { "epoch": 0.4983361064891847, "grad_norm": 0.19252420961856842, "learning_rate": 2.1653126924484985e-05, "loss": 0.5308, "step": 1797 }, { "epoch": 0.4986134220743206, "grad_norm": 0.2611597180366516, "learning_rate": 2.1649365596851884e-05, "loss": 0.5664, "step": 1798 }, { "epoch": 0.49889073765945646, "grad_norm": 0.18851755559444427, "learning_rate": 2.164560248393515e-05, "loss": 0.5314, "step": 1799 }, { "epoch": 0.49916805324459235, "grad_norm": 0.19385330379009247, "learning_rate": 2.164183758646906e-05, "loss": 0.5615, "step": 1800 }, { "epoch": 0.49944536882972823, "grad_norm": 0.20486874878406525, "learning_rate": 2.163807090518825e-05, "loss": 0.5833, "step": 1801 }, { "epoch": 0.4997226844148641, "grad_norm": 0.21984978020191193, "learning_rate": 2.16343024408277e-05, "loss": 0.5494, "step": 1802 }, { "epoch": 0.5, "grad_norm": 0.1872473657131195, "learning_rate": 2.1630532194122733e-05, "loss": 0.5388, "step": 1803 }, { "epoch": 0.5002773155851359, "grad_norm": 0.19348090887069702, "learning_rate": 2.1626760165809022e-05, "loss": 0.5615, "step": 1804 }, { "epoch": 0.5005546311702718, "grad_norm": 0.18968060612678528, "learning_rate": 2.16229863566226e-05, "loss": 0.5401, "step": 1805 }, { "epoch": 0.5008319467554077, "grad_norm": 0.1913541853427887, "learning_rate": 2.161921076729983e-05, "loss": 0.5797, "step": 1806 }, { "epoch": 0.5011092623405435, "grad_norm": 0.18872712552547455, "learning_rate": 2.1615433398577428e-05, "loss": 0.5385, "step": 1807 }, { "epoch": 0.5013865779256794, "grad_norm": 0.19591103494167328, "learning_rate": 2.1611654251192465e-05, "loss": 0.5568, "step": 1808 }, { "epoch": 0.5016638935108153, "grad_norm": 0.19541212916374207, "learning_rate": 2.1607873325882343e-05, "loss": 0.5679, "step": 1809 }, { "epoch": 0.5019412090959512, "grad_norm": 0.1972798854112625, "learning_rate": 2.160409062338483e-05, "loss": 0.5513, "step": 1810 }, { "epoch": 0.5022185246810871, "grad_norm": 0.19507858157157898, "learning_rate": 2.1600306144438027e-05, "loss": 0.5509, "step": 1811 }, { "epoch": 0.502495840266223, "grad_norm": 0.18637265264987946, "learning_rate": 2.1596519889780387e-05, "loss": 0.5632, "step": 1812 }, { "epoch": 0.5027731558513588, "grad_norm": 0.17938151955604553, "learning_rate": 2.159273186015071e-05, "loss": 0.5583, "step": 1813 }, { "epoch": 0.5030504714364947, "grad_norm": 0.19754791259765625, "learning_rate": 2.158894205628814e-05, "loss": 0.5705, "step": 1814 }, { "epoch": 0.5033277870216306, "grad_norm": 0.19835114479064941, "learning_rate": 2.1585150478932165e-05, "loss": 0.6031, "step": 1815 }, { "epoch": 0.5036051026067665, "grad_norm": 0.1887637823820114, "learning_rate": 2.1581357128822627e-05, "loss": 0.5551, "step": 1816 }, { "epoch": 0.5038824181919024, "grad_norm": 0.18568859994411469, "learning_rate": 2.157756200669971e-05, "loss": 0.5384, "step": 1817 }, { "epoch": 0.5041597337770383, "grad_norm": 0.1866898089647293, "learning_rate": 2.1573765113303936e-05, "loss": 0.5474, "step": 1818 }, { "epoch": 0.5044370493621742, "grad_norm": 0.18115058541297913, "learning_rate": 2.156996644937618e-05, "loss": 0.5459, "step": 1819 }, { "epoch": 0.50471436494731, "grad_norm": 0.188395157456398, "learning_rate": 2.1566166015657672e-05, "loss": 0.5379, "step": 1820 }, { "epoch": 0.5049916805324459, "grad_norm": 0.2012917846441269, "learning_rate": 2.156236381288997e-05, "loss": 0.5771, "step": 1821 }, { "epoch": 0.5052689961175818, "grad_norm": 0.18014481663703918, "learning_rate": 2.1558559841814986e-05, "loss": 0.5508, "step": 1822 }, { "epoch": 0.5055463117027177, "grad_norm": 0.1956920027732849, "learning_rate": 2.1554754103174972e-05, "loss": 0.5625, "step": 1823 }, { "epoch": 0.5058236272878536, "grad_norm": 0.1985912024974823, "learning_rate": 2.1550946597712536e-05, "loss": 0.532, "step": 1824 }, { "epoch": 0.5061009428729895, "grad_norm": 0.1955011934041977, "learning_rate": 2.1547137326170613e-05, "loss": 0.5641, "step": 1825 }, { "epoch": 0.5063782584581253, "grad_norm": 0.1937127411365509, "learning_rate": 2.1543326289292497e-05, "loss": 0.5369, "step": 1826 }, { "epoch": 0.5066555740432612, "grad_norm": 0.22294695675373077, "learning_rate": 2.153951348782183e-05, "loss": 0.5754, "step": 1827 }, { "epoch": 0.5069328896283971, "grad_norm": 0.1839090883731842, "learning_rate": 2.1535698922502582e-05, "loss": 0.5344, "step": 1828 }, { "epoch": 0.507210205213533, "grad_norm": 0.18940389156341553, "learning_rate": 2.1531882594079074e-05, "loss": 0.5399, "step": 1829 }, { "epoch": 0.5074875207986689, "grad_norm": 0.19242316484451294, "learning_rate": 2.152806450329598e-05, "loss": 0.5473, "step": 1830 }, { "epoch": 0.5077648363838048, "grad_norm": 0.19500425457954407, "learning_rate": 2.1524244650898308e-05, "loss": 0.5812, "step": 1831 }, { "epoch": 0.5080421519689406, "grad_norm": 0.19228143990039825, "learning_rate": 2.1520423037631408e-05, "loss": 0.5518, "step": 1832 }, { "epoch": 0.5083194675540765, "grad_norm": 0.1868668794631958, "learning_rate": 2.1516599664240985e-05, "loss": 0.5534, "step": 1833 }, { "epoch": 0.5085967831392124, "grad_norm": 0.1996205449104309, "learning_rate": 2.151277453147308e-05, "loss": 0.5283, "step": 1834 }, { "epoch": 0.5088740987243483, "grad_norm": 0.18492724001407623, "learning_rate": 2.150894764007407e-05, "loss": 0.563, "step": 1835 }, { "epoch": 0.5091514143094842, "grad_norm": 0.1847442388534546, "learning_rate": 2.150511899079069e-05, "loss": 0.5478, "step": 1836 }, { "epoch": 0.5094287298946201, "grad_norm": 0.19469550251960754, "learning_rate": 2.1501288584370006e-05, "loss": 0.5388, "step": 1837 }, { "epoch": 0.509706045479756, "grad_norm": 0.18692530691623688, "learning_rate": 2.1497456421559436e-05, "loss": 0.523, "step": 1838 }, { "epoch": 0.5099833610648918, "grad_norm": 0.18751968443393707, "learning_rate": 2.1493622503106736e-05, "loss": 0.561, "step": 1839 }, { "epoch": 0.5102606766500277, "grad_norm": 0.18232478201389313, "learning_rate": 2.1489786829760005e-05, "loss": 0.5579, "step": 1840 }, { "epoch": 0.5105379922351636, "grad_norm": 0.20106928050518036, "learning_rate": 2.1485949402267684e-05, "loss": 0.5445, "step": 1841 }, { "epoch": 0.5108153078202995, "grad_norm": 0.19342289865016937, "learning_rate": 2.1482110221378555e-05, "loss": 0.5627, "step": 1842 }, { "epoch": 0.5110926234054354, "grad_norm": 0.1977401226758957, "learning_rate": 2.1478269287841747e-05, "loss": 0.5949, "step": 1843 }, { "epoch": 0.5113699389905713, "grad_norm": 0.18564966320991516, "learning_rate": 2.1474426602406722e-05, "loss": 0.5598, "step": 1844 }, { "epoch": 0.5116472545757071, "grad_norm": 0.19453705847263336, "learning_rate": 2.1470582165823296e-05, "loss": 0.5876, "step": 1845 }, { "epoch": 0.511924570160843, "grad_norm": 0.1874392032623291, "learning_rate": 2.146673597884162e-05, "loss": 0.5465, "step": 1846 }, { "epoch": 0.5122018857459789, "grad_norm": 0.19269202649593353, "learning_rate": 2.1462888042212183e-05, "loss": 0.5333, "step": 1847 }, { "epoch": 0.5124792013311148, "grad_norm": 0.21227677166461945, "learning_rate": 2.1459038356685824e-05, "loss": 0.5772, "step": 1848 }, { "epoch": 0.5127565169162507, "grad_norm": 0.18281084299087524, "learning_rate": 2.1455186923013716e-05, "loss": 0.5732, "step": 1849 }, { "epoch": 0.5130338325013866, "grad_norm": 0.18322256207466125, "learning_rate": 2.1451333741947373e-05, "loss": 0.5367, "step": 1850 }, { "epoch": 0.5133111480865225, "grad_norm": 0.19677676260471344, "learning_rate": 2.1447478814238658e-05, "loss": 0.5797, "step": 1851 }, { "epoch": 0.5135884636716583, "grad_norm": 0.2173527330160141, "learning_rate": 2.1443622140639768e-05, "loss": 0.5688, "step": 1852 }, { "epoch": 0.5138657792567942, "grad_norm": 0.240644633769989, "learning_rate": 2.143976372190324e-05, "loss": 0.5754, "step": 1853 }, { "epoch": 0.5141430948419301, "grad_norm": 0.19872795045375824, "learning_rate": 2.1435903558781954e-05, "loss": 0.5752, "step": 1854 }, { "epoch": 0.514420410427066, "grad_norm": 0.18520157039165497, "learning_rate": 2.143204165202914e-05, "loss": 0.5564, "step": 1855 }, { "epoch": 0.5146977260122019, "grad_norm": 0.1843482255935669, "learning_rate": 2.1428178002398342e-05, "loss": 0.536, "step": 1856 }, { "epoch": 0.5149750415973378, "grad_norm": 0.18434906005859375, "learning_rate": 2.1424312610643467e-05, "loss": 0.5722, "step": 1857 }, { "epoch": 0.5152523571824736, "grad_norm": 0.18243710696697235, "learning_rate": 2.1420445477518756e-05, "loss": 0.5134, "step": 1858 }, { "epoch": 0.5155296727676095, "grad_norm": 0.1954345405101776, "learning_rate": 2.14165766037788e-05, "loss": 0.5465, "step": 1859 }, { "epoch": 0.5158069883527454, "grad_norm": 0.1804470270872116, "learning_rate": 2.1412705990178496e-05, "loss": 0.5529, "step": 1860 }, { "epoch": 0.5160843039378813, "grad_norm": 0.19080010056495667, "learning_rate": 2.140883363747312e-05, "loss": 0.54, "step": 1861 }, { "epoch": 0.5163616195230172, "grad_norm": 0.18515829741954803, "learning_rate": 2.1404959546418268e-05, "loss": 0.5409, "step": 1862 }, { "epoch": 0.5166389351081531, "grad_norm": 0.19277918338775635, "learning_rate": 2.1401083717769876e-05, "loss": 0.5703, "step": 1863 }, { "epoch": 0.516916250693289, "grad_norm": 0.2017827183008194, "learning_rate": 2.139720615228422e-05, "loss": 0.5545, "step": 1864 }, { "epoch": 0.5171935662784248, "grad_norm": 0.19409088790416718, "learning_rate": 2.1393326850717915e-05, "loss": 0.5613, "step": 1865 }, { "epoch": 0.5174708818635607, "grad_norm": 0.18852104246616364, "learning_rate": 2.138944581382792e-05, "loss": 0.5784, "step": 1866 }, { "epoch": 0.5177481974486966, "grad_norm": 0.18418540060520172, "learning_rate": 2.1385563042371525e-05, "loss": 0.5291, "step": 1867 }, { "epoch": 0.5180255130338325, "grad_norm": 0.20222961902618408, "learning_rate": 2.138167853710636e-05, "loss": 0.5432, "step": 1868 }, { "epoch": 0.5183028286189684, "grad_norm": 0.1890316754579544, "learning_rate": 2.1377792298790396e-05, "loss": 0.5859, "step": 1869 }, { "epoch": 0.5185801442041043, "grad_norm": 0.19611713290214539, "learning_rate": 2.1373904328181946e-05, "loss": 0.5468, "step": 1870 }, { "epoch": 0.5188574597892401, "grad_norm": 0.18812295794487, "learning_rate": 2.1370014626039648e-05, "loss": 0.5356, "step": 1871 }, { "epoch": 0.519134775374376, "grad_norm": 0.18963585793972015, "learning_rate": 2.136612319312249e-05, "loss": 0.5646, "step": 1872 }, { "epoch": 0.5194120909595119, "grad_norm": 0.1885354369878769, "learning_rate": 2.1362230030189795e-05, "loss": 0.5578, "step": 1873 }, { "epoch": 0.5196894065446478, "grad_norm": 0.1933208405971527, "learning_rate": 2.1358335138001224e-05, "loss": 0.5379, "step": 1874 }, { "epoch": 0.5199667221297837, "grad_norm": 0.1998060792684555, "learning_rate": 2.1354438517316767e-05, "loss": 0.5741, "step": 1875 }, { "epoch": 0.5202440377149196, "grad_norm": 0.18762660026550293, "learning_rate": 2.135054016889676e-05, "loss": 0.5618, "step": 1876 }, { "epoch": 0.5205213533000554, "grad_norm": 0.2009068727493286, "learning_rate": 2.1346640093501872e-05, "loss": 0.5499, "step": 1877 }, { "epoch": 0.5207986688851913, "grad_norm": 0.1818576157093048, "learning_rate": 2.1342738291893122e-05, "loss": 0.5276, "step": 1878 }, { "epoch": 0.5210759844703272, "grad_norm": 0.2910788357257843, "learning_rate": 2.1338834764831845e-05, "loss": 0.5333, "step": 1879 }, { "epoch": 0.5213533000554631, "grad_norm": 0.20181319117546082, "learning_rate": 2.1334929513079722e-05, "loss": 0.567, "step": 1880 }, { "epoch": 0.521630615640599, "grad_norm": 0.19496895372867584, "learning_rate": 2.133102253739878e-05, "loss": 0.5369, "step": 1881 }, { "epoch": 0.5219079312257349, "grad_norm": 0.18957465887069702, "learning_rate": 2.1327113838551362e-05, "loss": 0.5359, "step": 1882 }, { "epoch": 0.5221852468108708, "grad_norm": 0.19057804346084595, "learning_rate": 2.132320341730017e-05, "loss": 0.5595, "step": 1883 }, { "epoch": 0.5224625623960066, "grad_norm": 0.18731848895549774, "learning_rate": 2.131929127440822e-05, "loss": 0.5258, "step": 1884 }, { "epoch": 0.5227398779811425, "grad_norm": 0.19227994978427887, "learning_rate": 2.131537741063888e-05, "loss": 0.5927, "step": 1885 }, { "epoch": 0.5230171935662784, "grad_norm": 0.18374542891979218, "learning_rate": 2.1311461826755847e-05, "loss": 0.543, "step": 1886 }, { "epoch": 0.5232945091514143, "grad_norm": 0.1936596930027008, "learning_rate": 2.1307544523523156e-05, "loss": 0.5677, "step": 1887 }, { "epoch": 0.5235718247365502, "grad_norm": 0.18285861611366272, "learning_rate": 2.1303625501705183e-05, "loss": 0.5738, "step": 1888 }, { "epoch": 0.5238491403216861, "grad_norm": 0.23504310846328735, "learning_rate": 2.1299704762066618e-05, "loss": 0.5785, "step": 1889 }, { "epoch": 0.5241264559068219, "grad_norm": 0.1995551437139511, "learning_rate": 2.129578230537252e-05, "loss": 0.5637, "step": 1890 }, { "epoch": 0.5244037714919578, "grad_norm": 0.18941858410835266, "learning_rate": 2.1291858132388248e-05, "loss": 0.5523, "step": 1891 }, { "epoch": 0.5246810870770937, "grad_norm": 0.18262585997581482, "learning_rate": 2.1287932243879523e-05, "loss": 0.5587, "step": 1892 }, { "epoch": 0.5249584026622296, "grad_norm": 0.1891373097896576, "learning_rate": 2.1284004640612376e-05, "loss": 0.5494, "step": 1893 }, { "epoch": 0.5252357182473655, "grad_norm": 0.1890500783920288, "learning_rate": 2.1280075323353206e-05, "loss": 0.5411, "step": 1894 }, { "epoch": 0.5255130338325014, "grad_norm": 0.182157963514328, "learning_rate": 2.127614429286871e-05, "loss": 0.5265, "step": 1895 }, { "epoch": 0.5257903494176372, "grad_norm": 0.18624311685562134, "learning_rate": 2.1272211549925946e-05, "loss": 0.5309, "step": 1896 }, { "epoch": 0.5260676650027731, "grad_norm": 0.19566656649112701, "learning_rate": 2.1268277095292292e-05, "loss": 0.5694, "step": 1897 }, { "epoch": 0.526344980587909, "grad_norm": 0.19160917401313782, "learning_rate": 2.1264340929735467e-05, "loss": 0.5324, "step": 1898 }, { "epoch": 0.5266222961730449, "grad_norm": 0.18548499047756195, "learning_rate": 2.126040305402352e-05, "loss": 0.5326, "step": 1899 }, { "epoch": 0.5268996117581808, "grad_norm": 0.18145646154880524, "learning_rate": 2.1256463468924837e-05, "loss": 0.5491, "step": 1900 }, { "epoch": 0.5271769273433167, "grad_norm": 0.1829940527677536, "learning_rate": 2.125252217520813e-05, "loss": 0.5235, "step": 1901 }, { "epoch": 0.5274542429284526, "grad_norm": 0.19107188284397125, "learning_rate": 2.1248579173642453e-05, "loss": 0.539, "step": 1902 }, { "epoch": 0.5277315585135884, "grad_norm": 0.18394434452056885, "learning_rate": 2.1244634464997188e-05, "loss": 0.5672, "step": 1903 }, { "epoch": 0.5280088740987243, "grad_norm": 0.19489826261997223, "learning_rate": 2.1240688050042058e-05, "loss": 0.5514, "step": 1904 }, { "epoch": 0.5282861896838602, "grad_norm": 0.18582908809185028, "learning_rate": 2.1236739929547105e-05, "loss": 0.5484, "step": 1905 }, { "epoch": 0.5285635052689961, "grad_norm": 0.19354532659053802, "learning_rate": 2.123279010428272e-05, "loss": 0.5542, "step": 1906 }, { "epoch": 0.528840820854132, "grad_norm": 0.19419358670711517, "learning_rate": 2.1228838575019612e-05, "loss": 0.5349, "step": 1907 }, { "epoch": 0.5291181364392679, "grad_norm": 0.18452230095863342, "learning_rate": 2.1224885342528834e-05, "loss": 0.5676, "step": 1908 }, { "epoch": 0.5293954520244037, "grad_norm": 0.19760510325431824, "learning_rate": 2.1220930407581762e-05, "loss": 0.5439, "step": 1909 }, { "epoch": 0.5296727676095396, "grad_norm": 0.18620994687080383, "learning_rate": 2.121697377095011e-05, "loss": 0.5612, "step": 1910 }, { "epoch": 0.5299500831946755, "grad_norm": 0.1856573075056076, "learning_rate": 2.121301543340593e-05, "loss": 0.5267, "step": 1911 }, { "epoch": 0.5302273987798114, "grad_norm": 0.19919690489768982, "learning_rate": 2.1209055395721586e-05, "loss": 0.5499, "step": 1912 }, { "epoch": 0.5305047143649473, "grad_norm": 0.18541640043258667, "learning_rate": 2.1205093658669793e-05, "loss": 0.5693, "step": 1913 }, { "epoch": 0.5307820299500832, "grad_norm": 0.18908998370170593, "learning_rate": 2.120113022302359e-05, "loss": 0.5421, "step": 1914 }, { "epoch": 0.531059345535219, "grad_norm": 0.19005346298217773, "learning_rate": 2.119716508955635e-05, "loss": 0.5475, "step": 1915 }, { "epoch": 0.5313366611203549, "grad_norm": 0.19358742237091064, "learning_rate": 2.1193198259041774e-05, "loss": 0.5671, "step": 1916 }, { "epoch": 0.5316139767054908, "grad_norm": 0.19891729950904846, "learning_rate": 2.1189229732253894e-05, "loss": 0.5623, "step": 1917 }, { "epoch": 0.5318912922906267, "grad_norm": 0.1928594410419464, "learning_rate": 2.1185259509967082e-05, "loss": 0.5467, "step": 1918 }, { "epoch": 0.5321686078757626, "grad_norm": 0.21051675081253052, "learning_rate": 2.118128759295602e-05, "loss": 0.5504, "step": 1919 }, { "epoch": 0.5324459234608985, "grad_norm": 0.1916693150997162, "learning_rate": 2.1177313981995745e-05, "loss": 0.5376, "step": 1920 }, { "epoch": 0.5327232390460344, "grad_norm": 0.18833374977111816, "learning_rate": 2.1173338677861616e-05, "loss": 0.526, "step": 1921 }, { "epoch": 0.5330005546311702, "grad_norm": 0.1901186853647232, "learning_rate": 2.116936168132931e-05, "loss": 0.5658, "step": 1922 }, { "epoch": 0.5332778702163061, "grad_norm": 0.19015184044837952, "learning_rate": 2.1165382993174848e-05, "loss": 0.5703, "step": 1923 }, { "epoch": 0.533555185801442, "grad_norm": 0.17683614790439606, "learning_rate": 2.116140261417458e-05, "loss": 0.5384, "step": 1924 }, { "epoch": 0.5338325013865779, "grad_norm": 0.18865369260311127, "learning_rate": 2.1157420545105187e-05, "loss": 0.5473, "step": 1925 }, { "epoch": 0.5341098169717138, "grad_norm": 0.1918121576309204, "learning_rate": 2.1153436786743668e-05, "loss": 0.5587, "step": 1926 }, { "epoch": 0.5343871325568497, "grad_norm": 0.20620866119861603, "learning_rate": 2.1149451339867363e-05, "loss": 0.5358, "step": 1927 }, { "epoch": 0.5346644481419855, "grad_norm": 0.19819065928459167, "learning_rate": 2.114546420525394e-05, "loss": 0.5681, "step": 1928 }, { "epoch": 0.5349417637271214, "grad_norm": 0.19372405111789703, "learning_rate": 2.114147538368139e-05, "loss": 0.5615, "step": 1929 }, { "epoch": 0.5352190793122573, "grad_norm": 0.19177958369255066, "learning_rate": 2.1137484875928048e-05, "loss": 0.5336, "step": 1930 }, { "epoch": 0.5354963948973932, "grad_norm": 0.1978660523891449, "learning_rate": 2.1133492682772556e-05, "loss": 0.5465, "step": 1931 }, { "epoch": 0.5357737104825291, "grad_norm": 0.19225792586803436, "learning_rate": 2.1129498804993902e-05, "loss": 0.5548, "step": 1932 }, { "epoch": 0.536051026067665, "grad_norm": 0.19737133383750916, "learning_rate": 2.1125503243371398e-05, "loss": 0.5636, "step": 1933 }, { "epoch": 0.5363283416528009, "grad_norm": 0.19044062495231628, "learning_rate": 2.112150599868468e-05, "loss": 0.5595, "step": 1934 }, { "epoch": 0.5366056572379367, "grad_norm": 0.18862438201904297, "learning_rate": 2.1117507071713724e-05, "loss": 0.5594, "step": 1935 }, { "epoch": 0.5368829728230726, "grad_norm": 0.20726455748081207, "learning_rate": 2.111350646323882e-05, "loss": 0.5678, "step": 1936 }, { "epoch": 0.5371602884082085, "grad_norm": 0.1967361867427826, "learning_rate": 2.1109504174040594e-05, "loss": 0.5518, "step": 1937 }, { "epoch": 0.5374376039933444, "grad_norm": 0.19529032707214355, "learning_rate": 2.1105500204899997e-05, "loss": 0.5775, "step": 1938 }, { "epoch": 0.5377149195784803, "grad_norm": 0.19610485434532166, "learning_rate": 2.110149455659831e-05, "loss": 0.563, "step": 1939 }, { "epoch": 0.5379922351636162, "grad_norm": 0.18098145723342896, "learning_rate": 2.109748722991715e-05, "loss": 0.5401, "step": 1940 }, { "epoch": 0.538269550748752, "grad_norm": 0.21725578606128693, "learning_rate": 2.109347822563844e-05, "loss": 0.5671, "step": 1941 }, { "epoch": 0.5385468663338879, "grad_norm": 0.20272882282733917, "learning_rate": 2.108946754454445e-05, "loss": 0.5699, "step": 1942 }, { "epoch": 0.5388241819190238, "grad_norm": 0.18334084749221802, "learning_rate": 2.108545518741777e-05, "loss": 0.5508, "step": 1943 }, { "epoch": 0.5391014975041597, "grad_norm": 0.20137275755405426, "learning_rate": 2.1081441155041314e-05, "loss": 0.5315, "step": 1944 }, { "epoch": 0.5393788130892956, "grad_norm": 0.21601121127605438, "learning_rate": 2.1077425448198327e-05, "loss": 0.5415, "step": 1945 }, { "epoch": 0.5396561286744315, "grad_norm": 0.19099989533424377, "learning_rate": 2.107340806767238e-05, "loss": 0.5492, "step": 1946 }, { "epoch": 0.5399334442595674, "grad_norm": 0.18626771867275238, "learning_rate": 2.106938901424737e-05, "loss": 0.5622, "step": 1947 }, { "epoch": 0.5402107598447032, "grad_norm": 0.1979578286409378, "learning_rate": 2.1065368288707523e-05, "loss": 0.5659, "step": 1948 }, { "epoch": 0.5404880754298391, "grad_norm": 0.18457596004009247, "learning_rate": 2.1061345891837393e-05, "loss": 0.554, "step": 1949 }, { "epoch": 0.540765391014975, "grad_norm": 0.1944621354341507, "learning_rate": 2.1057321824421843e-05, "loss": 0.5574, "step": 1950 }, { "epoch": 0.5410427066001109, "grad_norm": 0.20393145084381104, "learning_rate": 2.1053296087246087e-05, "loss": 0.5557, "step": 1951 }, { "epoch": 0.5413200221852468, "grad_norm": 0.19211652874946594, "learning_rate": 2.1049268681095647e-05, "loss": 0.5626, "step": 1952 }, { "epoch": 0.5415973377703827, "grad_norm": 0.18954886496067047, "learning_rate": 2.1045239606756378e-05, "loss": 0.5481, "step": 1953 }, { "epoch": 0.5418746533555185, "grad_norm": 0.19707705080509186, "learning_rate": 2.1041208865014464e-05, "loss": 0.5435, "step": 1954 }, { "epoch": 0.5421519689406544, "grad_norm": 0.20506185293197632, "learning_rate": 2.10371764566564e-05, "loss": 0.5322, "step": 1955 }, { "epoch": 0.5424292845257903, "grad_norm": 0.19055700302124023, "learning_rate": 2.103314238246903e-05, "loss": 0.5513, "step": 1956 }, { "epoch": 0.5427066001109262, "grad_norm": 0.2124052196741104, "learning_rate": 2.102910664323949e-05, "loss": 0.574, "step": 1957 }, { "epoch": 0.5429839156960621, "grad_norm": 0.19025616347789764, "learning_rate": 2.1025069239755273e-05, "loss": 0.5342, "step": 1958 }, { "epoch": 0.543261231281198, "grad_norm": 0.18279728293418884, "learning_rate": 2.102103017280418e-05, "loss": 0.5542, "step": 1959 }, { "epoch": 0.5435385468663338, "grad_norm": 0.18862898647785187, "learning_rate": 2.101698944317434e-05, "loss": 0.5743, "step": 1960 }, { "epoch": 0.5438158624514697, "grad_norm": 0.18205633759498596, "learning_rate": 2.101294705165421e-05, "loss": 0.5597, "step": 1961 }, { "epoch": 0.5440931780366056, "grad_norm": 0.1904565840959549, "learning_rate": 2.100890299903256e-05, "loss": 0.5789, "step": 1962 }, { "epoch": 0.5443704936217415, "grad_norm": 0.19704897701740265, "learning_rate": 2.1004857286098495e-05, "loss": 0.5626, "step": 1963 }, { "epoch": 0.5446478092068774, "grad_norm": 0.1878540813922882, "learning_rate": 2.1000809913641445e-05, "loss": 0.5713, "step": 1964 }, { "epoch": 0.5449251247920133, "grad_norm": 0.2131820023059845, "learning_rate": 2.0996760882451148e-05, "loss": 0.5632, "step": 1965 }, { "epoch": 0.5452024403771492, "grad_norm": 0.19665499031543732, "learning_rate": 2.0992710193317693e-05, "loss": 0.5585, "step": 1966 }, { "epoch": 0.545479755962285, "grad_norm": 0.18704085052013397, "learning_rate": 2.0988657847031467e-05, "loss": 0.5534, "step": 1967 }, { "epoch": 0.5457570715474209, "grad_norm": 0.1849927306175232, "learning_rate": 2.0984603844383195e-05, "loss": 0.5654, "step": 1968 }, { "epoch": 0.5460343871325568, "grad_norm": 0.18812572956085205, "learning_rate": 2.0980548186163918e-05, "loss": 0.5412, "step": 1969 }, { "epoch": 0.5463117027176927, "grad_norm": 0.18602755665779114, "learning_rate": 2.0976490873165e-05, "loss": 0.5503, "step": 1970 }, { "epoch": 0.5465890183028286, "grad_norm": 0.19630911946296692, "learning_rate": 2.097243190617813e-05, "loss": 0.5687, "step": 1971 }, { "epoch": 0.5468663338879645, "grad_norm": 0.18516132235527039, "learning_rate": 2.0968371285995323e-05, "loss": 0.5561, "step": 1972 }, { "epoch": 0.5471436494731003, "grad_norm": 0.18923065066337585, "learning_rate": 2.0964309013408914e-05, "loss": 0.5477, "step": 1973 }, { "epoch": 0.5474209650582362, "grad_norm": 0.18436457216739655, "learning_rate": 2.096024508921156e-05, "loss": 0.56, "step": 1974 }, { "epoch": 0.5476982806433721, "grad_norm": 0.18705391883850098, "learning_rate": 2.095617951419624e-05, "loss": 0.536, "step": 1975 }, { "epoch": 0.547975596228508, "grad_norm": 0.18911142647266388, "learning_rate": 2.095211228915625e-05, "loss": 0.5431, "step": 1976 }, { "epoch": 0.5482529118136439, "grad_norm": 0.19025933742523193, "learning_rate": 2.0948043414885222e-05, "loss": 0.5352, "step": 1977 }, { "epoch": 0.5485302273987798, "grad_norm": 0.18421220779418945, "learning_rate": 2.0943972892177094e-05, "loss": 0.536, "step": 1978 }, { "epoch": 0.5488075429839157, "grad_norm": 0.19545422494411469, "learning_rate": 2.0939900721826132e-05, "loss": 0.5563, "step": 1979 }, { "epoch": 0.5490848585690515, "grad_norm": 0.19028547406196594, "learning_rate": 2.0935826904626937e-05, "loss": 0.5081, "step": 1980 }, { "epoch": 0.5493621741541874, "grad_norm": 0.20487383008003235, "learning_rate": 2.0931751441374406e-05, "loss": 0.5426, "step": 1981 }, { "epoch": 0.5496394897393233, "grad_norm": 0.20360392332077026, "learning_rate": 2.0927674332863774e-05, "loss": 0.5686, "step": 1982 }, { "epoch": 0.5499168053244592, "grad_norm": 0.19089289009571075, "learning_rate": 2.092359557989059e-05, "loss": 0.5574, "step": 1983 }, { "epoch": 0.5501941209095951, "grad_norm": 0.1797301024198532, "learning_rate": 2.0919515183250736e-05, "loss": 0.5666, "step": 1984 }, { "epoch": 0.550471436494731, "grad_norm": 0.19856330752372742, "learning_rate": 2.0915433143740393e-05, "loss": 0.5373, "step": 1985 }, { "epoch": 0.5507487520798668, "grad_norm": 0.19353127479553223, "learning_rate": 2.0911349462156082e-05, "loss": 0.5454, "step": 1986 }, { "epoch": 0.5510260676650027, "grad_norm": 0.19544550776481628, "learning_rate": 2.090726413929464e-05, "loss": 0.5705, "step": 1987 }, { "epoch": 0.5513033832501386, "grad_norm": 0.199398934841156, "learning_rate": 2.0903177175953216e-05, "loss": 0.5431, "step": 1988 }, { "epoch": 0.5515806988352745, "grad_norm": 0.19894284009933472, "learning_rate": 2.0899088572929286e-05, "loss": 0.5658, "step": 1989 }, { "epoch": 0.5518580144204104, "grad_norm": 0.1806151121854782, "learning_rate": 2.0894998331020645e-05, "loss": 0.5748, "step": 1990 }, { "epoch": 0.5521353300055463, "grad_norm": 0.19128967821598053, "learning_rate": 2.089090645102541e-05, "loss": 0.5576, "step": 1991 }, { "epoch": 0.5524126455906821, "grad_norm": 0.20147933065891266, "learning_rate": 2.0886812933742013e-05, "loss": 0.5738, "step": 1992 }, { "epoch": 0.552689961175818, "grad_norm": 0.17531876266002655, "learning_rate": 2.0882717779969207e-05, "loss": 0.544, "step": 1993 }, { "epoch": 0.5529672767609539, "grad_norm": 0.19415581226348877, "learning_rate": 2.087862099050607e-05, "loss": 0.5315, "step": 1994 }, { "epoch": 0.5532445923460898, "grad_norm": 0.22229987382888794, "learning_rate": 2.087452256615199e-05, "loss": 0.5324, "step": 1995 }, { "epoch": 0.5535219079312257, "grad_norm": 0.18078093230724335, "learning_rate": 2.0870422507706676e-05, "loss": 0.5378, "step": 1996 }, { "epoch": 0.5537992235163616, "grad_norm": 0.18564875423908234, "learning_rate": 2.0866320815970157e-05, "loss": 0.5238, "step": 1997 }, { "epoch": 0.5540765391014975, "grad_norm": 0.19045381247997284, "learning_rate": 2.086221749174279e-05, "loss": 0.5527, "step": 1998 }, { "epoch": 0.5543538546866333, "grad_norm": 0.18606480956077576, "learning_rate": 2.0858112535825242e-05, "loss": 0.5132, "step": 1999 }, { "epoch": 0.5546311702717692, "grad_norm": 0.19520308077335358, "learning_rate": 2.0854005949018487e-05, "loss": 0.554, "step": 2000 }, { "epoch": 0.5549084858569051, "grad_norm": 0.19881142675876617, "learning_rate": 2.0849897732123838e-05, "loss": 0.5327, "step": 2001 }, { "epoch": 0.555185801442041, "grad_norm": 0.18831515312194824, "learning_rate": 2.0845787885942917e-05, "loss": 0.5541, "step": 2002 }, { "epoch": 0.5554631170271769, "grad_norm": 0.1954246610403061, "learning_rate": 2.0841676411277662e-05, "loss": 0.5744, "step": 2003 }, { "epoch": 0.5557404326123128, "grad_norm": 0.20168912410736084, "learning_rate": 2.0837563308930325e-05, "loss": 0.5704, "step": 2004 }, { "epoch": 0.5560177481974486, "grad_norm": 0.19541358947753906, "learning_rate": 2.0833448579703492e-05, "loss": 0.5555, "step": 2005 }, { "epoch": 0.5562950637825845, "grad_norm": 0.20079733431339264, "learning_rate": 2.082933222440005e-05, "loss": 0.5394, "step": 2006 }, { "epoch": 0.5565723793677204, "grad_norm": 0.1951141506433487, "learning_rate": 2.082521424382321e-05, "loss": 0.5314, "step": 2007 }, { "epoch": 0.5568496949528563, "grad_norm": 0.1864672750234604, "learning_rate": 2.0821094638776497e-05, "loss": 0.5365, "step": 2008 }, { "epoch": 0.5571270105379922, "grad_norm": 0.18978318572044373, "learning_rate": 2.0816973410063754e-05, "loss": 0.5199, "step": 2009 }, { "epoch": 0.5574043261231281, "grad_norm": 0.1931847780942917, "learning_rate": 2.0812850558489153e-05, "loss": 0.5701, "step": 2010 }, { "epoch": 0.557681641708264, "grad_norm": 0.18549029529094696, "learning_rate": 2.0808726084857157e-05, "loss": 0.4933, "step": 2011 }, { "epoch": 0.5579589572933998, "grad_norm": 0.18897327780723572, "learning_rate": 2.0804599989972567e-05, "loss": 0.58, "step": 2012 }, { "epoch": 0.5582362728785357, "grad_norm": 0.20237743854522705, "learning_rate": 2.0800472274640494e-05, "loss": 0.5577, "step": 2013 }, { "epoch": 0.5585135884636716, "grad_norm": 0.18905304372310638, "learning_rate": 2.0796342939666362e-05, "loss": 0.5763, "step": 2014 }, { "epoch": 0.5587909040488075, "grad_norm": 0.2023850679397583, "learning_rate": 2.079221198585592e-05, "loss": 0.5502, "step": 2015 }, { "epoch": 0.5590682196339434, "grad_norm": 0.19735179841518402, "learning_rate": 2.0788079414015215e-05, "loss": 0.5293, "step": 2016 }, { "epoch": 0.5593455352190793, "grad_norm": 0.18779948353767395, "learning_rate": 2.078394522495063e-05, "loss": 0.5524, "step": 2017 }, { "epoch": 0.5596228508042151, "grad_norm": 0.20868045091629028, "learning_rate": 2.0779809419468854e-05, "loss": 0.5724, "step": 2018 }, { "epoch": 0.559900166389351, "grad_norm": 0.21078519523143768, "learning_rate": 2.077567199837689e-05, "loss": 0.5398, "step": 2019 }, { "epoch": 0.5601774819744869, "grad_norm": 0.18992477655410767, "learning_rate": 2.0771532962482057e-05, "loss": 0.5665, "step": 2020 }, { "epoch": 0.5604547975596228, "grad_norm": 0.1859736144542694, "learning_rate": 2.0767392312591992e-05, "loss": 0.5429, "step": 2021 }, { "epoch": 0.5607321131447587, "grad_norm": 0.18972904980182648, "learning_rate": 2.0763250049514654e-05, "loss": 0.5595, "step": 2022 }, { "epoch": 0.5610094287298946, "grad_norm": 0.189973384141922, "learning_rate": 2.0759106174058293e-05, "loss": 0.5669, "step": 2023 }, { "epoch": 0.5612867443150305, "grad_norm": 0.19088168442249298, "learning_rate": 2.07549606870315e-05, "loss": 0.5654, "step": 2024 }, { "epoch": 0.5615640599001663, "grad_norm": 0.1857694834470749, "learning_rate": 2.075081358924317e-05, "loss": 0.5318, "step": 2025 }, { "epoch": 0.5618413754853022, "grad_norm": 0.19243502616882324, "learning_rate": 2.0746664881502496e-05, "loss": 0.549, "step": 2026 }, { "epoch": 0.5621186910704381, "grad_norm": 0.18214242160320282, "learning_rate": 2.0742514564619022e-05, "loss": 0.5318, "step": 2027 }, { "epoch": 0.562396006655574, "grad_norm": 0.18616682291030884, "learning_rate": 2.0738362639402574e-05, "loss": 0.5397, "step": 2028 }, { "epoch": 0.56267332224071, "grad_norm": 0.18527580797672272, "learning_rate": 2.07342091066633e-05, "loss": 0.5513, "step": 2029 }, { "epoch": 0.5629506378258459, "grad_norm": 0.20299869775772095, "learning_rate": 2.073005396721167e-05, "loss": 0.5437, "step": 2030 }, { "epoch": 0.5632279534109818, "grad_norm": 0.18229471147060394, "learning_rate": 2.072589722185846e-05, "loss": 0.5411, "step": 2031 }, { "epoch": 0.5635052689961176, "grad_norm": 0.19586963951587677, "learning_rate": 2.0721738871414763e-05, "loss": 0.5674, "step": 2032 }, { "epoch": 0.5637825845812535, "grad_norm": 0.21511641144752502, "learning_rate": 2.0717578916691977e-05, "loss": 0.5762, "step": 2033 }, { "epoch": 0.5640599001663894, "grad_norm": 0.18836332857608795, "learning_rate": 2.071341735850183e-05, "loss": 0.5599, "step": 2034 }, { "epoch": 0.5643372157515253, "grad_norm": 0.1830950826406479, "learning_rate": 2.070925419765634e-05, "loss": 0.539, "step": 2035 }, { "epoch": 0.5646145313366612, "grad_norm": 0.18749170005321503, "learning_rate": 2.070508943496786e-05, "loss": 0.5516, "step": 2036 }, { "epoch": 0.5648918469217971, "grad_norm": 0.18844857811927795, "learning_rate": 2.070092307124904e-05, "loss": 0.5301, "step": 2037 }, { "epoch": 0.5651691625069329, "grad_norm": 0.1957472264766693, "learning_rate": 2.0696755107312845e-05, "loss": 0.5612, "step": 2038 }, { "epoch": 0.5654464780920688, "grad_norm": 0.1901545226573944, "learning_rate": 2.0692585543972566e-05, "loss": 0.5529, "step": 2039 }, { "epoch": 0.5657237936772047, "grad_norm": 0.1877562254667282, "learning_rate": 2.0688414382041788e-05, "loss": 0.5324, "step": 2040 }, { "epoch": 0.5660011092623406, "grad_norm": 0.18741729855537415, "learning_rate": 2.068424162233441e-05, "loss": 0.5522, "step": 2041 }, { "epoch": 0.5662784248474765, "grad_norm": 0.18096144497394562, "learning_rate": 2.068006726566466e-05, "loss": 0.5382, "step": 2042 }, { "epoch": 0.5665557404326124, "grad_norm": 0.18831691145896912, "learning_rate": 2.0675891312847064e-05, "loss": 0.5547, "step": 2043 }, { "epoch": 0.5668330560177482, "grad_norm": 0.19189482927322388, "learning_rate": 2.0671713764696445e-05, "loss": 0.5699, "step": 2044 }, { "epoch": 0.5671103716028841, "grad_norm": 0.183754563331604, "learning_rate": 2.0667534622027974e-05, "loss": 0.5256, "step": 2045 }, { "epoch": 0.56738768718802, "grad_norm": 0.19887655973434448, "learning_rate": 2.0663353885657098e-05, "loss": 0.5792, "step": 2046 }, { "epoch": 0.5676650027731559, "grad_norm": 0.18769381940364838, "learning_rate": 2.0659171556399596e-05, "loss": 0.5679, "step": 2047 }, { "epoch": 0.5679423183582918, "grad_norm": 0.20135840773582458, "learning_rate": 2.0654987635071554e-05, "loss": 0.5766, "step": 2048 }, { "epoch": 0.5682196339434277, "grad_norm": 0.20204971730709076, "learning_rate": 2.065080212248936e-05, "loss": 0.602, "step": 2049 }, { "epoch": 0.5684969495285636, "grad_norm": 0.18772201240062714, "learning_rate": 2.0646615019469724e-05, "loss": 0.5584, "step": 2050 }, { "epoch": 0.5687742651136994, "grad_norm": 0.18956024944782257, "learning_rate": 2.064242632682965e-05, "loss": 0.573, "step": 2051 }, { "epoch": 0.5690515806988353, "grad_norm": 0.20082207024097443, "learning_rate": 2.0638236045386472e-05, "loss": 0.5478, "step": 2052 }, { "epoch": 0.5693288962839712, "grad_norm": 0.1947973519563675, "learning_rate": 2.063404417595783e-05, "loss": 0.5543, "step": 2053 }, { "epoch": 0.5696062118691071, "grad_norm": 0.19126874208450317, "learning_rate": 2.0629850719361654e-05, "loss": 0.5704, "step": 2054 }, { "epoch": 0.569883527454243, "grad_norm": 0.3870353698730469, "learning_rate": 2.062565567641621e-05, "loss": 0.5323, "step": 2055 }, { "epoch": 0.5701608430393789, "grad_norm": 0.20603255927562714, "learning_rate": 2.0621459047940056e-05, "loss": 0.574, "step": 2056 }, { "epoch": 0.5704381586245147, "grad_norm": 0.18535418808460236, "learning_rate": 2.0617260834752068e-05, "loss": 0.5358, "step": 2057 }, { "epoch": 0.5707154742096506, "grad_norm": 0.24727782607078552, "learning_rate": 2.061306103767143e-05, "loss": 0.5744, "step": 2058 }, { "epoch": 0.5709927897947865, "grad_norm": 0.19518110156059265, "learning_rate": 2.0608859657517633e-05, "loss": 0.5655, "step": 2059 }, { "epoch": 0.5712701053799224, "grad_norm": 0.19751004874706268, "learning_rate": 2.0604656695110476e-05, "loss": 0.5555, "step": 2060 }, { "epoch": 0.5715474209650583, "grad_norm": 0.18788260221481323, "learning_rate": 2.0600452151270068e-05, "loss": 0.5463, "step": 2061 }, { "epoch": 0.5718247365501942, "grad_norm": 0.23272345960140228, "learning_rate": 2.0596246026816826e-05, "loss": 0.5601, "step": 2062 }, { "epoch": 0.57210205213533, "grad_norm": 0.19826073944568634, "learning_rate": 2.059203832257148e-05, "loss": 0.5654, "step": 2063 }, { "epoch": 0.5723793677204659, "grad_norm": 0.19848833978176117, "learning_rate": 2.058782903935506e-05, "loss": 0.5666, "step": 2064 }, { "epoch": 0.5726566833056018, "grad_norm": 0.21063697338104248, "learning_rate": 2.0583618177988917e-05, "loss": 0.5421, "step": 2065 }, { "epoch": 0.5729339988907377, "grad_norm": 0.19619859755039215, "learning_rate": 2.0579405739294695e-05, "loss": 0.5343, "step": 2066 }, { "epoch": 0.5732113144758736, "grad_norm": 0.18998843431472778, "learning_rate": 2.057519172409435e-05, "loss": 0.5486, "step": 2067 }, { "epoch": 0.5734886300610095, "grad_norm": 0.19432705640792847, "learning_rate": 2.0570976133210152e-05, "loss": 0.528, "step": 2068 }, { "epoch": 0.5737659456461454, "grad_norm": 0.18903441727161407, "learning_rate": 2.0566758967464677e-05, "loss": 0.5529, "step": 2069 }, { "epoch": 0.5740432612312812, "grad_norm": 0.19154904782772064, "learning_rate": 2.05625402276808e-05, "loss": 0.5519, "step": 2070 }, { "epoch": 0.5743205768164171, "grad_norm": 0.19544386863708496, "learning_rate": 2.0558319914681713e-05, "loss": 0.5444, "step": 2071 }, { "epoch": 0.574597892401553, "grad_norm": 0.1918288618326187, "learning_rate": 2.055409802929091e-05, "loss": 0.5646, "step": 2072 }, { "epoch": 0.5748752079866889, "grad_norm": 0.19810417294502258, "learning_rate": 2.054987457233219e-05, "loss": 0.5799, "step": 2073 }, { "epoch": 0.5751525235718248, "grad_norm": 0.20202040672302246, "learning_rate": 2.0545649544629665e-05, "loss": 0.5555, "step": 2074 }, { "epoch": 0.5754298391569607, "grad_norm": 0.19962945580482483, "learning_rate": 2.0541422947007748e-05, "loss": 0.5245, "step": 2075 }, { "epoch": 0.5757071547420965, "grad_norm": 0.19925406575202942, "learning_rate": 2.053719478029116e-05, "loss": 0.5755, "step": 2076 }, { "epoch": 0.5759844703272324, "grad_norm": 0.1866733878850937, "learning_rate": 2.0532965045304932e-05, "loss": 0.5339, "step": 2077 }, { "epoch": 0.5762617859123683, "grad_norm": 0.22320881485939026, "learning_rate": 2.052873374287439e-05, "loss": 0.5111, "step": 2078 }, { "epoch": 0.5765391014975042, "grad_norm": 0.19458182156085968, "learning_rate": 2.0524500873825182e-05, "loss": 0.5625, "step": 2079 }, { "epoch": 0.5768164170826401, "grad_norm": 0.18524032831192017, "learning_rate": 2.0520266438983242e-05, "loss": 0.5876, "step": 2080 }, { "epoch": 0.577093732667776, "grad_norm": 0.17904391884803772, "learning_rate": 2.0516030439174833e-05, "loss": 0.5541, "step": 2081 }, { "epoch": 0.5773710482529119, "grad_norm": 0.1819324642419815, "learning_rate": 2.05117928752265e-05, "loss": 0.5326, "step": 2082 }, { "epoch": 0.5776483638380477, "grad_norm": 0.18749113380908966, "learning_rate": 2.0507553747965114e-05, "loss": 0.5502, "step": 2083 }, { "epoch": 0.5779256794231836, "grad_norm": 0.1892707794904709, "learning_rate": 2.050331305821783e-05, "loss": 0.5533, "step": 2084 }, { "epoch": 0.5782029950083195, "grad_norm": 0.1937221884727478, "learning_rate": 2.0499070806812126e-05, "loss": 0.5615, "step": 2085 }, { "epoch": 0.5784803105934554, "grad_norm": 0.20081382989883423, "learning_rate": 2.0494826994575777e-05, "loss": 0.5424, "step": 2086 }, { "epoch": 0.5787576261785913, "grad_norm": 0.18762163817882538, "learning_rate": 2.0490581622336863e-05, "loss": 0.5621, "step": 2087 }, { "epoch": 0.5790349417637272, "grad_norm": 0.18031221628189087, "learning_rate": 2.048633469092377e-05, "loss": 0.5045, "step": 2088 }, { "epoch": 0.579312257348863, "grad_norm": 0.21820896863937378, "learning_rate": 2.048208620116518e-05, "loss": 0.5676, "step": 2089 }, { "epoch": 0.5795895729339989, "grad_norm": 0.19855897128582, "learning_rate": 2.0477836153890095e-05, "loss": 0.5461, "step": 2090 }, { "epoch": 0.5798668885191348, "grad_norm": 0.1899833232164383, "learning_rate": 2.0473584549927806e-05, "loss": 0.5519, "step": 2091 }, { "epoch": 0.5801442041042707, "grad_norm": 0.1892709583044052, "learning_rate": 2.0469331390107914e-05, "loss": 0.5255, "step": 2092 }, { "epoch": 0.5804215196894066, "grad_norm": 0.19004952907562256, "learning_rate": 2.0465076675260326e-05, "loss": 0.5468, "step": 2093 }, { "epoch": 0.5806988352745425, "grad_norm": 0.18664851784706116, "learning_rate": 2.0460820406215247e-05, "loss": 0.5333, "step": 2094 }, { "epoch": 0.5809761508596784, "grad_norm": 0.18904832005500793, "learning_rate": 2.045656258380319e-05, "loss": 0.5653, "step": 2095 }, { "epoch": 0.5812534664448142, "grad_norm": 0.19123364984989166, "learning_rate": 2.0452303208854966e-05, "loss": 0.5368, "step": 2096 }, { "epoch": 0.5815307820299501, "grad_norm": 0.1906225085258484, "learning_rate": 2.0448042282201694e-05, "loss": 0.5672, "step": 2097 }, { "epoch": 0.581808097615086, "grad_norm": 0.19360537827014923, "learning_rate": 2.0443779804674796e-05, "loss": 0.548, "step": 2098 }, { "epoch": 0.5820854132002219, "grad_norm": 0.21694041788578033, "learning_rate": 2.0439515777105987e-05, "loss": 0.5724, "step": 2099 }, { "epoch": 0.5823627287853578, "grad_norm": 0.191674143075943, "learning_rate": 2.04352502003273e-05, "loss": 0.5295, "step": 2100 }, { "epoch": 0.5826400443704937, "grad_norm": 0.19594216346740723, "learning_rate": 2.0430983075171055e-05, "loss": 0.5412, "step": 2101 }, { "epoch": 0.5829173599556295, "grad_norm": 0.19115525484085083, "learning_rate": 2.0426714402469887e-05, "loss": 0.5368, "step": 2102 }, { "epoch": 0.5831946755407654, "grad_norm": 0.19171211123466492, "learning_rate": 2.042244418305673e-05, "loss": 0.57, "step": 2103 }, { "epoch": 0.5834719911259013, "grad_norm": 0.18373924493789673, "learning_rate": 2.0418172417764802e-05, "loss": 0.5648, "step": 2104 }, { "epoch": 0.5837493067110372, "grad_norm": 0.20192669332027435, "learning_rate": 2.0413899107427652e-05, "loss": 0.5699, "step": 2105 }, { "epoch": 0.5840266222961731, "grad_norm": 0.18735186755657196, "learning_rate": 2.0409624252879112e-05, "loss": 0.551, "step": 2106 }, { "epoch": 0.584303937881309, "grad_norm": 0.19031678140163422, "learning_rate": 2.0405347854953316e-05, "loss": 0.5313, "step": 2107 }, { "epoch": 0.5845812534664449, "grad_norm": 0.19912661612033844, "learning_rate": 2.0401069914484707e-05, "loss": 0.5815, "step": 2108 }, { "epoch": 0.5848585690515807, "grad_norm": 0.1985718458890915, "learning_rate": 2.0396790432308025e-05, "loss": 0.5364, "step": 2109 }, { "epoch": 0.5851358846367166, "grad_norm": 0.20383597910404205, "learning_rate": 2.0392509409258303e-05, "loss": 0.5747, "step": 2110 }, { "epoch": 0.5854132002218525, "grad_norm": 0.19357682764530182, "learning_rate": 2.038822684617089e-05, "loss": 0.5508, "step": 2111 }, { "epoch": 0.5856905158069884, "grad_norm": 0.18165504932403564, "learning_rate": 2.0383942743881425e-05, "loss": 0.5234, "step": 2112 }, { "epoch": 0.5859678313921243, "grad_norm": 0.18874984979629517, "learning_rate": 2.0379657103225852e-05, "loss": 0.5691, "step": 2113 }, { "epoch": 0.5862451469772602, "grad_norm": 0.18956932425498962, "learning_rate": 2.0375369925040406e-05, "loss": 0.5664, "step": 2114 }, { "epoch": 0.586522462562396, "grad_norm": 0.17788472771644592, "learning_rate": 2.0371081210161634e-05, "loss": 0.5473, "step": 2115 }, { "epoch": 0.5867997781475319, "grad_norm": 0.22669924795627594, "learning_rate": 2.0366790959426378e-05, "loss": 0.5603, "step": 2116 }, { "epoch": 0.5870770937326678, "grad_norm": 0.1913762092590332, "learning_rate": 2.0362499173671784e-05, "loss": 0.5698, "step": 2117 }, { "epoch": 0.5873544093178037, "grad_norm": 0.19173979759216309, "learning_rate": 2.0358205853735287e-05, "loss": 0.5735, "step": 2118 }, { "epoch": 0.5876317249029396, "grad_norm": 0.18526272475719452, "learning_rate": 2.035391100045462e-05, "loss": 0.5462, "step": 2119 }, { "epoch": 0.5879090404880755, "grad_norm": 0.20534314215183258, "learning_rate": 2.034961461466784e-05, "loss": 0.5643, "step": 2120 }, { "epoch": 0.5881863560732113, "grad_norm": 0.19565671682357788, "learning_rate": 2.0345316697213273e-05, "loss": 0.5599, "step": 2121 }, { "epoch": 0.5884636716583472, "grad_norm": 0.19610898196697235, "learning_rate": 2.034101724892956e-05, "loss": 0.5597, "step": 2122 }, { "epoch": 0.5887409872434831, "grad_norm": 0.19708271324634552, "learning_rate": 2.033671627065564e-05, "loss": 0.5494, "step": 2123 }, { "epoch": 0.589018302828619, "grad_norm": 0.285875141620636, "learning_rate": 2.033241376323075e-05, "loss": 0.5553, "step": 2124 }, { "epoch": 0.5892956184137549, "grad_norm": 0.1879926174879074, "learning_rate": 2.0328109727494417e-05, "loss": 0.5464, "step": 2125 }, { "epoch": 0.5895729339988908, "grad_norm": 0.20556902885437012, "learning_rate": 2.032380416428647e-05, "loss": 0.5533, "step": 2126 }, { "epoch": 0.5898502495840267, "grad_norm": 0.1847870945930481, "learning_rate": 2.0319497074447043e-05, "loss": 0.5629, "step": 2127 }, { "epoch": 0.5901275651691625, "grad_norm": 0.19035299122333527, "learning_rate": 2.0315188458816567e-05, "loss": 0.5491, "step": 2128 }, { "epoch": 0.5904048807542984, "grad_norm": 0.17980261147022247, "learning_rate": 2.031087831823576e-05, "loss": 0.5281, "step": 2129 }, { "epoch": 0.5906821963394343, "grad_norm": 0.19888748228549957, "learning_rate": 2.030656665354565e-05, "loss": 0.5443, "step": 2130 }, { "epoch": 0.5909595119245702, "grad_norm": 0.17473064363002777, "learning_rate": 2.0302253465587555e-05, "loss": 0.5263, "step": 2131 }, { "epoch": 0.5912368275097061, "grad_norm": 0.1915203481912613, "learning_rate": 2.0297938755203088e-05, "loss": 0.5609, "step": 2132 }, { "epoch": 0.591514143094842, "grad_norm": 0.1980845183134079, "learning_rate": 2.029362252323417e-05, "loss": 0.5549, "step": 2133 }, { "epoch": 0.5917914586799778, "grad_norm": 0.18366824090480804, "learning_rate": 2.028930477052301e-05, "loss": 0.5333, "step": 2134 }, { "epoch": 0.5920687742651137, "grad_norm": 0.1946541965007782, "learning_rate": 2.0284985497912118e-05, "loss": 0.588, "step": 2135 }, { "epoch": 0.5923460898502496, "grad_norm": 0.1904669553041458, "learning_rate": 2.028066470624429e-05, "loss": 0.5602, "step": 2136 }, { "epoch": 0.5926234054353855, "grad_norm": 0.1851152926683426, "learning_rate": 2.0276342396362636e-05, "loss": 0.5461, "step": 2137 }, { "epoch": 0.5929007210205214, "grad_norm": 0.18607455492019653, "learning_rate": 2.0272018569110552e-05, "loss": 0.542, "step": 2138 }, { "epoch": 0.5931780366056573, "grad_norm": 0.19023166596889496, "learning_rate": 2.0267693225331726e-05, "loss": 0.5564, "step": 2139 }, { "epoch": 0.5934553521907932, "grad_norm": 0.18977974355220795, "learning_rate": 2.0263366365870152e-05, "loss": 0.5778, "step": 2140 }, { "epoch": 0.593732667775929, "grad_norm": 0.2097538560628891, "learning_rate": 2.0259037991570116e-05, "loss": 0.5822, "step": 2141 }, { "epoch": 0.5940099833610649, "grad_norm": 0.18402041494846344, "learning_rate": 2.0254708103276193e-05, "loss": 0.5207, "step": 2142 }, { "epoch": 0.5942872989462008, "grad_norm": 0.196335569024086, "learning_rate": 2.025037670183326e-05, "loss": 0.5509, "step": 2143 }, { "epoch": 0.5945646145313367, "grad_norm": 0.18417152762413025, "learning_rate": 2.0246043788086498e-05, "loss": 0.5418, "step": 2144 }, { "epoch": 0.5948419301164726, "grad_norm": 0.1963946372270584, "learning_rate": 2.024170936288136e-05, "loss": 0.5717, "step": 2145 }, { "epoch": 0.5951192457016085, "grad_norm": 0.20180796086788177, "learning_rate": 2.023737342706361e-05, "loss": 0.5457, "step": 2146 }, { "epoch": 0.5953965612867443, "grad_norm": 0.18826082348823547, "learning_rate": 2.0233035981479316e-05, "loss": 0.545, "step": 2147 }, { "epoch": 0.5956738768718802, "grad_norm": 0.1920921951532364, "learning_rate": 2.0228697026974808e-05, "loss": 0.5584, "step": 2148 }, { "epoch": 0.5959511924570161, "grad_norm": 0.20350618660449982, "learning_rate": 2.0224356564396747e-05, "loss": 0.5343, "step": 2149 }, { "epoch": 0.596228508042152, "grad_norm": 0.19070284068584442, "learning_rate": 2.0220014594592068e-05, "loss": 0.5573, "step": 2150 }, { "epoch": 0.5965058236272879, "grad_norm": 0.1937059462070465, "learning_rate": 2.0215671118408004e-05, "loss": 0.5548, "step": 2151 }, { "epoch": 0.5967831392124238, "grad_norm": 0.18935304880142212, "learning_rate": 2.021132613669208e-05, "loss": 0.5746, "step": 2152 }, { "epoch": 0.5970604547975596, "grad_norm": 0.20879191160202026, "learning_rate": 2.0206979650292117e-05, "loss": 0.5602, "step": 2153 }, { "epoch": 0.5973377703826955, "grad_norm": 0.19923923909664154, "learning_rate": 2.020263166005624e-05, "loss": 0.5441, "step": 2154 }, { "epoch": 0.5976150859678314, "grad_norm": 0.1885758638381958, "learning_rate": 2.019828216683284e-05, "loss": 0.5362, "step": 2155 }, { "epoch": 0.5978924015529673, "grad_norm": 0.18912041187286377, "learning_rate": 2.019393117147063e-05, "loss": 0.5488, "step": 2156 }, { "epoch": 0.5981697171381032, "grad_norm": 0.18454459309577942, "learning_rate": 2.0189578674818603e-05, "loss": 0.5386, "step": 2157 }, { "epoch": 0.5984470327232391, "grad_norm": 0.1803792268037796, "learning_rate": 2.018522467772604e-05, "loss": 0.5444, "step": 2158 }, { "epoch": 0.598724348308375, "grad_norm": 0.18980631232261658, "learning_rate": 2.0180869181042532e-05, "loss": 0.5521, "step": 2159 }, { "epoch": 0.5990016638935108, "grad_norm": 0.18810968101024628, "learning_rate": 2.0176512185617945e-05, "loss": 0.5342, "step": 2160 }, { "epoch": 0.5992789794786467, "grad_norm": 0.24775730073451996, "learning_rate": 2.0172153692302445e-05, "loss": 0.533, "step": 2161 }, { "epoch": 0.5995562950637826, "grad_norm": 0.19899478554725647, "learning_rate": 2.0167793701946488e-05, "loss": 0.5394, "step": 2162 }, { "epoch": 0.5998336106489185, "grad_norm": 0.19542957842350006, "learning_rate": 2.0163432215400822e-05, "loss": 0.5379, "step": 2163 }, { "epoch": 0.6001109262340544, "grad_norm": 0.2002883106470108, "learning_rate": 2.0159069233516504e-05, "loss": 0.5672, "step": 2164 }, { "epoch": 0.6003882418191903, "grad_norm": 0.18904021382331848, "learning_rate": 2.0154704757144845e-05, "loss": 0.5483, "step": 2165 }, { "epoch": 0.6006655574043261, "grad_norm": 0.19162911176681519, "learning_rate": 2.0150338787137486e-05, "loss": 0.5552, "step": 2166 }, { "epoch": 0.600942872989462, "grad_norm": 0.18318617343902588, "learning_rate": 2.014597132434633e-05, "loss": 0.5548, "step": 2167 }, { "epoch": 0.6012201885745979, "grad_norm": 0.1864987015724182, "learning_rate": 2.01416023696236e-05, "loss": 0.5401, "step": 2168 }, { "epoch": 0.6014975041597338, "grad_norm": 0.18252375721931458, "learning_rate": 2.0137231923821785e-05, "loss": 0.5547, "step": 2169 }, { "epoch": 0.6017748197448697, "grad_norm": 0.196000874042511, "learning_rate": 2.013285998779367e-05, "loss": 0.5518, "step": 2170 }, { "epoch": 0.6020521353300056, "grad_norm": 0.17955927550792694, "learning_rate": 2.0128486562392354e-05, "loss": 0.5312, "step": 2171 }, { "epoch": 0.6023294509151415, "grad_norm": 0.1867658495903015, "learning_rate": 2.0124111648471192e-05, "loss": 0.5374, "step": 2172 }, { "epoch": 0.6026067665002773, "grad_norm": 0.20001055300235748, "learning_rate": 2.0119735246883852e-05, "loss": 0.5501, "step": 2173 }, { "epoch": 0.6028840820854132, "grad_norm": 0.19283756613731384, "learning_rate": 2.011535735848428e-05, "loss": 0.5505, "step": 2174 }, { "epoch": 0.6031613976705491, "grad_norm": 0.18642939627170563, "learning_rate": 2.011097798412673e-05, "loss": 0.5246, "step": 2175 }, { "epoch": 0.603438713255685, "grad_norm": 0.1911175698041916, "learning_rate": 2.0106597124665716e-05, "loss": 0.5134, "step": 2176 }, { "epoch": 0.6037160288408209, "grad_norm": 0.18297746777534485, "learning_rate": 2.0102214780956073e-05, "loss": 0.5598, "step": 2177 }, { "epoch": 0.6039933444259568, "grad_norm": 0.19483189284801483, "learning_rate": 2.0097830953852914e-05, "loss": 0.5502, "step": 2178 }, { "epoch": 0.6042706600110926, "grad_norm": 0.18785777688026428, "learning_rate": 2.009344564421163e-05, "loss": 0.5384, "step": 2179 }, { "epoch": 0.6045479755962285, "grad_norm": 0.19045081734657288, "learning_rate": 2.0089058852887923e-05, "loss": 0.5311, "step": 2180 }, { "epoch": 0.6048252911813644, "grad_norm": 0.19081301987171173, "learning_rate": 2.0084670580737758e-05, "loss": 0.5593, "step": 2181 }, { "epoch": 0.6051026067665003, "grad_norm": 0.19843098521232605, "learning_rate": 2.0080280828617414e-05, "loss": 0.557, "step": 2182 }, { "epoch": 0.6053799223516362, "grad_norm": 0.1962948590517044, "learning_rate": 2.0075889597383446e-05, "loss": 0.5591, "step": 2183 }, { "epoch": 0.6056572379367721, "grad_norm": 0.193936288356781, "learning_rate": 2.0071496887892693e-05, "loss": 0.5721, "step": 2184 }, { "epoch": 0.605934553521908, "grad_norm": 0.19298399984836578, "learning_rate": 2.00671027010023e-05, "loss": 0.5734, "step": 2185 }, { "epoch": 0.6062118691070438, "grad_norm": 0.20237858593463898, "learning_rate": 2.006270703756968e-05, "loss": 0.548, "step": 2186 }, { "epoch": 0.6064891846921797, "grad_norm": 0.19070473313331604, "learning_rate": 2.0058309898452552e-05, "loss": 0.5187, "step": 2187 }, { "epoch": 0.6067665002773156, "grad_norm": 0.18649962544441223, "learning_rate": 2.0053911284508902e-05, "loss": 0.5465, "step": 2188 }, { "epoch": 0.6070438158624515, "grad_norm": 0.21800090372562408, "learning_rate": 2.0049511196597027e-05, "loss": 0.5665, "step": 2189 }, { "epoch": 0.6073211314475874, "grad_norm": 0.18259546160697937, "learning_rate": 2.00451096355755e-05, "loss": 0.5456, "step": 2190 }, { "epoch": 0.6075984470327233, "grad_norm": 0.19721095263957977, "learning_rate": 2.0040706602303173e-05, "loss": 0.5752, "step": 2191 }, { "epoch": 0.6078757626178591, "grad_norm": 0.19083106517791748, "learning_rate": 2.0036302097639204e-05, "loss": 0.552, "step": 2192 }, { "epoch": 0.608153078202995, "grad_norm": 0.18962670862674713, "learning_rate": 2.0031896122443023e-05, "loss": 0.5729, "step": 2193 }, { "epoch": 0.6084303937881309, "grad_norm": 0.19744956493377686, "learning_rate": 2.0027488677574358e-05, "loss": 0.5494, "step": 2194 }, { "epoch": 0.6087077093732668, "grad_norm": 0.19325025379657745, "learning_rate": 2.0023079763893208e-05, "loss": 0.5389, "step": 2195 }, { "epoch": 0.6089850249584027, "grad_norm": 0.1866646558046341, "learning_rate": 2.0018669382259885e-05, "loss": 0.5703, "step": 2196 }, { "epoch": 0.6092623405435386, "grad_norm": 0.21793098747730255, "learning_rate": 2.001425753353496e-05, "loss": 0.5585, "step": 2197 }, { "epoch": 0.6095396561286744, "grad_norm": 0.18613992631435394, "learning_rate": 2.0009844218579298e-05, "loss": 0.544, "step": 2198 }, { "epoch": 0.6098169717138103, "grad_norm": 0.1990228146314621, "learning_rate": 2.0005429438254063e-05, "loss": 0.5681, "step": 2199 }, { "epoch": 0.6100942872989462, "grad_norm": 0.19253648817539215, "learning_rate": 2.000101319342069e-05, "loss": 0.5451, "step": 2200 }, { "epoch": 0.6103716028840821, "grad_norm": 0.18924476206302643, "learning_rate": 1.9996595484940915e-05, "loss": 0.5684, "step": 2201 }, { "epoch": 0.610648918469218, "grad_norm": 0.18552450835704803, "learning_rate": 1.9992176313676737e-05, "loss": 0.5307, "step": 2202 }, { "epoch": 0.6109262340543539, "grad_norm": 0.19528694450855255, "learning_rate": 1.9987755680490456e-05, "loss": 0.5598, "step": 2203 }, { "epoch": 0.6112035496394898, "grad_norm": 0.1909715086221695, "learning_rate": 1.998333358624466e-05, "loss": 0.5647, "step": 2204 }, { "epoch": 0.6114808652246256, "grad_norm": 0.19347867369651794, "learning_rate": 1.9978910031802218e-05, "loss": 0.5726, "step": 2205 }, { "epoch": 0.6117581808097615, "grad_norm": 0.22693443298339844, "learning_rate": 1.9974485018026273e-05, "loss": 0.5403, "step": 2206 }, { "epoch": 0.6120354963948974, "grad_norm": 0.1854747086763382, "learning_rate": 1.997005854578027e-05, "loss": 0.5733, "step": 2207 }, { "epoch": 0.6123128119800333, "grad_norm": 0.2657473683357239, "learning_rate": 1.9965630615927932e-05, "loss": 0.5516, "step": 2208 }, { "epoch": 0.6125901275651692, "grad_norm": 0.19607201218605042, "learning_rate": 1.996120122933326e-05, "loss": 0.5525, "step": 2209 }, { "epoch": 0.6128674431503051, "grad_norm": 0.17779147624969482, "learning_rate": 1.9956770386860547e-05, "loss": 0.5377, "step": 2210 }, { "epoch": 0.6131447587354409, "grad_norm": 0.2034800499677658, "learning_rate": 1.9952338089374366e-05, "loss": 0.5434, "step": 2211 }, { "epoch": 0.6134220743205768, "grad_norm": 0.18624994158744812, "learning_rate": 1.9947904337739582e-05, "loss": 0.5301, "step": 2212 }, { "epoch": 0.6136993899057127, "grad_norm": 0.18844860792160034, "learning_rate": 1.9943469132821334e-05, "loss": 0.5508, "step": 2213 }, { "epoch": 0.6139767054908486, "grad_norm": 0.20432956516742706, "learning_rate": 1.9939032475485043e-05, "loss": 0.5565, "step": 2214 }, { "epoch": 0.6142540210759845, "grad_norm": 0.18925762176513672, "learning_rate": 1.9934594366596423e-05, "loss": 0.5745, "step": 2215 }, { "epoch": 0.6145313366611204, "grad_norm": 0.20550455152988434, "learning_rate": 1.993015480702147e-05, "loss": 0.5689, "step": 2216 }, { "epoch": 0.6148086522462562, "grad_norm": 0.18953469395637512, "learning_rate": 1.992571379762645e-05, "loss": 0.5393, "step": 2217 }, { "epoch": 0.6150859678313921, "grad_norm": 0.19926683604717255, "learning_rate": 1.9921271339277935e-05, "loss": 0.5563, "step": 2218 }, { "epoch": 0.615363283416528, "grad_norm": 0.18314692378044128, "learning_rate": 1.9916827432842756e-05, "loss": 0.5353, "step": 2219 }, { "epoch": 0.6156405990016639, "grad_norm": 0.20692428946495056, "learning_rate": 1.991238207918804e-05, "loss": 0.57, "step": 2220 }, { "epoch": 0.6159179145867998, "grad_norm": 0.202706441283226, "learning_rate": 1.990793527918119e-05, "loss": 0.5496, "step": 2221 }, { "epoch": 0.6161952301719357, "grad_norm": 0.18222178518772125, "learning_rate": 1.99034870336899e-05, "loss": 0.5553, "step": 2222 }, { "epoch": 0.6164725457570716, "grad_norm": 0.20487068593502045, "learning_rate": 1.9899037343582135e-05, "loss": 0.5611, "step": 2223 }, { "epoch": 0.6167498613422074, "grad_norm": 0.18165314197540283, "learning_rate": 1.989458620972615e-05, "loss": 0.5561, "step": 2224 }, { "epoch": 0.6170271769273433, "grad_norm": 0.1925044059753418, "learning_rate": 1.9890133632990488e-05, "loss": 0.5866, "step": 2225 }, { "epoch": 0.6173044925124792, "grad_norm": 0.19239196181297302, "learning_rate": 1.988567961424395e-05, "loss": 0.5629, "step": 2226 }, { "epoch": 0.6175818080976151, "grad_norm": 0.19511006772518158, "learning_rate": 1.9881224154355638e-05, "loss": 0.5484, "step": 2227 }, { "epoch": 0.617859123682751, "grad_norm": 0.18884329497814178, "learning_rate": 1.9876767254194932e-05, "loss": 0.5541, "step": 2228 }, { "epoch": 0.6181364392678869, "grad_norm": 0.1917329877614975, "learning_rate": 1.9872308914631494e-05, "loss": 0.5535, "step": 2229 }, { "epoch": 0.6184137548530227, "grad_norm": 0.19358281791210175, "learning_rate": 1.986784913653526e-05, "loss": 0.5445, "step": 2230 }, { "epoch": 0.6186910704381586, "grad_norm": 0.1858266144990921, "learning_rate": 1.9863387920776454e-05, "loss": 0.5446, "step": 2231 }, { "epoch": 0.6189683860232945, "grad_norm": 0.1879933476448059, "learning_rate": 1.985892526822557e-05, "loss": 0.5511, "step": 2232 }, { "epoch": 0.6192457016084304, "grad_norm": 0.19498036801815033, "learning_rate": 1.9854461179753396e-05, "loss": 0.5535, "step": 2233 }, { "epoch": 0.6195230171935663, "grad_norm": 0.1921449601650238, "learning_rate": 1.9849995656231e-05, "loss": 0.563, "step": 2234 }, { "epoch": 0.6198003327787022, "grad_norm": 0.1764960139989853, "learning_rate": 1.984552869852971e-05, "loss": 0.5604, "step": 2235 }, { "epoch": 0.620077648363838, "grad_norm": 0.18862247467041016, "learning_rate": 1.984106030752116e-05, "loss": 0.5507, "step": 2236 }, { "epoch": 0.6203549639489739, "grad_norm": 0.19972002506256104, "learning_rate": 1.9836590484077244e-05, "loss": 0.5443, "step": 2237 }, { "epoch": 0.6206322795341098, "grad_norm": 0.18754172325134277, "learning_rate": 1.983211922907014e-05, "loss": 0.5361, "step": 2238 }, { "epoch": 0.6209095951192457, "grad_norm": 0.20183882117271423, "learning_rate": 1.9827646543372322e-05, "loss": 0.5537, "step": 2239 }, { "epoch": 0.6211869107043816, "grad_norm": 0.18864542245864868, "learning_rate": 1.9823172427856518e-05, "loss": 0.5515, "step": 2240 }, { "epoch": 0.6214642262895175, "grad_norm": 0.20876039564609528, "learning_rate": 1.981869688339575e-05, "loss": 0.5353, "step": 2241 }, { "epoch": 0.6217415418746534, "grad_norm": 0.20126941800117493, "learning_rate": 1.9814219910863313e-05, "loss": 0.5549, "step": 2242 }, { "epoch": 0.6220188574597892, "grad_norm": 0.1895267367362976, "learning_rate": 1.9809741511132786e-05, "loss": 0.5427, "step": 2243 }, { "epoch": 0.6222961730449251, "grad_norm": 0.1944306641817093, "learning_rate": 1.980526168507802e-05, "loss": 0.5627, "step": 2244 }, { "epoch": 0.622573488630061, "grad_norm": 0.19707219302654266, "learning_rate": 1.980078043357315e-05, "loss": 0.573, "step": 2245 }, { "epoch": 0.6228508042151969, "grad_norm": 0.1903533786535263, "learning_rate": 1.9796297757492587e-05, "loss": 0.5534, "step": 2246 }, { "epoch": 0.6231281198003328, "grad_norm": 0.1989421546459198, "learning_rate": 1.9791813657711022e-05, "loss": 0.5343, "step": 2247 }, { "epoch": 0.6234054353854687, "grad_norm": 0.1919817328453064, "learning_rate": 1.9787328135103418e-05, "loss": 0.5493, "step": 2248 }, { "epoch": 0.6236827509706045, "grad_norm": 0.18703347444534302, "learning_rate": 1.9782841190545024e-05, "loss": 0.572, "step": 2249 }, { "epoch": 0.6239600665557404, "grad_norm": 0.18850034475326538, "learning_rate": 1.9778352824911356e-05, "loss": 0.5358, "step": 2250 }, { "epoch": 0.6242373821408763, "grad_norm": 0.22748717665672302, "learning_rate": 1.9773863039078217e-05, "loss": 0.5569, "step": 2251 }, { "epoch": 0.6245146977260122, "grad_norm": 0.1906166821718216, "learning_rate": 1.976937183392168e-05, "loss": 0.5226, "step": 2252 }, { "epoch": 0.6247920133111481, "grad_norm": 0.1964375227689743, "learning_rate": 1.9764879210318098e-05, "loss": 0.5467, "step": 2253 }, { "epoch": 0.625069328896284, "grad_norm": 0.27289459109306335, "learning_rate": 1.9760385169144108e-05, "loss": 0.5597, "step": 2254 }, { "epoch": 0.6253466444814199, "grad_norm": 0.1879124641418457, "learning_rate": 1.9755889711276603e-05, "loss": 0.5509, "step": 2255 }, { "epoch": 0.6256239600665557, "grad_norm": 0.1970473676919937, "learning_rate": 1.9751392837592782e-05, "loss": 0.53, "step": 2256 }, { "epoch": 0.6259012756516916, "grad_norm": 0.20250020921230316, "learning_rate": 1.9746894548970092e-05, "loss": 0.5458, "step": 2257 }, { "epoch": 0.6261785912368275, "grad_norm": 0.20085811614990234, "learning_rate": 1.9742394846286277e-05, "loss": 0.5483, "step": 2258 }, { "epoch": 0.6264559068219634, "grad_norm": 0.21487122774124146, "learning_rate": 1.9737893730419337e-05, "loss": 0.5522, "step": 2259 }, { "epoch": 0.6267332224070993, "grad_norm": 0.1989215910434723, "learning_rate": 1.9733391202247577e-05, "loss": 0.5269, "step": 2260 }, { "epoch": 0.6270105379922352, "grad_norm": 0.1842491775751114, "learning_rate": 1.9728887262649536e-05, "loss": 0.543, "step": 2261 }, { "epoch": 0.627287853577371, "grad_norm": 0.1977192461490631, "learning_rate": 1.972438191250407e-05, "loss": 0.5457, "step": 2262 }, { "epoch": 0.6275651691625069, "grad_norm": 0.18922410905361176, "learning_rate": 1.9719875152690288e-05, "loss": 0.5508, "step": 2263 }, { "epoch": 0.6278424847476428, "grad_norm": 0.19666342437267303, "learning_rate": 1.9715366984087575e-05, "loss": 0.5359, "step": 2264 }, { "epoch": 0.6281198003327787, "grad_norm": 0.19078543782234192, "learning_rate": 1.9710857407575595e-05, "loss": 0.5349, "step": 2265 }, { "epoch": 0.6283971159179146, "grad_norm": 0.18407784402370453, "learning_rate": 1.970634642403429e-05, "loss": 0.5554, "step": 2266 }, { "epoch": 0.6286744315030505, "grad_norm": 0.1921215057373047, "learning_rate": 1.9701834034343864e-05, "loss": 0.583, "step": 2267 }, { "epoch": 0.6289517470881864, "grad_norm": 0.2015913873910904, "learning_rate": 1.969732023938481e-05, "loss": 0.5656, "step": 2268 }, { "epoch": 0.6292290626733222, "grad_norm": 0.19959089159965515, "learning_rate": 1.969280504003789e-05, "loss": 0.5404, "step": 2269 }, { "epoch": 0.6295063782584581, "grad_norm": 0.18149082362651825, "learning_rate": 1.968828843718414e-05, "loss": 0.5716, "step": 2270 }, { "epoch": 0.629783693843594, "grad_norm": 0.20897513628005981, "learning_rate": 1.9683770431704857e-05, "loss": 0.5566, "step": 2271 }, { "epoch": 0.6300610094287299, "grad_norm": 0.18247532844543457, "learning_rate": 1.9679251024481636e-05, "loss": 0.5463, "step": 2272 }, { "epoch": 0.6303383250138658, "grad_norm": 0.18694794178009033, "learning_rate": 1.9674730216396334e-05, "loss": 0.5611, "step": 2273 }, { "epoch": 0.6306156405990017, "grad_norm": 0.1819789707660675, "learning_rate": 1.9670208008331073e-05, "loss": 0.5336, "step": 2274 }, { "epoch": 0.6308929561841375, "grad_norm": 0.18136881291866302, "learning_rate": 1.9665684401168258e-05, "loss": 0.5496, "step": 2275 }, { "epoch": 0.6311702717692734, "grad_norm": 0.1916753500699997, "learning_rate": 1.9661159395790563e-05, "loss": 0.56, "step": 2276 }, { "epoch": 0.6314475873544093, "grad_norm": 0.18821988999843597, "learning_rate": 1.965663299308094e-05, "loss": 0.5507, "step": 2277 }, { "epoch": 0.6317249029395452, "grad_norm": 0.2045837789773941, "learning_rate": 1.965210519392261e-05, "loss": 0.5495, "step": 2278 }, { "epoch": 0.6320022185246811, "grad_norm": 0.17228901386260986, "learning_rate": 1.964757599919907e-05, "loss": 0.5505, "step": 2279 }, { "epoch": 0.632279534109817, "grad_norm": 0.19306735694408417, "learning_rate": 1.9643045409794074e-05, "loss": 0.5257, "step": 2280 }, { "epoch": 0.6325568496949528, "grad_norm": 0.19630911946296692, "learning_rate": 1.9638513426591668e-05, "loss": 0.5324, "step": 2281 }, { "epoch": 0.6328341652800887, "grad_norm": 0.19862103462219238, "learning_rate": 1.9633980050476164e-05, "loss": 0.5556, "step": 2282 }, { "epoch": 0.6331114808652246, "grad_norm": 0.20335890352725983, "learning_rate": 1.9629445282332136e-05, "loss": 0.5583, "step": 2283 }, { "epoch": 0.6333887964503605, "grad_norm": 0.19751910865306854, "learning_rate": 1.9624909123044448e-05, "loss": 0.5844, "step": 2284 }, { "epoch": 0.6336661120354964, "grad_norm": 0.18284855782985687, "learning_rate": 1.9620371573498212e-05, "loss": 0.5393, "step": 2285 }, { "epoch": 0.6339434276206323, "grad_norm": 0.18531352281570435, "learning_rate": 1.961583263457884e-05, "loss": 0.5591, "step": 2286 }, { "epoch": 0.6342207432057682, "grad_norm": 0.18705891072750092, "learning_rate": 1.9611292307171987e-05, "loss": 0.5309, "step": 2287 }, { "epoch": 0.634498058790904, "grad_norm": 0.18091407418251038, "learning_rate": 1.9606750592163593e-05, "loss": 0.5041, "step": 2288 }, { "epoch": 0.6347753743760399, "grad_norm": 0.19142916798591614, "learning_rate": 1.960220749043987e-05, "loss": 0.5601, "step": 2289 }, { "epoch": 0.6350526899611758, "grad_norm": 0.18897870182991028, "learning_rate": 1.9597663002887294e-05, "loss": 0.5541, "step": 2290 }, { "epoch": 0.6353300055463117, "grad_norm": 0.19178354740142822, "learning_rate": 1.959311713039262e-05, "loss": 0.5217, "step": 2291 }, { "epoch": 0.6356073211314476, "grad_norm": 0.18749533593654633, "learning_rate": 1.9588569873842864e-05, "loss": 0.5291, "step": 2292 }, { "epoch": 0.6358846367165835, "grad_norm": 0.20121093094348907, "learning_rate": 1.9584021234125323e-05, "loss": 0.5414, "step": 2293 }, { "epoch": 0.6361619523017193, "grad_norm": 0.18628259003162384, "learning_rate": 1.957947121212754e-05, "loss": 0.511, "step": 2294 }, { "epoch": 0.6364392678868552, "grad_norm": 0.18616369366645813, "learning_rate": 1.9574919808737364e-05, "loss": 0.5496, "step": 2295 }, { "epoch": 0.6367165834719911, "grad_norm": 0.20249204337596893, "learning_rate": 1.9570367024842888e-05, "loss": 0.5585, "step": 2296 }, { "epoch": 0.636993899057127, "grad_norm": 0.23987984657287598, "learning_rate": 1.9565812861332477e-05, "loss": 0.5659, "step": 2297 }, { "epoch": 0.6372712146422629, "grad_norm": 0.1872101128101349, "learning_rate": 1.956125731909477e-05, "loss": 0.5419, "step": 2298 }, { "epoch": 0.6375485302273988, "grad_norm": 0.18463543057441711, "learning_rate": 1.955670039901868e-05, "loss": 0.5302, "step": 2299 }, { "epoch": 0.6378258458125347, "grad_norm": 0.20480939745903015, "learning_rate": 1.955214210199338e-05, "loss": 0.5343, "step": 2300 }, { "epoch": 0.6381031613976705, "grad_norm": 0.18215136229991913, "learning_rate": 1.9547582428908306e-05, "loss": 0.546, "step": 2301 }, { "epoch": 0.6383804769828064, "grad_norm": 0.17943714559078217, "learning_rate": 1.954302138065318e-05, "loss": 0.5334, "step": 2302 }, { "epoch": 0.6386577925679423, "grad_norm": 0.19718489050865173, "learning_rate": 1.9538458958117982e-05, "loss": 0.5621, "step": 2303 }, { "epoch": 0.6389351081530782, "grad_norm": 0.19121624529361725, "learning_rate": 1.953389516219296e-05, "loss": 0.5523, "step": 2304 }, { "epoch": 0.6392124237382141, "grad_norm": 0.1928092986345291, "learning_rate": 1.9529329993768634e-05, "loss": 0.5455, "step": 2305 }, { "epoch": 0.63948973932335, "grad_norm": 0.1785450428724289, "learning_rate": 1.952476345373579e-05, "loss": 0.5643, "step": 2306 }, { "epoch": 0.6397670549084858, "grad_norm": 0.17965517938137054, "learning_rate": 1.9520195542985476e-05, "loss": 0.5266, "step": 2307 }, { "epoch": 0.6400443704936217, "grad_norm": 0.21014821529388428, "learning_rate": 1.9515626262409016e-05, "loss": 0.5327, "step": 2308 }, { "epoch": 0.6403216860787576, "grad_norm": 0.18984338641166687, "learning_rate": 1.951105561289799e-05, "loss": 0.5678, "step": 2309 }, { "epoch": 0.6405990016638935, "grad_norm": 0.18631823360919952, "learning_rate": 1.9506483595344267e-05, "loss": 0.5469, "step": 2310 }, { "epoch": 0.6408763172490294, "grad_norm": 0.18727704882621765, "learning_rate": 1.9501910210639958e-05, "loss": 0.5657, "step": 2311 }, { "epoch": 0.6411536328341653, "grad_norm": 0.17807155847549438, "learning_rate": 1.9497335459677458e-05, "loss": 0.5044, "step": 2312 }, { "epoch": 0.6414309484193012, "grad_norm": 0.18843533098697662, "learning_rate": 1.9492759343349415e-05, "loss": 0.553, "step": 2313 }, { "epoch": 0.641708264004437, "grad_norm": 0.1941610723733902, "learning_rate": 1.9488181862548753e-05, "loss": 0.587, "step": 2314 }, { "epoch": 0.6419855795895729, "grad_norm": 0.1894078403711319, "learning_rate": 1.9483603018168666e-05, "loss": 0.5285, "step": 2315 }, { "epoch": 0.6422628951747088, "grad_norm": 0.19420726597309113, "learning_rate": 1.9479022811102604e-05, "loss": 0.5302, "step": 2316 }, { "epoch": 0.6425402107598447, "grad_norm": 0.205157071352005, "learning_rate": 1.9474441242244284e-05, "loss": 0.5539, "step": 2317 }, { "epoch": 0.6428175263449806, "grad_norm": 0.1943119317293167, "learning_rate": 1.9469858312487693e-05, "loss": 0.5465, "step": 2318 }, { "epoch": 0.6430948419301165, "grad_norm": 0.19968454539775848, "learning_rate": 1.946527402272708e-05, "loss": 0.5438, "step": 2319 }, { "epoch": 0.6433721575152523, "grad_norm": 0.18584848940372467, "learning_rate": 1.9460688373856967e-05, "loss": 0.546, "step": 2320 }, { "epoch": 0.6436494731003882, "grad_norm": 0.17821067571640015, "learning_rate": 1.945610136677213e-05, "loss": 0.5148, "step": 2321 }, { "epoch": 0.6439267886855241, "grad_norm": 0.19228345155715942, "learning_rate": 1.945151300236762e-05, "loss": 0.5368, "step": 2322 }, { "epoch": 0.64420410427066, "grad_norm": 0.18330131471157074, "learning_rate": 1.9446923281538747e-05, "loss": 0.5611, "step": 2323 }, { "epoch": 0.6444814198557959, "grad_norm": 0.18893574178218842, "learning_rate": 1.9442332205181086e-05, "loss": 0.54, "step": 2324 }, { "epoch": 0.6447587354409318, "grad_norm": 0.19229231774806976, "learning_rate": 1.943773977419047e-05, "loss": 0.5236, "step": 2325 }, { "epoch": 0.6450360510260676, "grad_norm": 0.20103448629379272, "learning_rate": 1.9433145989463027e-05, "loss": 0.554, "step": 2326 }, { "epoch": 0.6453133666112035, "grad_norm": 0.1895090788602829, "learning_rate": 1.9428550851895098e-05, "loss": 0.5676, "step": 2327 }, { "epoch": 0.6455906821963394, "grad_norm": 0.18887649476528168, "learning_rate": 1.9423954362383334e-05, "loss": 0.5601, "step": 2328 }, { "epoch": 0.6458679977814753, "grad_norm": 0.2108272910118103, "learning_rate": 1.941935652182463e-05, "loss": 0.5748, "step": 2329 }, { "epoch": 0.6461453133666112, "grad_norm": 0.20968154072761536, "learning_rate": 1.941475733111614e-05, "loss": 0.5306, "step": 2330 }, { "epoch": 0.6464226289517471, "grad_norm": 0.18780824542045593, "learning_rate": 1.9410156791155297e-05, "loss": 0.5326, "step": 2331 }, { "epoch": 0.646699944536883, "grad_norm": 0.19030767679214478, "learning_rate": 1.9405554902839778e-05, "loss": 0.5533, "step": 2332 }, { "epoch": 0.6469772601220188, "grad_norm": 0.1849377304315567, "learning_rate": 1.9400951667067542e-05, "loss": 0.571, "step": 2333 }, { "epoch": 0.6472545757071547, "grad_norm": 0.1965888887643814, "learning_rate": 1.9396347084736794e-05, "loss": 0.5591, "step": 2334 }, { "epoch": 0.6475318912922906, "grad_norm": 0.1921495646238327, "learning_rate": 1.9391741156746013e-05, "loss": 0.5723, "step": 2335 }, { "epoch": 0.6478092068774265, "grad_norm": 0.20073464512825012, "learning_rate": 1.9387133883993948e-05, "loss": 0.535, "step": 2336 }, { "epoch": 0.6480865224625624, "grad_norm": 0.18834145367145538, "learning_rate": 1.938252526737958e-05, "loss": 0.5256, "step": 2337 }, { "epoch": 0.6483638380476983, "grad_norm": 0.1929401457309723, "learning_rate": 1.9377915307802192e-05, "loss": 0.5204, "step": 2338 }, { "epoch": 0.6486411536328341, "grad_norm": 0.19703806936740875, "learning_rate": 1.9373304006161298e-05, "loss": 0.5603, "step": 2339 }, { "epoch": 0.64891846921797, "grad_norm": 0.205661803483963, "learning_rate": 1.9368691363356682e-05, "loss": 0.522, "step": 2340 }, { "epoch": 0.6491957848031059, "grad_norm": 0.19013790786266327, "learning_rate": 1.9364077380288408e-05, "loss": 0.549, "step": 2341 }, { "epoch": 0.6494731003882418, "grad_norm": 0.18775691092014313, "learning_rate": 1.935946205785677e-05, "loss": 0.5389, "step": 2342 }, { "epoch": 0.6497504159733777, "grad_norm": 0.18783038854599, "learning_rate": 1.9354845396962353e-05, "loss": 0.5378, "step": 2343 }, { "epoch": 0.6500277315585136, "grad_norm": 0.19113753736019135, "learning_rate": 1.9350227398505976e-05, "loss": 0.5461, "step": 2344 }, { "epoch": 0.6503050471436495, "grad_norm": 0.17977707087993622, "learning_rate": 1.9345608063388742e-05, "loss": 0.5512, "step": 2345 }, { "epoch": 0.6505823627287853, "grad_norm": 0.20450885593891144, "learning_rate": 1.9340987392512006e-05, "loss": 0.5501, "step": 2346 }, { "epoch": 0.6508596783139212, "grad_norm": 0.2484101504087448, "learning_rate": 1.9336365386777376e-05, "loss": 0.5223, "step": 2347 }, { "epoch": 0.6511369938990571, "grad_norm": 0.20386487245559692, "learning_rate": 1.9331742047086743e-05, "loss": 0.5262, "step": 2348 }, { "epoch": 0.651414309484193, "grad_norm": 0.18846935033798218, "learning_rate": 1.9327117374342223e-05, "loss": 0.5347, "step": 2349 }, { "epoch": 0.6516916250693289, "grad_norm": 0.18970006704330444, "learning_rate": 1.932249136944623e-05, "loss": 0.5683, "step": 2350 }, { "epoch": 0.6519689406544648, "grad_norm": 0.1962558776140213, "learning_rate": 1.9317864033301407e-05, "loss": 0.5791, "step": 2351 }, { "epoch": 0.6522462562396006, "grad_norm": 0.20231659710407257, "learning_rate": 1.9313235366810676e-05, "loss": 0.5627, "step": 2352 }, { "epoch": 0.6525235718247365, "grad_norm": 0.1958416849374771, "learning_rate": 1.9308605370877215e-05, "loss": 0.5375, "step": 2353 }, { "epoch": 0.6528008874098724, "grad_norm": 0.20812073349952698, "learning_rate": 1.9303974046404455e-05, "loss": 0.5299, "step": 2354 }, { "epoch": 0.6530782029950083, "grad_norm": 0.1921248584985733, "learning_rate": 1.929934139429609e-05, "loss": 0.5544, "step": 2355 }, { "epoch": 0.6533555185801442, "grad_norm": 0.20390520989894867, "learning_rate": 1.929470741545607e-05, "loss": 0.581, "step": 2356 }, { "epoch": 0.6536328341652801, "grad_norm": 0.18480364978313446, "learning_rate": 1.9290072110788616e-05, "loss": 0.5599, "step": 2357 }, { "epoch": 0.653910149750416, "grad_norm": 0.1953095942735672, "learning_rate": 1.928543548119819e-05, "loss": 0.5961, "step": 2358 }, { "epoch": 0.6541874653355518, "grad_norm": 0.19727711379528046, "learning_rate": 1.9280797527589527e-05, "loss": 0.5585, "step": 2359 }, { "epoch": 0.6544647809206877, "grad_norm": 0.19928644597530365, "learning_rate": 1.927615825086761e-05, "loss": 0.5522, "step": 2360 }, { "epoch": 0.6547420965058236, "grad_norm": 0.1891396939754486, "learning_rate": 1.9271517651937688e-05, "loss": 0.5586, "step": 2361 }, { "epoch": 0.6550194120909595, "grad_norm": 0.1949121057987213, "learning_rate": 1.9266875731705266e-05, "loss": 0.5307, "step": 2362 }, { "epoch": 0.6552967276760954, "grad_norm": 0.21069341897964478, "learning_rate": 1.9262232491076104e-05, "loss": 0.5546, "step": 2363 }, { "epoch": 0.6555740432612313, "grad_norm": 0.18490912020206451, "learning_rate": 1.925758793095622e-05, "loss": 0.5613, "step": 2364 }, { "epoch": 0.6558513588463671, "grad_norm": 0.1878899782896042, "learning_rate": 1.9252942052251892e-05, "loss": 0.5497, "step": 2365 }, { "epoch": 0.656128674431503, "grad_norm": 0.19967246055603027, "learning_rate": 1.9248294855869653e-05, "loss": 0.5327, "step": 2366 }, { "epoch": 0.6564059900166389, "grad_norm": 0.19277790188789368, "learning_rate": 1.9243646342716296e-05, "loss": 0.5545, "step": 2367 }, { "epoch": 0.6566833056017748, "grad_norm": 0.19045887887477875, "learning_rate": 1.9238996513698864e-05, "loss": 0.5071, "step": 2368 }, { "epoch": 0.6569606211869107, "grad_norm": 0.19380688667297363, "learning_rate": 1.923434536972467e-05, "loss": 0.5437, "step": 2369 }, { "epoch": 0.6572379367720466, "grad_norm": 0.18202729523181915, "learning_rate": 1.9229692911701275e-05, "loss": 0.5356, "step": 2370 }, { "epoch": 0.6575152523571824, "grad_norm": 0.18908429145812988, "learning_rate": 1.9225039140536488e-05, "loss": 0.5526, "step": 2371 }, { "epoch": 0.6577925679423183, "grad_norm": 0.21290616691112518, "learning_rate": 1.9220384057138386e-05, "loss": 0.5306, "step": 2372 }, { "epoch": 0.6580698835274542, "grad_norm": 0.18612539768218994, "learning_rate": 1.9215727662415303e-05, "loss": 0.5387, "step": 2373 }, { "epoch": 0.6583471991125901, "grad_norm": 0.182894766330719, "learning_rate": 1.9211069957275822e-05, "loss": 0.5365, "step": 2374 }, { "epoch": 0.658624514697726, "grad_norm": 0.1848146617412567, "learning_rate": 1.920641094262879e-05, "loss": 0.5423, "step": 2375 }, { "epoch": 0.6589018302828619, "grad_norm": 0.18521788716316223, "learning_rate": 1.92017506193833e-05, "loss": 0.5534, "step": 2376 }, { "epoch": 0.6591791458679978, "grad_norm": 0.1895231306552887, "learning_rate": 1.9197088988448703e-05, "loss": 0.5447, "step": 2377 }, { "epoch": 0.6594564614531336, "grad_norm": 0.18669599294662476, "learning_rate": 1.9192426050734608e-05, "loss": 0.5446, "step": 2378 }, { "epoch": 0.6597337770382695, "grad_norm": 0.1867615282535553, "learning_rate": 1.9187761807150878e-05, "loss": 0.5269, "step": 2379 }, { "epoch": 0.6600110926234054, "grad_norm": 0.19190800189971924, "learning_rate": 1.918309625860763e-05, "loss": 0.5833, "step": 2380 }, { "epoch": 0.6602884082085413, "grad_norm": 0.18087397515773773, "learning_rate": 1.917842940601524e-05, "loss": 0.5379, "step": 2381 }, { "epoch": 0.6605657237936772, "grad_norm": 0.18190105259418488, "learning_rate": 1.9173761250284324e-05, "loss": 0.5489, "step": 2382 }, { "epoch": 0.6608430393788131, "grad_norm": 0.19485372304916382, "learning_rate": 1.9169091792325777e-05, "loss": 0.5687, "step": 2383 }, { "epoch": 0.6611203549639489, "grad_norm": 0.19502972066402435, "learning_rate": 1.9164421033050724e-05, "loss": 0.5445, "step": 2384 }, { "epoch": 0.6613976705490848, "grad_norm": 0.18996240198612213, "learning_rate": 1.915974897337056e-05, "loss": 0.5621, "step": 2385 }, { "epoch": 0.6616749861342207, "grad_norm": 0.19751591980457306, "learning_rate": 1.915507561419692e-05, "loss": 0.5468, "step": 2386 }, { "epoch": 0.6619523017193566, "grad_norm": 0.2202579826116562, "learning_rate": 1.915040095644171e-05, "loss": 0.5515, "step": 2387 }, { "epoch": 0.6622296173044925, "grad_norm": 0.1802307367324829, "learning_rate": 1.914572500101707e-05, "loss": 0.5449, "step": 2388 }, { "epoch": 0.6625069328896284, "grad_norm": 0.18632298707962036, "learning_rate": 1.914104774883541e-05, "loss": 0.5612, "step": 2389 }, { "epoch": 0.6627842484747642, "grad_norm": 0.19929082691669464, "learning_rate": 1.9136369200809378e-05, "loss": 0.5789, "step": 2390 }, { "epoch": 0.6630615640599001, "grad_norm": 0.18650726974010468, "learning_rate": 1.913168935785189e-05, "loss": 0.5618, "step": 2391 }, { "epoch": 0.663338879645036, "grad_norm": 0.1912173330783844, "learning_rate": 1.912700822087611e-05, "loss": 0.5487, "step": 2392 }, { "epoch": 0.6636161952301719, "grad_norm": 0.19155828654766083, "learning_rate": 1.912232579079544e-05, "loss": 0.5421, "step": 2393 }, { "epoch": 0.6638935108153078, "grad_norm": 0.21673326194286346, "learning_rate": 1.9117642068523556e-05, "loss": 0.5302, "step": 2394 }, { "epoch": 0.6641708264004437, "grad_norm": 0.1941951960325241, "learning_rate": 1.9112957054974373e-05, "loss": 0.5558, "step": 2395 }, { "epoch": 0.6644481419855796, "grad_norm": 0.18575075268745422, "learning_rate": 1.9108270751062064e-05, "loss": 0.5737, "step": 2396 }, { "epoch": 0.6647254575707154, "grad_norm": 0.19413797557353973, "learning_rate": 1.9103583157701046e-05, "loss": 0.5559, "step": 2397 }, { "epoch": 0.6650027731558513, "grad_norm": 0.19030508399009705, "learning_rate": 1.9098894275805994e-05, "loss": 0.5519, "step": 2398 }, { "epoch": 0.6652800887409872, "grad_norm": 0.18635134398937225, "learning_rate": 1.9094204106291842e-05, "loss": 0.5203, "step": 2399 }, { "epoch": 0.6655574043261231, "grad_norm": 0.1943938434123993, "learning_rate": 1.908951265007375e-05, "loss": 0.5664, "step": 2400 }, { "epoch": 0.665834719911259, "grad_norm": 0.2069421112537384, "learning_rate": 1.9084819908067156e-05, "loss": 0.5561, "step": 2401 }, { "epoch": 0.6661120354963949, "grad_norm": 0.18940883874893188, "learning_rate": 1.9080125881187737e-05, "loss": 0.5833, "step": 2402 }, { "epoch": 0.6663893510815307, "grad_norm": 0.1828288733959198, "learning_rate": 1.907543057035142e-05, "loss": 0.5478, "step": 2403 }, { "epoch": 0.6666666666666666, "grad_norm": 0.18825417757034302, "learning_rate": 1.907073397647439e-05, "loss": 0.5362, "step": 2404 }, { "epoch": 0.6669439822518025, "grad_norm": 0.19599126279354095, "learning_rate": 1.906603610047307e-05, "loss": 0.5425, "step": 2405 }, { "epoch": 0.6672212978369384, "grad_norm": 0.1776115894317627, "learning_rate": 1.9061336943264145e-05, "loss": 0.5395, "step": 2406 }, { "epoch": 0.6674986134220743, "grad_norm": 0.20096100866794586, "learning_rate": 1.905663650576454e-05, "loss": 0.547, "step": 2407 }, { "epoch": 0.6677759290072102, "grad_norm": 0.1988787204027176, "learning_rate": 1.9051934788891443e-05, "loss": 0.5336, "step": 2408 }, { "epoch": 0.668053244592346, "grad_norm": 0.19204209744930267, "learning_rate": 1.9047231793562276e-05, "loss": 0.5449, "step": 2409 }, { "epoch": 0.6683305601774819, "grad_norm": 0.1779128760099411, "learning_rate": 1.904252752069472e-05, "loss": 0.5023, "step": 2410 }, { "epoch": 0.6686078757626178, "grad_norm": 0.2877557575702667, "learning_rate": 1.9037821971206703e-05, "loss": 0.5446, "step": 2411 }, { "epoch": 0.6688851913477537, "grad_norm": 0.19600874185562134, "learning_rate": 1.90331151460164e-05, "loss": 0.5275, "step": 2412 }, { "epoch": 0.6691625069328896, "grad_norm": 0.28246966004371643, "learning_rate": 1.9028407046042246e-05, "loss": 0.5321, "step": 2413 }, { "epoch": 0.6694398225180255, "grad_norm": 0.1964629739522934, "learning_rate": 1.9023697672202905e-05, "loss": 0.5727, "step": 2414 }, { "epoch": 0.6697171381031614, "grad_norm": 0.17945913970470428, "learning_rate": 1.901898702541731e-05, "loss": 0.5237, "step": 2415 }, { "epoch": 0.6699944536882972, "grad_norm": 0.190501868724823, "learning_rate": 1.901427510660463e-05, "loss": 0.5491, "step": 2416 }, { "epoch": 0.6702717692734331, "grad_norm": 0.19791793823242188, "learning_rate": 1.9009561916684282e-05, "loss": 0.5586, "step": 2417 }, { "epoch": 0.670549084858569, "grad_norm": 0.18914659321308136, "learning_rate": 1.900484745657594e-05, "loss": 0.5302, "step": 2418 }, { "epoch": 0.6708264004437049, "grad_norm": 0.1981426477432251, "learning_rate": 1.9000131727199513e-05, "loss": 0.5609, "step": 2419 }, { "epoch": 0.6711037160288408, "grad_norm": 0.1939757615327835, "learning_rate": 1.8995414729475165e-05, "loss": 0.5749, "step": 2420 }, { "epoch": 0.6713810316139767, "grad_norm": 0.19931401312351227, "learning_rate": 1.899069646432332e-05, "loss": 0.547, "step": 2421 }, { "epoch": 0.6716583471991125, "grad_norm": 0.19219137728214264, "learning_rate": 1.898597693266462e-05, "loss": 0.5426, "step": 2422 }, { "epoch": 0.6719356627842484, "grad_norm": 0.199588343501091, "learning_rate": 1.898125613541998e-05, "loss": 0.5411, "step": 2423 }, { "epoch": 0.6722129783693843, "grad_norm": 0.19906532764434814, "learning_rate": 1.897653407351055e-05, "loss": 0.5707, "step": 2424 }, { "epoch": 0.6724902939545202, "grad_norm": 0.18722088634967804, "learning_rate": 1.8971810747857726e-05, "loss": 0.5623, "step": 2425 }, { "epoch": 0.6727676095396561, "grad_norm": 0.18101942539215088, "learning_rate": 1.8967086159383162e-05, "loss": 0.5519, "step": 2426 }, { "epoch": 0.673044925124792, "grad_norm": 0.19272929430007935, "learning_rate": 1.8962360309008746e-05, "loss": 0.5413, "step": 2427 }, { "epoch": 0.6733222407099279, "grad_norm": 0.19695578515529633, "learning_rate": 1.8957633197656615e-05, "loss": 0.5299, "step": 2428 }, { "epoch": 0.6735995562950637, "grad_norm": 0.19021473824977875, "learning_rate": 1.8952904826249158e-05, "loss": 0.5453, "step": 2429 }, { "epoch": 0.6738768718801996, "grad_norm": 0.1905011683702469, "learning_rate": 1.8948175195709e-05, "loss": 0.5458, "step": 2430 }, { "epoch": 0.6741541874653355, "grad_norm": 0.1979636549949646, "learning_rate": 1.8943444306959017e-05, "loss": 0.5379, "step": 2431 }, { "epoch": 0.6744315030504714, "grad_norm": 0.19846384227275848, "learning_rate": 1.8938712160922343e-05, "loss": 0.5164, "step": 2432 }, { "epoch": 0.6747088186356073, "grad_norm": 0.18356280028820038, "learning_rate": 1.893397875852233e-05, "loss": 0.5429, "step": 2433 }, { "epoch": 0.6749861342207432, "grad_norm": 0.1836164891719818, "learning_rate": 1.8929244100682597e-05, "loss": 0.5569, "step": 2434 }, { "epoch": 0.675263449805879, "grad_norm": 0.20087088644504547, "learning_rate": 1.8924508188327e-05, "loss": 0.5676, "step": 2435 }, { "epoch": 0.6755407653910149, "grad_norm": 0.1871204376220703, "learning_rate": 1.891977102237964e-05, "loss": 0.5314, "step": 2436 }, { "epoch": 0.6758180809761508, "grad_norm": 0.19372668862342834, "learning_rate": 1.891503260376487e-05, "loss": 0.543, "step": 2437 }, { "epoch": 0.6760953965612867, "grad_norm": 0.20033282041549683, "learning_rate": 1.891029293340727e-05, "loss": 0.5395, "step": 2438 }, { "epoch": 0.6763727121464226, "grad_norm": 0.1941455453634262, "learning_rate": 1.8905552012231684e-05, "loss": 0.5775, "step": 2439 }, { "epoch": 0.6766500277315585, "grad_norm": 0.18717962503433228, "learning_rate": 1.890080984116319e-05, "loss": 0.5305, "step": 2440 }, { "epoch": 0.6769273433166944, "grad_norm": 0.1877082884311676, "learning_rate": 1.8896066421127106e-05, "loss": 0.5438, "step": 2441 }, { "epoch": 0.6772046589018302, "grad_norm": 0.19558964669704437, "learning_rate": 1.8891321753049008e-05, "loss": 0.5452, "step": 2442 }, { "epoch": 0.6774819744869661, "grad_norm": 0.19550803303718567, "learning_rate": 1.8886575837854696e-05, "loss": 0.5409, "step": 2443 }, { "epoch": 0.677759290072102, "grad_norm": 0.18728572130203247, "learning_rate": 1.888182867647023e-05, "loss": 0.5572, "step": 2444 }, { "epoch": 0.6780366056572379, "grad_norm": 0.184623122215271, "learning_rate": 1.8877080269821906e-05, "loss": 0.565, "step": 2445 }, { "epoch": 0.6783139212423738, "grad_norm": 0.19215163588523865, "learning_rate": 1.8872330618836265e-05, "loss": 0.532, "step": 2446 }, { "epoch": 0.6785912368275097, "grad_norm": 0.19430597126483917, "learning_rate": 1.886757972444009e-05, "loss": 0.5409, "step": 2447 }, { "epoch": 0.6788685524126455, "grad_norm": 0.1996561586856842, "learning_rate": 1.88628275875604e-05, "loss": 0.5514, "step": 2448 }, { "epoch": 0.6791458679977814, "grad_norm": 0.20458458364009857, "learning_rate": 1.8858074209124473e-05, "loss": 0.5566, "step": 2449 }, { "epoch": 0.6794231835829173, "grad_norm": 0.20288583636283875, "learning_rate": 1.885331959005981e-05, "loss": 0.5237, "step": 2450 }, { "epoch": 0.6797004991680532, "grad_norm": 0.18585625290870667, "learning_rate": 1.8848563731294172e-05, "loss": 0.53, "step": 2451 }, { "epoch": 0.6799778147531891, "grad_norm": 0.22308149933815002, "learning_rate": 1.8843806633755544e-05, "loss": 0.5513, "step": 2452 }, { "epoch": 0.680255130338325, "grad_norm": 0.18869346380233765, "learning_rate": 1.8839048298372165e-05, "loss": 0.5527, "step": 2453 }, { "epoch": 0.6805324459234608, "grad_norm": 0.1881789267063141, "learning_rate": 1.8834288726072513e-05, "loss": 0.5368, "step": 2454 }, { "epoch": 0.6808097615085967, "grad_norm": 0.18957830965518951, "learning_rate": 1.882952791778531e-05, "loss": 0.5235, "step": 2455 }, { "epoch": 0.6810870770937326, "grad_norm": 0.1874406933784485, "learning_rate": 1.882476587443951e-05, "loss": 0.5164, "step": 2456 }, { "epoch": 0.6813643926788685, "grad_norm": 0.19134515523910522, "learning_rate": 1.8820002596964316e-05, "loss": 0.5606, "step": 2457 }, { "epoch": 0.6816417082640044, "grad_norm": 0.18497633934020996, "learning_rate": 1.881523808628917e-05, "loss": 0.5681, "step": 2458 }, { "epoch": 0.6819190238491403, "grad_norm": 0.18682947754859924, "learning_rate": 1.881047234334376e-05, "loss": 0.55, "step": 2459 }, { "epoch": 0.6821963394342762, "grad_norm": 0.1982649862766266, "learning_rate": 1.8805705369057993e-05, "loss": 0.5321, "step": 2460 }, { "epoch": 0.682473655019412, "grad_norm": 0.19017384946346283, "learning_rate": 1.880093716436205e-05, "loss": 0.5295, "step": 2461 }, { "epoch": 0.6827509706045479, "grad_norm": 0.20334112644195557, "learning_rate": 1.8796167730186322e-05, "loss": 0.5601, "step": 2462 }, { "epoch": 0.6830282861896838, "grad_norm": 0.1974753588438034, "learning_rate": 1.8791397067461457e-05, "loss": 0.5572, "step": 2463 }, { "epoch": 0.6833056017748197, "grad_norm": 0.17885488271713257, "learning_rate": 1.878662517711834e-05, "loss": 0.5245, "step": 2464 }, { "epoch": 0.6835829173599556, "grad_norm": 0.18409696221351624, "learning_rate": 1.8781852060088083e-05, "loss": 0.5321, "step": 2465 }, { "epoch": 0.6838602329450915, "grad_norm": 0.19201841950416565, "learning_rate": 1.877707771730206e-05, "loss": 0.5563, "step": 2466 }, { "epoch": 0.6841375485302273, "grad_norm": 0.1807066947221756, "learning_rate": 1.8772302149691866e-05, "loss": 0.5253, "step": 2467 }, { "epoch": 0.6844148641153632, "grad_norm": 0.18462277948856354, "learning_rate": 1.8767525358189343e-05, "loss": 0.5315, "step": 2468 }, { "epoch": 0.6846921797004991, "grad_norm": 0.18250201642513275, "learning_rate": 1.876274734372656e-05, "loss": 0.5383, "step": 2469 }, { "epoch": 0.684969495285635, "grad_norm": 0.1986282765865326, "learning_rate": 1.8757968107235853e-05, "loss": 0.5282, "step": 2470 }, { "epoch": 0.6852468108707709, "grad_norm": 0.26469552516937256, "learning_rate": 1.8753187649649757e-05, "loss": 0.5564, "step": 2471 }, { "epoch": 0.6855241264559068, "grad_norm": 0.19594305753707886, "learning_rate": 1.874840597190108e-05, "loss": 0.5546, "step": 2472 }, { "epoch": 0.6858014420410427, "grad_norm": 0.18754026293754578, "learning_rate": 1.8743623074922843e-05, "loss": 0.5309, "step": 2473 }, { "epoch": 0.6860787576261785, "grad_norm": 0.1846148520708084, "learning_rate": 1.873883895964833e-05, "loss": 0.5288, "step": 2474 }, { "epoch": 0.6863560732113144, "grad_norm": 0.18642264604568481, "learning_rate": 1.873405362701104e-05, "loss": 0.5316, "step": 2475 }, { "epoch": 0.6866333887964503, "grad_norm": 0.20203615725040436, "learning_rate": 1.8729267077944717e-05, "loss": 0.5235, "step": 2476 }, { "epoch": 0.6869107043815862, "grad_norm": 0.18540050089359283, "learning_rate": 1.872447931338335e-05, "loss": 0.5488, "step": 2477 }, { "epoch": 0.6871880199667221, "grad_norm": 0.19175854325294495, "learning_rate": 1.8719690334261148e-05, "loss": 0.5529, "step": 2478 }, { "epoch": 0.687465335551858, "grad_norm": 0.18168555200099945, "learning_rate": 1.8714900141512574e-05, "loss": 0.5119, "step": 2479 }, { "epoch": 0.687742651136994, "grad_norm": 0.1855335384607315, "learning_rate": 1.871010873607233e-05, "loss": 0.5448, "step": 2480 }, { "epoch": 0.6880199667221298, "grad_norm": 0.1834007054567337, "learning_rate": 1.870531611887533e-05, "loss": 0.5583, "step": 2481 }, { "epoch": 0.6882972823072657, "grad_norm": 0.1926104575395584, "learning_rate": 1.870052229085675e-05, "loss": 0.5549, "step": 2482 }, { "epoch": 0.6885745978924016, "grad_norm": 0.19106236100196838, "learning_rate": 1.8695727252951995e-05, "loss": 0.5146, "step": 2483 }, { "epoch": 0.6888519134775375, "grad_norm": 0.18813811242580414, "learning_rate": 1.8690931006096695e-05, "loss": 0.5773, "step": 2484 }, { "epoch": 0.6891292290626734, "grad_norm": 0.19836729764938354, "learning_rate": 1.8686133551226735e-05, "loss": 0.5793, "step": 2485 }, { "epoch": 0.6894065446478093, "grad_norm": 0.1816731095314026, "learning_rate": 1.8681334889278217e-05, "loss": 0.5205, "step": 2486 }, { "epoch": 0.6896838602329451, "grad_norm": 0.1861170530319214, "learning_rate": 1.8676535021187495e-05, "loss": 0.5589, "step": 2487 }, { "epoch": 0.689961175818081, "grad_norm": 0.18680687248706818, "learning_rate": 1.867173394789114e-05, "loss": 0.5283, "step": 2488 }, { "epoch": 0.6902384914032169, "grad_norm": 0.19478975236415863, "learning_rate": 1.866693167032598e-05, "loss": 0.5559, "step": 2489 }, { "epoch": 0.6905158069883528, "grad_norm": 0.19333085417747498, "learning_rate": 1.8662128189429058e-05, "loss": 0.516, "step": 2490 }, { "epoch": 0.6907931225734887, "grad_norm": 0.19102855026721954, "learning_rate": 1.8657323506137668e-05, "loss": 0.5587, "step": 2491 }, { "epoch": 0.6910704381586246, "grad_norm": 0.19927440583705902, "learning_rate": 1.8652517621389324e-05, "loss": 0.5475, "step": 2492 }, { "epoch": 0.6913477537437605, "grad_norm": 0.19673167169094086, "learning_rate": 1.8647710536121784e-05, "loss": 0.5504, "step": 2493 }, { "epoch": 0.6916250693288963, "grad_norm": 0.19291207194328308, "learning_rate": 1.8642902251273038e-05, "loss": 0.555, "step": 2494 }, { "epoch": 0.6919023849140322, "grad_norm": 0.22686271369457245, "learning_rate": 1.863809276778131e-05, "loss": 0.5169, "step": 2495 }, { "epoch": 0.6921797004991681, "grad_norm": 0.19432714581489563, "learning_rate": 1.8633282086585057e-05, "loss": 0.5287, "step": 2496 }, { "epoch": 0.692457016084304, "grad_norm": 0.18512091040611267, "learning_rate": 1.8628470208622972e-05, "loss": 0.5452, "step": 2497 }, { "epoch": 0.6927343316694399, "grad_norm": 0.18711940944194794, "learning_rate": 1.8623657134833976e-05, "loss": 0.5489, "step": 2498 }, { "epoch": 0.6930116472545758, "grad_norm": 0.19019177556037903, "learning_rate": 1.8618842866157234e-05, "loss": 0.5188, "step": 2499 }, { "epoch": 0.6932889628397116, "grad_norm": 0.18888919055461884, "learning_rate": 1.861402740353213e-05, "loss": 0.5563, "step": 2500 }, { "epoch": 0.6935662784248475, "grad_norm": 0.19487237930297852, "learning_rate": 1.8609210747898293e-05, "loss": 0.5627, "step": 2501 }, { "epoch": 0.6938435940099834, "grad_norm": 0.18301233649253845, "learning_rate": 1.8604392900195573e-05, "loss": 0.5539, "step": 2502 }, { "epoch": 0.6941209095951193, "grad_norm": 0.18450453877449036, "learning_rate": 1.8599573861364074e-05, "loss": 0.5238, "step": 2503 }, { "epoch": 0.6943982251802552, "grad_norm": 0.19734638929367065, "learning_rate": 1.8594753632344104e-05, "loss": 0.5619, "step": 2504 }, { "epoch": 0.6946755407653911, "grad_norm": 0.1974724531173706, "learning_rate": 1.858993221407622e-05, "loss": 0.5805, "step": 2505 }, { "epoch": 0.694952856350527, "grad_norm": 0.20074540376663208, "learning_rate": 1.858510960750122e-05, "loss": 0.5515, "step": 2506 }, { "epoch": 0.6952301719356628, "grad_norm": 0.18881772458553314, "learning_rate": 1.8580285813560104e-05, "loss": 0.5371, "step": 2507 }, { "epoch": 0.6955074875207987, "grad_norm": 0.19910936057567596, "learning_rate": 1.8575460833194142e-05, "loss": 0.5288, "step": 2508 }, { "epoch": 0.6957848031059346, "grad_norm": 0.18827465176582336, "learning_rate": 1.8570634667344795e-05, "loss": 0.5591, "step": 2509 }, { "epoch": 0.6960621186910705, "grad_norm": 0.18730634450912476, "learning_rate": 1.8565807316953796e-05, "loss": 0.5126, "step": 2510 }, { "epoch": 0.6963394342762064, "grad_norm": 0.18552148342132568, "learning_rate": 1.856097878296307e-05, "loss": 0.5208, "step": 2511 }, { "epoch": 0.6966167498613423, "grad_norm": 0.18370574712753296, "learning_rate": 1.8556149066314803e-05, "loss": 0.5122, "step": 2512 }, { "epoch": 0.6968940654464781, "grad_norm": 0.19410766661167145, "learning_rate": 1.8551318167951403e-05, "loss": 0.5062, "step": 2513 }, { "epoch": 0.697171381031614, "grad_norm": 0.1899997889995575, "learning_rate": 1.85464860888155e-05, "loss": 0.5355, "step": 2514 }, { "epoch": 0.6974486966167499, "grad_norm": 0.2276785969734192, "learning_rate": 1.854165282984996e-05, "loss": 0.5434, "step": 2515 }, { "epoch": 0.6977260122018858, "grad_norm": 0.20629100501537323, "learning_rate": 1.8536818391997884e-05, "loss": 0.5434, "step": 2516 }, { "epoch": 0.6980033277870217, "grad_norm": 0.1940404623746872, "learning_rate": 1.8531982776202598e-05, "loss": 0.5566, "step": 2517 }, { "epoch": 0.6982806433721576, "grad_norm": 0.18856097757816315, "learning_rate": 1.8527145983407658e-05, "loss": 0.5414, "step": 2518 }, { "epoch": 0.6985579589572934, "grad_norm": 0.18844252824783325, "learning_rate": 1.8522308014556843e-05, "loss": 0.5535, "step": 2519 }, { "epoch": 0.6988352745424293, "grad_norm": 0.19054186344146729, "learning_rate": 1.8517468870594188e-05, "loss": 0.5436, "step": 2520 }, { "epoch": 0.6991125901275652, "grad_norm": 0.1948595494031906, "learning_rate": 1.8512628552463917e-05, "loss": 0.5589, "step": 2521 }, { "epoch": 0.6993899057127011, "grad_norm": 0.2020605355501175, "learning_rate": 1.850778706111052e-05, "loss": 0.5586, "step": 2522 }, { "epoch": 0.699667221297837, "grad_norm": 0.1909698247909546, "learning_rate": 1.8502944397478693e-05, "loss": 0.5231, "step": 2523 }, { "epoch": 0.6999445368829729, "grad_norm": 0.19061705470085144, "learning_rate": 1.849810056251337e-05, "loss": 0.5436, "step": 2524 }, { "epoch": 0.7002218524681088, "grad_norm": 0.18788163363933563, "learning_rate": 1.8493255557159704e-05, "loss": 0.5223, "step": 2525 }, { "epoch": 0.7004991680532446, "grad_norm": 0.18889035284519196, "learning_rate": 1.8488409382363095e-05, "loss": 0.5178, "step": 2526 }, { "epoch": 0.7007764836383805, "grad_norm": 0.18843677639961243, "learning_rate": 1.8483562039069157e-05, "loss": 0.5535, "step": 2527 }, { "epoch": 0.7010537992235164, "grad_norm": 0.195985808968544, "learning_rate": 1.847871352822373e-05, "loss": 0.5592, "step": 2528 }, { "epoch": 0.7013311148086523, "grad_norm": 0.1854049116373062, "learning_rate": 1.8473863850772897e-05, "loss": 0.5272, "step": 2529 }, { "epoch": 0.7016084303937882, "grad_norm": 0.17917431890964508, "learning_rate": 1.8469013007662946e-05, "loss": 0.5239, "step": 2530 }, { "epoch": 0.7018857459789241, "grad_norm": 0.18059198558330536, "learning_rate": 1.8464160999840417e-05, "loss": 0.565, "step": 2531 }, { "epoch": 0.7021630615640599, "grad_norm": 0.18871940672397614, "learning_rate": 1.8459307828252052e-05, "loss": 0.5422, "step": 2532 }, { "epoch": 0.7024403771491958, "grad_norm": 0.1852055937051773, "learning_rate": 1.845445349384485e-05, "loss": 0.5211, "step": 2533 }, { "epoch": 0.7027176927343317, "grad_norm": 0.1867264062166214, "learning_rate": 1.8449597997566005e-05, "loss": 0.5318, "step": 2534 }, { "epoch": 0.7029950083194676, "grad_norm": 0.1850994974374771, "learning_rate": 1.844474134036296e-05, "loss": 0.5375, "step": 2535 }, { "epoch": 0.7032723239046035, "grad_norm": 0.188653826713562, "learning_rate": 1.8439883523183377e-05, "loss": 0.5223, "step": 2536 }, { "epoch": 0.7035496394897394, "grad_norm": 0.19549886882305145, "learning_rate": 1.8435024546975142e-05, "loss": 0.5946, "step": 2537 }, { "epoch": 0.7038269550748752, "grad_norm": 0.19603100419044495, "learning_rate": 1.8430164412686375e-05, "loss": 0.5579, "step": 2538 }, { "epoch": 0.7041042706600111, "grad_norm": 0.27463892102241516, "learning_rate": 1.8425303121265414e-05, "loss": 0.561, "step": 2539 }, { "epoch": 0.704381586245147, "grad_norm": 0.1948249191045761, "learning_rate": 1.842044067366082e-05, "loss": 0.5715, "step": 2540 }, { "epoch": 0.7046589018302829, "grad_norm": 0.191276416182518, "learning_rate": 1.8415577070821398e-05, "loss": 0.5448, "step": 2541 }, { "epoch": 0.7049362174154188, "grad_norm": 0.19777309894561768, "learning_rate": 1.841071231369616e-05, "loss": 0.5333, "step": 2542 }, { "epoch": 0.7052135330005547, "grad_norm": 0.1844799816608429, "learning_rate": 1.8405846403234346e-05, "loss": 0.5455, "step": 2543 }, { "epoch": 0.7054908485856906, "grad_norm": 0.17811760306358337, "learning_rate": 1.840097934038543e-05, "loss": 0.5307, "step": 2544 }, { "epoch": 0.7057681641708264, "grad_norm": 0.1874844878911972, "learning_rate": 1.8396111126099094e-05, "loss": 0.5755, "step": 2545 }, { "epoch": 0.7060454797559623, "grad_norm": 0.20174047350883484, "learning_rate": 1.839124176132527e-05, "loss": 0.5422, "step": 2546 }, { "epoch": 0.7063227953410982, "grad_norm": 0.1862955093383789, "learning_rate": 1.838637124701409e-05, "loss": 0.5469, "step": 2547 }, { "epoch": 0.7066001109262341, "grad_norm": 0.18790222704410553, "learning_rate": 1.8381499584115924e-05, "loss": 0.5472, "step": 2548 }, { "epoch": 0.70687742651137, "grad_norm": 0.17207522690296173, "learning_rate": 1.8376626773581358e-05, "loss": 0.5074, "step": 2549 }, { "epoch": 0.7071547420965059, "grad_norm": 0.19320419430732727, "learning_rate": 1.8371752816361215e-05, "loss": 0.5416, "step": 2550 }, { "epoch": 0.7074320576816417, "grad_norm": 0.19113574922084808, "learning_rate": 1.8366877713406526e-05, "loss": 0.533, "step": 2551 }, { "epoch": 0.7077093732667776, "grad_norm": 0.19141121208667755, "learning_rate": 1.8362001465668554e-05, "loss": 0.5435, "step": 2552 }, { "epoch": 0.7079866888519135, "grad_norm": 0.17392635345458984, "learning_rate": 1.8357124074098788e-05, "loss": 0.5126, "step": 2553 }, { "epoch": 0.7082640044370494, "grad_norm": 0.18224339187145233, "learning_rate": 1.8352245539648933e-05, "loss": 0.5369, "step": 2554 }, { "epoch": 0.7085413200221853, "grad_norm": 0.193458691239357, "learning_rate": 1.834736586327092e-05, "loss": 0.5581, "step": 2555 }, { "epoch": 0.7088186356073212, "grad_norm": 0.19396451115608215, "learning_rate": 1.8342485045916902e-05, "loss": 0.546, "step": 2556 }, { "epoch": 0.709095951192457, "grad_norm": 0.2662739157676697, "learning_rate": 1.8337603088539263e-05, "loss": 0.5557, "step": 2557 }, { "epoch": 0.7093732667775929, "grad_norm": 0.18841521441936493, "learning_rate": 1.8332719992090592e-05, "loss": 0.5359, "step": 2558 }, { "epoch": 0.7096505823627288, "grad_norm": 0.20101507008075714, "learning_rate": 1.8327835757523716e-05, "loss": 0.54, "step": 2559 }, { "epoch": 0.7099278979478647, "grad_norm": 0.17808422446250916, "learning_rate": 1.832295038579168e-05, "loss": 0.5263, "step": 2560 }, { "epoch": 0.7102052135330006, "grad_norm": 0.19543784856796265, "learning_rate": 1.8318063877847747e-05, "loss": 0.5342, "step": 2561 }, { "epoch": 0.7104825291181365, "grad_norm": 0.1944831758737564, "learning_rate": 1.8313176234645406e-05, "loss": 0.4973, "step": 2562 }, { "epoch": 0.7107598447032724, "grad_norm": 0.19557087123394012, "learning_rate": 1.8308287457138362e-05, "loss": 0.533, "step": 2563 }, { "epoch": 0.7110371602884082, "grad_norm": 0.19200831651687622, "learning_rate": 1.8303397546280547e-05, "loss": 0.5417, "step": 2564 }, { "epoch": 0.7113144758735441, "grad_norm": 0.1837347149848938, "learning_rate": 1.829850650302612e-05, "loss": 0.5594, "step": 2565 }, { "epoch": 0.71159179145868, "grad_norm": 0.1859401911497116, "learning_rate": 1.8293614328329437e-05, "loss": 0.5249, "step": 2566 }, { "epoch": 0.7118691070438159, "grad_norm": 0.18670018017292023, "learning_rate": 1.8288721023145105e-05, "loss": 0.5339, "step": 2567 }, { "epoch": 0.7121464226289518, "grad_norm": 0.19364280998706818, "learning_rate": 1.8283826588427927e-05, "loss": 0.5729, "step": 2568 }, { "epoch": 0.7124237382140877, "grad_norm": 0.19787278771400452, "learning_rate": 1.827893102513295e-05, "loss": 0.5544, "step": 2569 }, { "epoch": 0.7127010537992235, "grad_norm": 0.19540858268737793, "learning_rate": 1.827403433421541e-05, "loss": 0.5233, "step": 2570 }, { "epoch": 0.7129783693843594, "grad_norm": 0.20480972528457642, "learning_rate": 1.8269136516630798e-05, "loss": 0.545, "step": 2571 }, { "epoch": 0.7132556849694953, "grad_norm": 0.20005930960178375, "learning_rate": 1.82642375733348e-05, "loss": 0.561, "step": 2572 }, { "epoch": 0.7135330005546312, "grad_norm": 0.18213188648223877, "learning_rate": 1.825933750528333e-05, "loss": 0.5277, "step": 2573 }, { "epoch": 0.7138103161397671, "grad_norm": 0.19505798816680908, "learning_rate": 1.8254436313432522e-05, "loss": 0.5283, "step": 2574 }, { "epoch": 0.714087631724903, "grad_norm": 0.1885930299758911, "learning_rate": 1.824953399873873e-05, "loss": 0.5189, "step": 2575 }, { "epoch": 0.7143649473100389, "grad_norm": 0.19202375411987305, "learning_rate": 1.824463056215852e-05, "loss": 0.5197, "step": 2576 }, { "epoch": 0.7146422628951747, "grad_norm": 0.19426311552524567, "learning_rate": 1.823972600464869e-05, "loss": 0.5243, "step": 2577 }, { "epoch": 0.7149195784803106, "grad_norm": 0.18350745737552643, "learning_rate": 1.8234820327166244e-05, "loss": 0.5283, "step": 2578 }, { "epoch": 0.7151968940654465, "grad_norm": 0.1888923943042755, "learning_rate": 1.822991353066841e-05, "loss": 0.5298, "step": 2579 }, { "epoch": 0.7154742096505824, "grad_norm": 0.19120195508003235, "learning_rate": 1.8225005616112636e-05, "loss": 0.5583, "step": 2580 }, { "epoch": 0.7157515252357183, "grad_norm": 0.18390871584415436, "learning_rate": 1.8220096584456587e-05, "loss": 0.5174, "step": 2581 }, { "epoch": 0.7160288408208542, "grad_norm": 0.19268232583999634, "learning_rate": 1.8215186436658142e-05, "loss": 0.5706, "step": 2582 }, { "epoch": 0.71630615640599, "grad_norm": 0.2088870108127594, "learning_rate": 1.82102751736754e-05, "loss": 0.5467, "step": 2583 }, { "epoch": 0.7165834719911259, "grad_norm": 0.18982863426208496, "learning_rate": 1.8205362796466682e-05, "loss": 0.547, "step": 2584 }, { "epoch": 0.7168607875762618, "grad_norm": 0.20448660850524902, "learning_rate": 1.820044930599052e-05, "loss": 0.5523, "step": 2585 }, { "epoch": 0.7171381031613977, "grad_norm": 0.19304388761520386, "learning_rate": 1.8195534703205674e-05, "loss": 0.5766, "step": 2586 }, { "epoch": 0.7174154187465336, "grad_norm": 0.20076531171798706, "learning_rate": 1.81906189890711e-05, "loss": 0.5327, "step": 2587 }, { "epoch": 0.7176927343316695, "grad_norm": 0.24745801091194153, "learning_rate": 1.8185702164546e-05, "loss": 0.5386, "step": 2588 }, { "epoch": 0.7179700499168054, "grad_norm": 0.1958095282316208, "learning_rate": 1.8180784230589758e-05, "loss": 0.5592, "step": 2589 }, { "epoch": 0.7182473655019412, "grad_norm": 0.1962784081697464, "learning_rate": 1.8175865188162007e-05, "loss": 0.5547, "step": 2590 }, { "epoch": 0.7185246810870771, "grad_norm": 0.21214796602725983, "learning_rate": 1.8170945038222577e-05, "loss": 0.5789, "step": 2591 }, { "epoch": 0.718801996672213, "grad_norm": 0.18657919764518738, "learning_rate": 1.8166023781731523e-05, "loss": 0.5909, "step": 2592 }, { "epoch": 0.7190793122573489, "grad_norm": 0.1949455291032791, "learning_rate": 1.816110141964911e-05, "loss": 0.5804, "step": 2593 }, { "epoch": 0.7193566278424848, "grad_norm": 0.20440496504306793, "learning_rate": 1.8156177952935824e-05, "loss": 0.5819, "step": 2594 }, { "epoch": 0.7196339434276207, "grad_norm": 0.19375431537628174, "learning_rate": 1.815125338255236e-05, "loss": 0.5707, "step": 2595 }, { "epoch": 0.7199112590127565, "grad_norm": 0.2573017477989197, "learning_rate": 1.8146327709459635e-05, "loss": 0.5622, "step": 2596 }, { "epoch": 0.7201885745978924, "grad_norm": 0.19676506519317627, "learning_rate": 1.8141400934618775e-05, "loss": 0.5668, "step": 2597 }, { "epoch": 0.7204658901830283, "grad_norm": 0.19473743438720703, "learning_rate": 1.8136473058991126e-05, "loss": 0.5654, "step": 2598 }, { "epoch": 0.7207432057681642, "grad_norm": 0.18709680438041687, "learning_rate": 1.8131544083538253e-05, "loss": 0.5283, "step": 2599 }, { "epoch": 0.7210205213533001, "grad_norm": 0.19263465702533722, "learning_rate": 1.812661400922192e-05, "loss": 0.5379, "step": 2600 }, { "epoch": 0.721297836938436, "grad_norm": 0.19263778626918793, "learning_rate": 1.8121682837004118e-05, "loss": 0.5678, "step": 2601 }, { "epoch": 0.7215751525235718, "grad_norm": 0.19861197471618652, "learning_rate": 1.8116750567847058e-05, "loss": 0.5456, "step": 2602 }, { "epoch": 0.7218524681087077, "grad_norm": 0.1754927933216095, "learning_rate": 1.8111817202713143e-05, "loss": 0.5164, "step": 2603 }, { "epoch": 0.7221297836938436, "grad_norm": 0.19112901389598846, "learning_rate": 1.8106882742565008e-05, "loss": 0.5362, "step": 2604 }, { "epoch": 0.7224070992789795, "grad_norm": 0.1980351209640503, "learning_rate": 1.8101947188365503e-05, "loss": 0.5687, "step": 2605 }, { "epoch": 0.7226844148641154, "grad_norm": 0.19299866259098053, "learning_rate": 1.8097010541077678e-05, "loss": 0.5589, "step": 2606 }, { "epoch": 0.7229617304492513, "grad_norm": 0.19257931411266327, "learning_rate": 1.809207280166481e-05, "loss": 0.5701, "step": 2607 }, { "epoch": 0.7232390460343872, "grad_norm": 0.1912074089050293, "learning_rate": 1.8087133971090374e-05, "loss": 0.544, "step": 2608 }, { "epoch": 0.723516361619523, "grad_norm": 0.19741860032081604, "learning_rate": 1.808219405031808e-05, "loss": 0.5527, "step": 2609 }, { "epoch": 0.7237936772046589, "grad_norm": 0.18676535785198212, "learning_rate": 1.807725304031182e-05, "loss": 0.5539, "step": 2610 }, { "epoch": 0.7240709927897948, "grad_norm": 0.17912089824676514, "learning_rate": 1.807231094203573e-05, "loss": 0.5342, "step": 2611 }, { "epoch": 0.7243483083749307, "grad_norm": 0.18593581020832062, "learning_rate": 1.806736775645414e-05, "loss": 0.5366, "step": 2612 }, { "epoch": 0.7246256239600666, "grad_norm": 0.20467206835746765, "learning_rate": 1.8062423484531592e-05, "loss": 0.527, "step": 2613 }, { "epoch": 0.7249029395452025, "grad_norm": 0.18463392555713654, "learning_rate": 1.8057478127232854e-05, "loss": 0.545, "step": 2614 }, { "epoch": 0.7251802551303383, "grad_norm": 0.20011630654335022, "learning_rate": 1.805253168552289e-05, "loss": 0.5242, "step": 2615 }, { "epoch": 0.7254575707154742, "grad_norm": 0.18936677277088165, "learning_rate": 1.804758416036688e-05, "loss": 0.5285, "step": 2616 }, { "epoch": 0.7257348863006101, "grad_norm": 0.19428247213363647, "learning_rate": 1.804263555273022e-05, "loss": 0.5448, "step": 2617 }, { "epoch": 0.726012201885746, "grad_norm": 0.18848338723182678, "learning_rate": 1.8037685863578514e-05, "loss": 0.5673, "step": 2618 }, { "epoch": 0.7262895174708819, "grad_norm": 0.19170448184013367, "learning_rate": 1.803273509387758e-05, "loss": 0.5234, "step": 2619 }, { "epoch": 0.7265668330560178, "grad_norm": 0.18249256908893585, "learning_rate": 1.8027783244593443e-05, "loss": 0.5377, "step": 2620 }, { "epoch": 0.7268441486411537, "grad_norm": 0.18726296722888947, "learning_rate": 1.8022830316692336e-05, "loss": 0.5381, "step": 2621 }, { "epoch": 0.7271214642262895, "grad_norm": 0.1881718784570694, "learning_rate": 1.801787631114071e-05, "loss": 0.5453, "step": 2622 }, { "epoch": 0.7273987798114254, "grad_norm": 0.19469492137432098, "learning_rate": 1.8012921228905225e-05, "loss": 0.5231, "step": 2623 }, { "epoch": 0.7276760953965613, "grad_norm": 0.18475371599197388, "learning_rate": 1.8007965070952743e-05, "loss": 0.5323, "step": 2624 }, { "epoch": 0.7279534109816972, "grad_norm": 0.18544836342334747, "learning_rate": 1.8003007838250343e-05, "loss": 0.5345, "step": 2625 }, { "epoch": 0.7282307265668331, "grad_norm": 0.19587865471839905, "learning_rate": 1.799804953176532e-05, "loss": 0.5168, "step": 2626 }, { "epoch": 0.728508042151969, "grad_norm": 0.23171131312847137, "learning_rate": 1.7993090152465163e-05, "loss": 0.5235, "step": 2627 }, { "epoch": 0.7287853577371048, "grad_norm": 0.19391484558582306, "learning_rate": 1.7988129701317582e-05, "loss": 0.5345, "step": 2628 }, { "epoch": 0.7290626733222407, "grad_norm": 0.19040954113006592, "learning_rate": 1.7983168179290488e-05, "loss": 0.5226, "step": 2629 }, { "epoch": 0.7293399889073766, "grad_norm": 0.1814422607421875, "learning_rate": 1.797820558735201e-05, "loss": 0.5291, "step": 2630 }, { "epoch": 0.7296173044925125, "grad_norm": 0.19977906346321106, "learning_rate": 1.797324192647048e-05, "loss": 0.5641, "step": 2631 }, { "epoch": 0.7298946200776484, "grad_norm": 0.19043037295341492, "learning_rate": 1.796827719761444e-05, "loss": 0.5686, "step": 2632 }, { "epoch": 0.7301719356627843, "grad_norm": 0.19778837263584137, "learning_rate": 1.7963311401752638e-05, "loss": 0.5648, "step": 2633 }, { "epoch": 0.7304492512479202, "grad_norm": 0.45009469985961914, "learning_rate": 1.7958344539854034e-05, "loss": 0.5244, "step": 2634 }, { "epoch": 0.730726566833056, "grad_norm": 0.1936669498682022, "learning_rate": 1.7953376612887793e-05, "loss": 0.5296, "step": 2635 }, { "epoch": 0.7310038824181919, "grad_norm": 0.19970230758190155, "learning_rate": 1.7948407621823287e-05, "loss": 0.5832, "step": 2636 }, { "epoch": 0.7312811980033278, "grad_norm": 0.19142326712608337, "learning_rate": 1.794343756763011e-05, "loss": 0.5478, "step": 2637 }, { "epoch": 0.7315585135884637, "grad_norm": 0.18312163650989532, "learning_rate": 1.7938466451278034e-05, "loss": 0.5382, "step": 2638 }, { "epoch": 0.7318358291735996, "grad_norm": 0.19482283294200897, "learning_rate": 1.793349427373707e-05, "loss": 0.5392, "step": 2639 }, { "epoch": 0.7321131447587355, "grad_norm": 0.20726899802684784, "learning_rate": 1.7928521035977413e-05, "loss": 0.5597, "step": 2640 }, { "epoch": 0.7323904603438713, "grad_norm": 0.20082899928092957, "learning_rate": 1.7923546738969478e-05, "loss": 0.5293, "step": 2641 }, { "epoch": 0.7326677759290072, "grad_norm": 0.19428935647010803, "learning_rate": 1.791857138368388e-05, "loss": 0.5428, "step": 2642 }, { "epoch": 0.7329450915141431, "grad_norm": 0.19222451746463776, "learning_rate": 1.791359497109144e-05, "loss": 0.5417, "step": 2643 }, { "epoch": 0.733222407099279, "grad_norm": 0.1904270201921463, "learning_rate": 1.7908617502163188e-05, "loss": 0.5368, "step": 2644 }, { "epoch": 0.7334997226844149, "grad_norm": 0.1973213404417038, "learning_rate": 1.7903638977870372e-05, "loss": 0.5347, "step": 2645 }, { "epoch": 0.7337770382695508, "grad_norm": 0.1838080883026123, "learning_rate": 1.7898659399184415e-05, "loss": 0.5239, "step": 2646 }, { "epoch": 0.7340543538546866, "grad_norm": 0.18665340542793274, "learning_rate": 1.7893678767076982e-05, "loss": 0.5469, "step": 2647 }, { "epoch": 0.7343316694398225, "grad_norm": 0.18644295632839203, "learning_rate": 1.788869708251991e-05, "loss": 0.5423, "step": 2648 }, { "epoch": 0.7346089850249584, "grad_norm": 0.18912896513938904, "learning_rate": 1.788371434648528e-05, "loss": 0.5289, "step": 2649 }, { "epoch": 0.7348863006100943, "grad_norm": 0.1896572709083557, "learning_rate": 1.7878730559945327e-05, "loss": 0.544, "step": 2650 }, { "epoch": 0.7351636161952302, "grad_norm": 0.18456673622131348, "learning_rate": 1.7873745723872545e-05, "loss": 0.5437, "step": 2651 }, { "epoch": 0.7354409317803661, "grad_norm": 0.18309368193149567, "learning_rate": 1.7868759839239596e-05, "loss": 0.5079, "step": 2652 }, { "epoch": 0.735718247365502, "grad_norm": 0.18934939801692963, "learning_rate": 1.7863772907019356e-05, "loss": 0.5473, "step": 2653 }, { "epoch": 0.7359955629506378, "grad_norm": 0.19800709187984467, "learning_rate": 1.7858784928184916e-05, "loss": 0.5386, "step": 2654 }, { "epoch": 0.7362728785357737, "grad_norm": 0.20616304874420166, "learning_rate": 1.7853795903709556e-05, "loss": 0.5388, "step": 2655 }, { "epoch": 0.7365501941209096, "grad_norm": 0.18855001032352448, "learning_rate": 1.7848805834566768e-05, "loss": 0.5499, "step": 2656 }, { "epoch": 0.7368275097060455, "grad_norm": 0.18595397472381592, "learning_rate": 1.7843814721730244e-05, "loss": 0.5599, "step": 2657 }, { "epoch": 0.7371048252911814, "grad_norm": 0.1929646134376526, "learning_rate": 1.7838822566173894e-05, "loss": 0.5412, "step": 2658 }, { "epoch": 0.7373821408763173, "grad_norm": 0.19367007911205292, "learning_rate": 1.7833829368871808e-05, "loss": 0.5328, "step": 2659 }, { "epoch": 0.7376594564614531, "grad_norm": 0.1873459815979004, "learning_rate": 1.7828835130798296e-05, "loss": 0.5444, "step": 2660 }, { "epoch": 0.737936772046589, "grad_norm": 0.20193496346473694, "learning_rate": 1.7823839852927867e-05, "loss": 0.5287, "step": 2661 }, { "epoch": 0.7382140876317249, "grad_norm": 0.18771541118621826, "learning_rate": 1.7818843536235224e-05, "loss": 0.5404, "step": 2662 }, { "epoch": 0.7384914032168608, "grad_norm": 0.1829247921705246, "learning_rate": 1.781384618169529e-05, "loss": 0.5512, "step": 2663 }, { "epoch": 0.7387687188019967, "grad_norm": 0.2635699212551117, "learning_rate": 1.7808847790283183e-05, "loss": 0.5678, "step": 2664 }, { "epoch": 0.7390460343871326, "grad_norm": 0.1860508918762207, "learning_rate": 1.780384836297421e-05, "loss": 0.5348, "step": 2665 }, { "epoch": 0.7393233499722685, "grad_norm": 0.18465931713581085, "learning_rate": 1.7798847900743904e-05, "loss": 0.5043, "step": 2666 }, { "epoch": 0.7396006655574043, "grad_norm": 0.18881580233573914, "learning_rate": 1.779384640456798e-05, "loss": 0.5251, "step": 2667 }, { "epoch": 0.7398779811425402, "grad_norm": 0.18016085028648376, "learning_rate": 1.7788843875422367e-05, "loss": 0.5585, "step": 2668 }, { "epoch": 0.7401552967276761, "grad_norm": 0.19220809638500214, "learning_rate": 1.7783840314283183e-05, "loss": 0.5263, "step": 2669 }, { "epoch": 0.740432612312812, "grad_norm": 0.18954598903656006, "learning_rate": 1.7778835722126764e-05, "loss": 0.542, "step": 2670 }, { "epoch": 0.7407099278979479, "grad_norm": 0.18674500286579132, "learning_rate": 1.7773830099929635e-05, "loss": 0.5247, "step": 2671 }, { "epoch": 0.7409872434830838, "grad_norm": 0.18231706321239471, "learning_rate": 1.776882344866853e-05, "loss": 0.5115, "step": 2672 }, { "epoch": 0.7412645590682196, "grad_norm": 0.18846355378627777, "learning_rate": 1.776381576932037e-05, "loss": 0.5186, "step": 2673 }, { "epoch": 0.7415418746533555, "grad_norm": 0.18721552193164825, "learning_rate": 1.7758807062862292e-05, "loss": 0.5313, "step": 2674 }, { "epoch": 0.7418191902384914, "grad_norm": 0.191980242729187, "learning_rate": 1.775379733027163e-05, "loss": 0.5505, "step": 2675 }, { "epoch": 0.7420965058236273, "grad_norm": 0.18773001432418823, "learning_rate": 1.7748786572525907e-05, "loss": 0.5334, "step": 2676 }, { "epoch": 0.7423738214087632, "grad_norm": 0.19638672471046448, "learning_rate": 1.7743774790602864e-05, "loss": 0.5718, "step": 2677 }, { "epoch": 0.7426511369938991, "grad_norm": 0.19333893060684204, "learning_rate": 1.7738761985480425e-05, "loss": 0.5479, "step": 2678 }, { "epoch": 0.742928452579035, "grad_norm": 0.20004448294639587, "learning_rate": 1.7733748158136725e-05, "loss": 0.5331, "step": 2679 }, { "epoch": 0.7432057681641708, "grad_norm": 0.19896887242794037, "learning_rate": 1.7728733309550097e-05, "loss": 0.5484, "step": 2680 }, { "epoch": 0.7434830837493067, "grad_norm": 0.1829969435930252, "learning_rate": 1.7723717440699066e-05, "loss": 0.5459, "step": 2681 }, { "epoch": 0.7437603993344426, "grad_norm": 0.1870088130235672, "learning_rate": 1.771870055256236e-05, "loss": 0.5628, "step": 2682 }, { "epoch": 0.7440377149195785, "grad_norm": 0.18835529685020447, "learning_rate": 1.7713682646118914e-05, "loss": 0.5439, "step": 2683 }, { "epoch": 0.7443150305047144, "grad_norm": 0.20633459091186523, "learning_rate": 1.7708663722347845e-05, "loss": 0.5677, "step": 2684 }, { "epoch": 0.7445923460898503, "grad_norm": 0.18712858855724335, "learning_rate": 1.7703643782228488e-05, "loss": 0.5666, "step": 2685 }, { "epoch": 0.7448696616749861, "grad_norm": 0.18475639820098877, "learning_rate": 1.769862282674036e-05, "loss": 0.5548, "step": 2686 }, { "epoch": 0.745146977260122, "grad_norm": 0.19311586022377014, "learning_rate": 1.769360085686318e-05, "loss": 0.5428, "step": 2687 }, { "epoch": 0.7454242928452579, "grad_norm": 0.19635801017284393, "learning_rate": 1.7688577873576872e-05, "loss": 0.5228, "step": 2688 }, { "epoch": 0.7457016084303938, "grad_norm": 0.18878091871738434, "learning_rate": 1.7683553877861554e-05, "loss": 0.4995, "step": 2689 }, { "epoch": 0.7459789240155297, "grad_norm": 0.1772637963294983, "learning_rate": 1.7678528870697537e-05, "loss": 0.5241, "step": 2690 }, { "epoch": 0.7462562396006656, "grad_norm": 0.18778811395168304, "learning_rate": 1.7673502853065335e-05, "loss": 0.5247, "step": 2691 }, { "epoch": 0.7465335551858014, "grad_norm": 0.20334573090076447, "learning_rate": 1.7668475825945656e-05, "loss": 0.5369, "step": 2692 }, { "epoch": 0.7468108707709373, "grad_norm": 0.18709522485733032, "learning_rate": 1.766344779031941e-05, "loss": 0.5701, "step": 2693 }, { "epoch": 0.7470881863560732, "grad_norm": 0.18577025830745697, "learning_rate": 1.7658418747167694e-05, "loss": 0.5409, "step": 2694 }, { "epoch": 0.7473655019412091, "grad_norm": 0.19616863131523132, "learning_rate": 1.765338869747181e-05, "loss": 0.5533, "step": 2695 }, { "epoch": 0.747642817526345, "grad_norm": 0.20094148814678192, "learning_rate": 1.764835764221326e-05, "loss": 0.5252, "step": 2696 }, { "epoch": 0.7479201331114809, "grad_norm": 0.19073578715324402, "learning_rate": 1.7643325582373728e-05, "loss": 0.52, "step": 2697 }, { "epoch": 0.7481974486966168, "grad_norm": 0.19425810873508453, "learning_rate": 1.7638292518935103e-05, "loss": 0.5612, "step": 2698 }, { "epoch": 0.7484747642817526, "grad_norm": 0.1896180808544159, "learning_rate": 1.7633258452879475e-05, "loss": 0.5504, "step": 2699 }, { "epoch": 0.7487520798668885, "grad_norm": 0.19124028086662292, "learning_rate": 1.762822338518912e-05, "loss": 0.5335, "step": 2700 }, { "epoch": 0.7490293954520244, "grad_norm": 0.19054248929023743, "learning_rate": 1.762318731684651e-05, "loss": 0.552, "step": 2701 }, { "epoch": 0.7493067110371603, "grad_norm": 0.18848967552185059, "learning_rate": 1.761815024883432e-05, "loss": 0.557, "step": 2702 }, { "epoch": 0.7495840266222962, "grad_norm": 0.1953321397304535, "learning_rate": 1.7613112182135406e-05, "loss": 0.5916, "step": 2703 }, { "epoch": 0.7498613422074321, "grad_norm": 0.18123508989810944, "learning_rate": 1.7608073117732848e-05, "loss": 0.5643, "step": 2704 }, { "epoch": 0.7501386577925679, "grad_norm": 0.19151508808135986, "learning_rate": 1.760303305660988e-05, "loss": 0.5512, "step": 2705 }, { "epoch": 0.7504159733777038, "grad_norm": 0.18512828648090363, "learning_rate": 1.7597991999749967e-05, "loss": 0.5627, "step": 2706 }, { "epoch": 0.7506932889628397, "grad_norm": 0.20286305248737335, "learning_rate": 1.7592949948136737e-05, "loss": 0.568, "step": 2707 }, { "epoch": 0.7509706045479756, "grad_norm": 0.20783564448356628, "learning_rate": 1.758790690275405e-05, "loss": 0.5563, "step": 2708 }, { "epoch": 0.7512479201331115, "grad_norm": 0.18659134209156036, "learning_rate": 1.7582862864585913e-05, "loss": 0.5285, "step": 2709 }, { "epoch": 0.7515252357182474, "grad_norm": 0.17614908516407013, "learning_rate": 1.757781783461657e-05, "loss": 0.4952, "step": 2710 }, { "epoch": 0.7518025513033832, "grad_norm": 0.18984104692935944, "learning_rate": 1.757277181383043e-05, "loss": 0.5564, "step": 2711 }, { "epoch": 0.7520798668885191, "grad_norm": 0.19199056923389435, "learning_rate": 1.756772480321211e-05, "loss": 0.5815, "step": 2712 }, { "epoch": 0.752357182473655, "grad_norm": 0.5123929381370544, "learning_rate": 1.7562676803746414e-05, "loss": 0.537, "step": 2713 }, { "epoch": 0.7526344980587909, "grad_norm": 0.1854097694158554, "learning_rate": 1.7557627816418337e-05, "loss": 0.5275, "step": 2714 }, { "epoch": 0.7529118136439268, "grad_norm": 0.1879512071609497, "learning_rate": 1.755257784221308e-05, "loss": 0.5292, "step": 2715 }, { "epoch": 0.7531891292290627, "grad_norm": 0.18993008136749268, "learning_rate": 1.7547526882116014e-05, "loss": 0.5282, "step": 2716 }, { "epoch": 0.7534664448141986, "grad_norm": 0.18528947234153748, "learning_rate": 1.7542474937112725e-05, "loss": 0.5457, "step": 2717 }, { "epoch": 0.7537437603993344, "grad_norm": 0.1859760731458664, "learning_rate": 1.753742200818898e-05, "loss": 0.5448, "step": 2718 }, { "epoch": 0.7540210759844703, "grad_norm": 0.17713625729084015, "learning_rate": 1.753236809633073e-05, "loss": 0.5479, "step": 2719 }, { "epoch": 0.7542983915696062, "grad_norm": 0.17718899250030518, "learning_rate": 1.7527313202524144e-05, "loss": 0.5378, "step": 2720 }, { "epoch": 0.7545757071547421, "grad_norm": 0.19346462190151215, "learning_rate": 1.752225732775555e-05, "loss": 0.552, "step": 2721 }, { "epoch": 0.754853022739878, "grad_norm": 0.17717614769935608, "learning_rate": 1.7517200473011488e-05, "loss": 0.5348, "step": 2722 }, { "epoch": 0.7551303383250139, "grad_norm": 0.1940585970878601, "learning_rate": 1.751214263927869e-05, "loss": 0.5405, "step": 2723 }, { "epoch": 0.7554076539101497, "grad_norm": 0.19863441586494446, "learning_rate": 1.7507083827544065e-05, "loss": 0.5357, "step": 2724 }, { "epoch": 0.7556849694952856, "grad_norm": 0.18913887441158295, "learning_rate": 1.7502024038794727e-05, "loss": 0.5658, "step": 2725 }, { "epoch": 0.7559622850804215, "grad_norm": 0.18873843550682068, "learning_rate": 1.7496963274017975e-05, "loss": 0.5774, "step": 2726 }, { "epoch": 0.7562396006655574, "grad_norm": 0.1888992041349411, "learning_rate": 1.7491901534201295e-05, "loss": 0.5319, "step": 2727 }, { "epoch": 0.7565169162506933, "grad_norm": 0.18841078877449036, "learning_rate": 1.7486838820332362e-05, "loss": 0.5542, "step": 2728 }, { "epoch": 0.7567942318358292, "grad_norm": 0.19633720815181732, "learning_rate": 1.7481775133399057e-05, "loss": 0.5615, "step": 2729 }, { "epoch": 0.757071547420965, "grad_norm": 0.19098395109176636, "learning_rate": 1.7476710474389434e-05, "loss": 0.5642, "step": 2730 }, { "epoch": 0.7573488630061009, "grad_norm": 0.1895277500152588, "learning_rate": 1.747164484429174e-05, "loss": 0.557, "step": 2731 }, { "epoch": 0.7576261785912368, "grad_norm": 0.18626920878887177, "learning_rate": 1.7466578244094417e-05, "loss": 0.5314, "step": 2732 }, { "epoch": 0.7579034941763727, "grad_norm": 0.1883586198091507, "learning_rate": 1.746151067478609e-05, "loss": 0.5457, "step": 2733 }, { "epoch": 0.7581808097615086, "grad_norm": 0.18349739909172058, "learning_rate": 1.745644213735558e-05, "loss": 0.5467, "step": 2734 }, { "epoch": 0.7584581253466445, "grad_norm": 0.17938470840454102, "learning_rate": 1.7451372632791888e-05, "loss": 0.5444, "step": 2735 }, { "epoch": 0.7587354409317804, "grad_norm": 0.19910098612308502, "learning_rate": 1.7446302162084215e-05, "loss": 0.5777, "step": 2736 }, { "epoch": 0.7590127565169162, "grad_norm": 0.20055991411209106, "learning_rate": 1.7441230726221936e-05, "loss": 0.5546, "step": 2737 }, { "epoch": 0.7592900721020521, "grad_norm": 0.18521596491336823, "learning_rate": 1.743615832619463e-05, "loss": 0.5239, "step": 2738 }, { "epoch": 0.759567387687188, "grad_norm": 0.19067248702049255, "learning_rate": 1.7431084962992052e-05, "loss": 0.5288, "step": 2739 }, { "epoch": 0.7598447032723239, "grad_norm": 0.18803349137306213, "learning_rate": 1.7426010637604152e-05, "loss": 0.5397, "step": 2740 }, { "epoch": 0.7601220188574598, "grad_norm": 0.1846192330121994, "learning_rate": 1.7420935351021062e-05, "loss": 0.5379, "step": 2741 }, { "epoch": 0.7603993344425957, "grad_norm": 0.1797967106103897, "learning_rate": 1.7415859104233108e-05, "loss": 0.5528, "step": 2742 }, { "epoch": 0.7606766500277315, "grad_norm": 0.18609130382537842, "learning_rate": 1.7410781898230797e-05, "loss": 0.5473, "step": 2743 }, { "epoch": 0.7609539656128674, "grad_norm": 0.23831035196781158, "learning_rate": 1.7405703734004837e-05, "loss": 0.5114, "step": 2744 }, { "epoch": 0.7612312811980033, "grad_norm": 0.16559574007987976, "learning_rate": 1.74006246125461e-05, "loss": 0.501, "step": 2745 }, { "epoch": 0.7615085967831392, "grad_norm": 0.1880342662334442, "learning_rate": 1.7395544534845663e-05, "loss": 0.5344, "step": 2746 }, { "epoch": 0.7617859123682751, "grad_norm": 0.1892349123954773, "learning_rate": 1.7390463501894778e-05, "loss": 0.5745, "step": 2747 }, { "epoch": 0.762063227953411, "grad_norm": 0.2029358148574829, "learning_rate": 1.7385381514684896e-05, "loss": 0.5206, "step": 2748 }, { "epoch": 0.7623405435385469, "grad_norm": 0.2009795904159546, "learning_rate": 1.7380298574207645e-05, "loss": 0.5765, "step": 2749 }, { "epoch": 0.7626178591236827, "grad_norm": 0.18493135273456573, "learning_rate": 1.737521468145484e-05, "loss": 0.567, "step": 2750 }, { "epoch": 0.7628951747088186, "grad_norm": 0.2014453411102295, "learning_rate": 1.7370129837418487e-05, "loss": 0.564, "step": 2751 }, { "epoch": 0.7631724902939545, "grad_norm": 0.19111500680446625, "learning_rate": 1.7365044043090766e-05, "loss": 0.5589, "step": 2752 }, { "epoch": 0.7634498058790904, "grad_norm": 0.19902296364307404, "learning_rate": 1.7359957299464062e-05, "loss": 0.5543, "step": 2753 }, { "epoch": 0.7637271214642263, "grad_norm": 0.19618061184883118, "learning_rate": 1.7354869607530923e-05, "loss": 0.5639, "step": 2754 }, { "epoch": 0.7640044370493622, "grad_norm": 0.17767266929149628, "learning_rate": 1.7349780968284094e-05, "loss": 0.5041, "step": 2755 }, { "epoch": 0.764281752634498, "grad_norm": 0.19212745130062103, "learning_rate": 1.7344691382716508e-05, "loss": 0.5416, "step": 2756 }, { "epoch": 0.7645590682196339, "grad_norm": 0.19896987080574036, "learning_rate": 1.7339600851821274e-05, "loss": 0.5505, "step": 2757 }, { "epoch": 0.7648363838047698, "grad_norm": 0.1840539574623108, "learning_rate": 1.7334509376591695e-05, "loss": 0.5373, "step": 2758 }, { "epoch": 0.7651136993899057, "grad_norm": 0.20208869874477386, "learning_rate": 1.7329416958021247e-05, "loss": 0.5553, "step": 2759 }, { "epoch": 0.7653910149750416, "grad_norm": 0.18945381045341492, "learning_rate": 1.7324323597103597e-05, "loss": 0.536, "step": 2760 }, { "epoch": 0.7656683305601775, "grad_norm": 0.19363372027873993, "learning_rate": 1.7319229294832597e-05, "loss": 0.5607, "step": 2761 }, { "epoch": 0.7659456461453134, "grad_norm": 0.20634472370147705, "learning_rate": 1.7314134052202272e-05, "loss": 0.5451, "step": 2762 }, { "epoch": 0.7662229617304492, "grad_norm": 0.19053952395915985, "learning_rate": 1.730903787020685e-05, "loss": 0.5661, "step": 2763 }, { "epoch": 0.7665002773155851, "grad_norm": 0.18681201338768005, "learning_rate": 1.7303940749840726e-05, "loss": 0.5312, "step": 2764 }, { "epoch": 0.766777592900721, "grad_norm": 0.19541014730930328, "learning_rate": 1.7298842692098488e-05, "loss": 0.5366, "step": 2765 }, { "epoch": 0.7670549084858569, "grad_norm": 0.19399814307689667, "learning_rate": 1.729374369797489e-05, "loss": 0.547, "step": 2766 }, { "epoch": 0.7673322240709928, "grad_norm": 0.19590160250663757, "learning_rate": 1.7288643768464892e-05, "loss": 0.5286, "step": 2767 }, { "epoch": 0.7676095396561287, "grad_norm": 0.18889807164669037, "learning_rate": 1.7283542904563625e-05, "loss": 0.5357, "step": 2768 }, { "epoch": 0.7678868552412645, "grad_norm": 0.1905566155910492, "learning_rate": 1.7278441107266395e-05, "loss": 0.5663, "step": 2769 }, { "epoch": 0.7681641708264004, "grad_norm": 0.19881530106067657, "learning_rate": 1.7273338377568707e-05, "loss": 0.5829, "step": 2770 }, { "epoch": 0.7684414864115363, "grad_norm": 0.17877154052257538, "learning_rate": 1.726823471646623e-05, "loss": 0.5607, "step": 2771 }, { "epoch": 0.7687188019966722, "grad_norm": 0.1801327019929886, "learning_rate": 1.7263130124954832e-05, "loss": 0.5608, "step": 2772 }, { "epoch": 0.7689961175818081, "grad_norm": 0.1976090669631958, "learning_rate": 1.7258024604030547e-05, "loss": 0.4987, "step": 2773 }, { "epoch": 0.769273433166944, "grad_norm": 0.18582318723201752, "learning_rate": 1.72529181546896e-05, "loss": 0.5422, "step": 2774 }, { "epoch": 0.7695507487520798, "grad_norm": 0.1935378760099411, "learning_rate": 1.7247810777928396e-05, "loss": 0.5464, "step": 2775 }, { "epoch": 0.7698280643372157, "grad_norm": 0.187955841422081, "learning_rate": 1.7242702474743517e-05, "loss": 0.514, "step": 2776 }, { "epoch": 0.7701053799223516, "grad_norm": 0.18268531560897827, "learning_rate": 1.7237593246131735e-05, "loss": 0.517, "step": 2777 }, { "epoch": 0.7703826955074875, "grad_norm": 0.18831866979599, "learning_rate": 1.7232483093089986e-05, "loss": 0.579, "step": 2778 }, { "epoch": 0.7706600110926234, "grad_norm": 0.20355799794197083, "learning_rate": 1.7227372016615402e-05, "loss": 0.5656, "step": 2779 }, { "epoch": 0.7709373266777593, "grad_norm": 0.1870361566543579, "learning_rate": 1.7222260017705286e-05, "loss": 0.5528, "step": 2780 }, { "epoch": 0.7712146422628952, "grad_norm": 0.2032066434621811, "learning_rate": 1.7217147097357127e-05, "loss": 0.5684, "step": 2781 }, { "epoch": 0.771491957848031, "grad_norm": 0.18692703545093536, "learning_rate": 1.7212033256568595e-05, "loss": 0.5654, "step": 2782 }, { "epoch": 0.7717692734331669, "grad_norm": 0.1919548362493515, "learning_rate": 1.7206918496337525e-05, "loss": 0.5801, "step": 2783 }, { "epoch": 0.7720465890183028, "grad_norm": 0.19089831411838531, "learning_rate": 1.7201802817661955e-05, "loss": 0.5617, "step": 2784 }, { "epoch": 0.7723239046034387, "grad_norm": 0.19473430514335632, "learning_rate": 1.7196686221540077e-05, "loss": 0.5376, "step": 2785 }, { "epoch": 0.7726012201885746, "grad_norm": 0.1860806941986084, "learning_rate": 1.7191568708970286e-05, "loss": 0.5541, "step": 2786 }, { "epoch": 0.7728785357737105, "grad_norm": 0.18971897661685944, "learning_rate": 1.7186450280951137e-05, "loss": 0.5514, "step": 2787 }, { "epoch": 0.7731558513588463, "grad_norm": 0.18644990026950836, "learning_rate": 1.7181330938481375e-05, "loss": 0.5504, "step": 2788 }, { "epoch": 0.7734331669439822, "grad_norm": 0.18728910386562347, "learning_rate": 1.717621068255992e-05, "loss": 0.5528, "step": 2789 }, { "epoch": 0.7737104825291181, "grad_norm": 0.19061507284641266, "learning_rate": 1.7171089514185857e-05, "loss": 0.5356, "step": 2790 }, { "epoch": 0.773987798114254, "grad_norm": 0.18289197981357574, "learning_rate": 1.7165967434358483e-05, "loss": 0.5487, "step": 2791 }, { "epoch": 0.7742651136993899, "grad_norm": 0.19676977396011353, "learning_rate": 1.716084444407723e-05, "loss": 0.5625, "step": 2792 }, { "epoch": 0.7745424292845258, "grad_norm": 0.18007270991802216, "learning_rate": 1.7155720544341746e-05, "loss": 0.543, "step": 2793 }, { "epoch": 0.7748197448696617, "grad_norm": 0.19169475138187408, "learning_rate": 1.715059573615183e-05, "loss": 0.5378, "step": 2794 }, { "epoch": 0.7750970604547975, "grad_norm": 0.1876417100429535, "learning_rate": 1.714547002050747e-05, "loss": 0.5304, "step": 2795 }, { "epoch": 0.7753743760399334, "grad_norm": 0.18966078758239746, "learning_rate": 1.714034339840883e-05, "loss": 0.5302, "step": 2796 }, { "epoch": 0.7756516916250693, "grad_norm": 0.1796397715806961, "learning_rate": 1.7135215870856253e-05, "loss": 0.5388, "step": 2797 }, { "epoch": 0.7759290072102052, "grad_norm": 0.19350911676883698, "learning_rate": 1.7130087438850252e-05, "loss": 0.553, "step": 2798 }, { "epoch": 0.7762063227953411, "grad_norm": 0.19393737614154816, "learning_rate": 1.7124958103391516e-05, "loss": 0.514, "step": 2799 }, { "epoch": 0.776483638380477, "grad_norm": 0.2177378535270691, "learning_rate": 1.711982786548092e-05, "loss": 0.5187, "step": 2800 }, { "epoch": 0.7767609539656128, "grad_norm": 0.18803465366363525, "learning_rate": 1.7114696726119505e-05, "loss": 0.5661, "step": 2801 }, { "epoch": 0.7770382695507487, "grad_norm": 0.19588807225227356, "learning_rate": 1.7109564686308498e-05, "loss": 0.5463, "step": 2802 }, { "epoch": 0.7773155851358846, "grad_norm": 0.18581606447696686, "learning_rate": 1.710443174704929e-05, "loss": 0.548, "step": 2803 }, { "epoch": 0.7775929007210205, "grad_norm": 0.18562249839305878, "learning_rate": 1.7099297909343455e-05, "loss": 0.5313, "step": 2804 }, { "epoch": 0.7778702163061564, "grad_norm": 0.17982806265354156, "learning_rate": 1.7094163174192744e-05, "loss": 0.5499, "step": 2805 }, { "epoch": 0.7781475318912923, "grad_norm": 0.18412619829177856, "learning_rate": 1.708902754259908e-05, "loss": 0.5495, "step": 2806 }, { "epoch": 0.7784248474764282, "grad_norm": 0.19657637178897858, "learning_rate": 1.7083891015564555e-05, "loss": 0.5565, "step": 2807 }, { "epoch": 0.778702163061564, "grad_norm": 0.17935718595981598, "learning_rate": 1.7078753594091445e-05, "loss": 0.5446, "step": 2808 }, { "epoch": 0.7789794786466999, "grad_norm": 0.18647761642932892, "learning_rate": 1.7073615279182198e-05, "loss": 0.5416, "step": 2809 }, { "epoch": 0.7792567942318358, "grad_norm": 0.19295147061347961, "learning_rate": 1.7068476071839434e-05, "loss": 0.5498, "step": 2810 }, { "epoch": 0.7795341098169717, "grad_norm": 0.19305361807346344, "learning_rate": 1.706333597306595e-05, "loss": 0.5613, "step": 2811 }, { "epoch": 0.7798114254021076, "grad_norm": 0.1887744814157486, "learning_rate": 1.7058194983864715e-05, "loss": 0.5493, "step": 2812 }, { "epoch": 0.7800887409872435, "grad_norm": 0.18676388263702393, "learning_rate": 1.7053053105238866e-05, "loss": 0.527, "step": 2813 }, { "epoch": 0.7803660565723793, "grad_norm": 0.1888217329978943, "learning_rate": 1.7047910338191732e-05, "loss": 0.5543, "step": 2814 }, { "epoch": 0.7806433721575152, "grad_norm": 0.19341908395290375, "learning_rate": 1.7042766683726793e-05, "loss": 0.5188, "step": 2815 }, { "epoch": 0.7809206877426511, "grad_norm": 0.18204385042190552, "learning_rate": 1.7037622142847717e-05, "loss": 0.541, "step": 2816 }, { "epoch": 0.781198003327787, "grad_norm": 0.20170709490776062, "learning_rate": 1.7032476716558338e-05, "loss": 0.5925, "step": 2817 }, { "epoch": 0.7814753189129229, "grad_norm": 0.18066342175006866, "learning_rate": 1.7027330405862668e-05, "loss": 0.5537, "step": 2818 }, { "epoch": 0.7817526344980588, "grad_norm": 0.18021415174007416, "learning_rate": 1.7022183211764886e-05, "loss": 0.5098, "step": 2819 }, { "epoch": 0.7820299500831946, "grad_norm": 0.19226256012916565, "learning_rate": 1.7017035135269345e-05, "loss": 0.5626, "step": 2820 }, { "epoch": 0.7823072656683305, "grad_norm": 0.1880798488855362, "learning_rate": 1.7011886177380572e-05, "loss": 0.5601, "step": 2821 }, { "epoch": 0.7825845812534664, "grad_norm": 0.19588159024715424, "learning_rate": 1.7006736339103267e-05, "loss": 0.56, "step": 2822 }, { "epoch": 0.7828618968386023, "grad_norm": 0.21878387033939362, "learning_rate": 1.7001585621442295e-05, "loss": 0.5683, "step": 2823 }, { "epoch": 0.7831392124237382, "grad_norm": 0.18837064504623413, "learning_rate": 1.6996434025402706e-05, "loss": 0.5827, "step": 2824 }, { "epoch": 0.7834165280088741, "grad_norm": 0.1791253536939621, "learning_rate": 1.6991281551989704e-05, "loss": 0.5125, "step": 2825 }, { "epoch": 0.78369384359401, "grad_norm": 0.19195421040058136, "learning_rate": 1.698612820220868e-05, "loss": 0.5348, "step": 2826 }, { "epoch": 0.7839711591791458, "grad_norm": 0.1831100434064865, "learning_rate": 1.6980973977065185e-05, "loss": 0.5354, "step": 2827 }, { "epoch": 0.7842484747642817, "grad_norm": 0.19085463881492615, "learning_rate": 1.6975818877564945e-05, "loss": 0.5724, "step": 2828 }, { "epoch": 0.7845257903494176, "grad_norm": 0.18546035885810852, "learning_rate": 1.6970662904713857e-05, "loss": 0.5433, "step": 2829 }, { "epoch": 0.7848031059345535, "grad_norm": 0.17541223764419556, "learning_rate": 1.6965506059517988e-05, "loss": 0.5348, "step": 2830 }, { "epoch": 0.7850804215196894, "grad_norm": 0.18578267097473145, "learning_rate": 1.696034834298358e-05, "loss": 0.5355, "step": 2831 }, { "epoch": 0.7853577371048253, "grad_norm": 0.19077135622501373, "learning_rate": 1.6955189756117028e-05, "loss": 0.5387, "step": 2832 }, { "epoch": 0.7856350526899611, "grad_norm": 0.19363151490688324, "learning_rate": 1.6950030299924925e-05, "loss": 0.54, "step": 2833 }, { "epoch": 0.785912368275097, "grad_norm": 0.19038641452789307, "learning_rate": 1.6944869975414e-05, "loss": 0.5365, "step": 2834 }, { "epoch": 0.7861896838602329, "grad_norm": 0.18864920735359192, "learning_rate": 1.6939708783591184e-05, "loss": 0.5272, "step": 2835 }, { "epoch": 0.7864669994453688, "grad_norm": 0.195342555642128, "learning_rate": 1.6934546725463558e-05, "loss": 0.5459, "step": 2836 }, { "epoch": 0.7867443150305047, "grad_norm": 0.1781705915927887, "learning_rate": 1.6929383802038372e-05, "loss": 0.525, "step": 2837 }, { "epoch": 0.7870216306156406, "grad_norm": 0.18777886033058167, "learning_rate": 1.6924220014323054e-05, "loss": 0.5254, "step": 2838 }, { "epoch": 0.7872989462007765, "grad_norm": 0.19651533663272858, "learning_rate": 1.6919055363325193e-05, "loss": 0.5388, "step": 2839 }, { "epoch": 0.7875762617859123, "grad_norm": 0.19021400809288025, "learning_rate": 1.6913889850052546e-05, "loss": 0.5362, "step": 2840 }, { "epoch": 0.7878535773710482, "grad_norm": 0.18892168998718262, "learning_rate": 1.690872347551305e-05, "loss": 0.5446, "step": 2841 }, { "epoch": 0.7881308929561841, "grad_norm": 0.1848224401473999, "learning_rate": 1.6903556240714795e-05, "loss": 0.5353, "step": 2842 }, { "epoch": 0.78840820854132, "grad_norm": 0.19968490302562714, "learning_rate": 1.6898388146666046e-05, "loss": 0.5173, "step": 2843 }, { "epoch": 0.7886855241264559, "grad_norm": 0.19547419250011444, "learning_rate": 1.689321919437524e-05, "loss": 0.5421, "step": 2844 }, { "epoch": 0.7889628397115918, "grad_norm": 0.18620312213897705, "learning_rate": 1.688804938485097e-05, "loss": 0.5297, "step": 2845 }, { "epoch": 0.7892401552967276, "grad_norm": 0.18695352971553802, "learning_rate": 1.6882878719102007e-05, "loss": 0.5386, "step": 2846 }, { "epoch": 0.7895174708818635, "grad_norm": 0.18656794726848602, "learning_rate": 1.6877707198137285e-05, "loss": 0.557, "step": 2847 }, { "epoch": 0.7897947864669994, "grad_norm": 0.18816913664340973, "learning_rate": 1.6872534822965903e-05, "loss": 0.5638, "step": 2848 }, { "epoch": 0.7900721020521353, "grad_norm": 0.18742120265960693, "learning_rate": 1.6867361594597126e-05, "loss": 0.5482, "step": 2849 }, { "epoch": 0.7903494176372712, "grad_norm": 0.1863267421722412, "learning_rate": 1.6862187514040396e-05, "loss": 0.5471, "step": 2850 }, { "epoch": 0.7906267332224071, "grad_norm": 0.24633349478244781, "learning_rate": 1.6857012582305303e-05, "loss": 0.5251, "step": 2851 }, { "epoch": 0.790904048807543, "grad_norm": 0.1922018826007843, "learning_rate": 1.6851836800401624e-05, "loss": 0.5706, "step": 2852 }, { "epoch": 0.7911813643926788, "grad_norm": 0.18678666651248932, "learning_rate": 1.684666016933928e-05, "loss": 0.55, "step": 2853 }, { "epoch": 0.7914586799778147, "grad_norm": 0.1959305703639984, "learning_rate": 1.6841482690128376e-05, "loss": 0.5414, "step": 2854 }, { "epoch": 0.7917359955629506, "grad_norm": 0.19318552315235138, "learning_rate": 1.6836304363779178e-05, "loss": 0.5383, "step": 2855 }, { "epoch": 0.7920133111480865, "grad_norm": 0.18508724868297577, "learning_rate": 1.6831125191302104e-05, "loss": 0.5371, "step": 2856 }, { "epoch": 0.7922906267332224, "grad_norm": 0.18992586433887482, "learning_rate": 1.682594517370776e-05, "loss": 0.499, "step": 2857 }, { "epoch": 0.7925679423183583, "grad_norm": 0.19711445271968842, "learning_rate": 1.682076431200689e-05, "loss": 0.5777, "step": 2858 }, { "epoch": 0.7928452579034941, "grad_norm": 0.19418618083000183, "learning_rate": 1.6815582607210435e-05, "loss": 0.5459, "step": 2859 }, { "epoch": 0.79312257348863, "grad_norm": 0.19879227876663208, "learning_rate": 1.6810400060329472e-05, "loss": 0.5578, "step": 2860 }, { "epoch": 0.7933998890737659, "grad_norm": 0.1837853342294693, "learning_rate": 1.680521667237525e-05, "loss": 0.534, "step": 2861 }, { "epoch": 0.7936772046589018, "grad_norm": 0.18517963588237762, "learning_rate": 1.68000324443592e-05, "loss": 0.5368, "step": 2862 }, { "epoch": 0.7939545202440377, "grad_norm": 0.18853336572647095, "learning_rate": 1.6794847377292885e-05, "loss": 0.5728, "step": 2863 }, { "epoch": 0.7942318358291736, "grad_norm": 0.1775895059108734, "learning_rate": 1.678966147218806e-05, "loss": 0.5283, "step": 2864 }, { "epoch": 0.7945091514143094, "grad_norm": 0.18305779993534088, "learning_rate": 1.678447473005663e-05, "loss": 0.531, "step": 2865 }, { "epoch": 0.7947864669994453, "grad_norm": 0.19156721234321594, "learning_rate": 1.6779287151910665e-05, "loss": 0.5485, "step": 2866 }, { "epoch": 0.7950637825845812, "grad_norm": 0.18802639842033386, "learning_rate": 1.6774098738762398e-05, "loss": 0.548, "step": 2867 }, { "epoch": 0.7953410981697171, "grad_norm": 0.19210045039653778, "learning_rate": 1.6768909491624224e-05, "loss": 0.548, "step": 2868 }, { "epoch": 0.795618413754853, "grad_norm": 0.18298813700675964, "learning_rate": 1.6763719411508713e-05, "loss": 0.5385, "step": 2869 }, { "epoch": 0.7958957293399889, "grad_norm": 0.206680029630661, "learning_rate": 1.675852849942857e-05, "loss": 0.5443, "step": 2870 }, { "epoch": 0.7961730449251248, "grad_norm": 0.19752557575702667, "learning_rate": 1.67533367563967e-05, "loss": 0.541, "step": 2871 }, { "epoch": 0.7964503605102606, "grad_norm": 0.1768447309732437, "learning_rate": 1.674814418342613e-05, "loss": 0.5125, "step": 2872 }, { "epoch": 0.7967276760953965, "grad_norm": 0.16779829561710358, "learning_rate": 1.6742950781530086e-05, "loss": 0.5446, "step": 2873 }, { "epoch": 0.7970049916805324, "grad_norm": 0.18453587591648102, "learning_rate": 1.6737756551721924e-05, "loss": 0.538, "step": 2874 }, { "epoch": 0.7972823072656683, "grad_norm": 0.18118412792682648, "learning_rate": 1.673256149501518e-05, "loss": 0.538, "step": 2875 }, { "epoch": 0.7975596228508042, "grad_norm": 0.18454188108444214, "learning_rate": 1.672736561242355e-05, "loss": 0.5208, "step": 2876 }, { "epoch": 0.7978369384359401, "grad_norm": 0.18930627405643463, "learning_rate": 1.672216890496089e-05, "loss": 0.5545, "step": 2877 }, { "epoch": 0.7981142540210759, "grad_norm": 0.18117345869541168, "learning_rate": 1.6716971373641212e-05, "loss": 0.5444, "step": 2878 }, { "epoch": 0.7983915696062118, "grad_norm": 0.20891335606575012, "learning_rate": 1.671177301947869e-05, "loss": 0.5583, "step": 2879 }, { "epoch": 0.7986688851913477, "grad_norm": 0.18599353730678558, "learning_rate": 1.670657384348766e-05, "loss": 0.56, "step": 2880 }, { "epoch": 0.7989462007764836, "grad_norm": 0.23091299831867218, "learning_rate": 1.6701373846682626e-05, "loss": 0.5332, "step": 2881 }, { "epoch": 0.7992235163616195, "grad_norm": 0.18974825739860535, "learning_rate": 1.6696173030078242e-05, "loss": 0.56, "step": 2882 }, { "epoch": 0.7995008319467554, "grad_norm": 0.1794738471508026, "learning_rate": 1.6690971394689324e-05, "loss": 0.5671, "step": 2883 }, { "epoch": 0.7997781475318912, "grad_norm": 0.24422964453697205, "learning_rate": 1.6685768941530848e-05, "loss": 0.5261, "step": 2884 }, { "epoch": 0.8000554631170271, "grad_norm": 0.18428778648376465, "learning_rate": 1.6680565671617955e-05, "loss": 0.5315, "step": 2885 }, { "epoch": 0.800332778702163, "grad_norm": 0.18910729885101318, "learning_rate": 1.667536158596593e-05, "loss": 0.5447, "step": 2886 }, { "epoch": 0.8006100942872989, "grad_norm": 0.1889326423406601, "learning_rate": 1.667015668559024e-05, "loss": 0.5313, "step": 2887 }, { "epoch": 0.8008874098724348, "grad_norm": 0.19478273391723633, "learning_rate": 1.66649509715065e-05, "loss": 0.5358, "step": 2888 }, { "epoch": 0.8011647254575707, "grad_norm": 0.1966940313577652, "learning_rate": 1.6659744444730467e-05, "loss": 0.5367, "step": 2889 }, { "epoch": 0.8014420410427066, "grad_norm": 0.19594216346740723, "learning_rate": 1.665453710627809e-05, "loss": 0.5571, "step": 2890 }, { "epoch": 0.8017193566278424, "grad_norm": 0.19384315609931946, "learning_rate": 1.6649328957165448e-05, "loss": 0.5478, "step": 2891 }, { "epoch": 0.8019966722129783, "grad_norm": 0.18817134201526642, "learning_rate": 1.6644119998408795e-05, "loss": 0.5533, "step": 2892 }, { "epoch": 0.8022739877981142, "grad_norm": 0.19234129786491394, "learning_rate": 1.6638910231024528e-05, "loss": 0.55, "step": 2893 }, { "epoch": 0.8025513033832501, "grad_norm": 0.19165417551994324, "learning_rate": 1.6633699656029224e-05, "loss": 0.5372, "step": 2894 }, { "epoch": 0.802828618968386, "grad_norm": 0.20091277360916138, "learning_rate": 1.6628488274439592e-05, "loss": 0.5571, "step": 2895 }, { "epoch": 0.8031059345535219, "grad_norm": 0.1875421106815338, "learning_rate": 1.6623276087272517e-05, "loss": 0.5346, "step": 2896 }, { "epoch": 0.8033832501386577, "grad_norm": 0.24504978954792023, "learning_rate": 1.661806309554503e-05, "loss": 0.5277, "step": 2897 }, { "epoch": 0.8036605657237936, "grad_norm": 0.188764289021492, "learning_rate": 1.661284930027433e-05, "loss": 0.5354, "step": 2898 }, { "epoch": 0.8039378813089295, "grad_norm": 0.1898432970046997, "learning_rate": 1.6607634702477765e-05, "loss": 0.5349, "step": 2899 }, { "epoch": 0.8042151968940654, "grad_norm": 0.18189935386180878, "learning_rate": 1.6602419303172835e-05, "loss": 0.5146, "step": 2900 }, { "epoch": 0.8044925124792013, "grad_norm": 0.19001318514347076, "learning_rate": 1.659720310337721e-05, "loss": 0.5438, "step": 2901 }, { "epoch": 0.8047698280643372, "grad_norm": 0.19630925357341766, "learning_rate": 1.6591986104108706e-05, "loss": 0.5644, "step": 2902 }, { "epoch": 0.805047143649473, "grad_norm": 0.1834658980369568, "learning_rate": 1.65867683063853e-05, "loss": 0.5588, "step": 2903 }, { "epoch": 0.8053244592346089, "grad_norm": 0.198238343000412, "learning_rate": 1.658154971122512e-05, "loss": 0.5471, "step": 2904 }, { "epoch": 0.8056017748197448, "grad_norm": 0.18793722987174988, "learning_rate": 1.657633031964645e-05, "loss": 0.5416, "step": 2905 }, { "epoch": 0.8058790904048807, "grad_norm": 0.1919628083705902, "learning_rate": 1.657111013266774e-05, "loss": 0.5489, "step": 2906 }, { "epoch": 0.8061564059900166, "grad_norm": 0.18970000743865967, "learning_rate": 1.6565889151307576e-05, "loss": 0.5374, "step": 2907 }, { "epoch": 0.8064337215751525, "grad_norm": 0.19195859134197235, "learning_rate": 1.656066737658471e-05, "loss": 0.5696, "step": 2908 }, { "epoch": 0.8067110371602884, "grad_norm": 0.18715813755989075, "learning_rate": 1.6555444809518066e-05, "loss": 0.5365, "step": 2909 }, { "epoch": 0.8069883527454242, "grad_norm": 0.19223229587078094, "learning_rate": 1.6550221451126682e-05, "loss": 0.5148, "step": 2910 }, { "epoch": 0.8072656683305601, "grad_norm": 0.2136392742395401, "learning_rate": 1.6544997302429794e-05, "loss": 0.5269, "step": 2911 }, { "epoch": 0.807542983915696, "grad_norm": 0.19737182557582855, "learning_rate": 1.6539772364446755e-05, "loss": 0.5592, "step": 2912 }, { "epoch": 0.8078202995008319, "grad_norm": 0.20777487754821777, "learning_rate": 1.6534546638197098e-05, "loss": 0.5124, "step": 2913 }, { "epoch": 0.8080976150859678, "grad_norm": 0.1888158619403839, "learning_rate": 1.6529320124700495e-05, "loss": 0.5433, "step": 2914 }, { "epoch": 0.8083749306711037, "grad_norm": 0.24319934844970703, "learning_rate": 1.6524092824976787e-05, "loss": 0.5763, "step": 2915 }, { "epoch": 0.8086522462562395, "grad_norm": 0.19144728779792786, "learning_rate": 1.6518864740045947e-05, "loss": 0.5245, "step": 2916 }, { "epoch": 0.8089295618413754, "grad_norm": 0.19189846515655518, "learning_rate": 1.6513635870928122e-05, "loss": 0.5435, "step": 2917 }, { "epoch": 0.8092068774265113, "grad_norm": 0.24125546216964722, "learning_rate": 1.6508406218643597e-05, "loss": 0.5599, "step": 2918 }, { "epoch": 0.8094841930116472, "grad_norm": 0.2077561616897583, "learning_rate": 1.650317578421282e-05, "loss": 0.5466, "step": 2919 }, { "epoch": 0.8097615085967831, "grad_norm": 0.2178879827260971, "learning_rate": 1.6497944568656383e-05, "loss": 0.5306, "step": 2920 }, { "epoch": 0.810038824181919, "grad_norm": 0.24422143399715424, "learning_rate": 1.649271257299504e-05, "loss": 0.5388, "step": 2921 }, { "epoch": 0.8103161397670549, "grad_norm": 0.18929331004619598, "learning_rate": 1.6487479798249687e-05, "loss": 0.5236, "step": 2922 }, { "epoch": 0.8105934553521907, "grad_norm": 0.17720143496990204, "learning_rate": 1.648224624544138e-05, "loss": 0.5172, "step": 2923 }, { "epoch": 0.8108707709373266, "grad_norm": 0.18783891201019287, "learning_rate": 1.6477011915591325e-05, "loss": 0.5389, "step": 2924 }, { "epoch": 0.8111480865224625, "grad_norm": 0.19297097623348236, "learning_rate": 1.6471776809720873e-05, "loss": 0.543, "step": 2925 }, { "epoch": 0.8114254021075984, "grad_norm": 0.18489378690719604, "learning_rate": 1.6466540928851538e-05, "loss": 0.5512, "step": 2926 }, { "epoch": 0.8117027176927343, "grad_norm": 0.18271875381469727, "learning_rate": 1.6461304274004972e-05, "loss": 0.5417, "step": 2927 }, { "epoch": 0.8119800332778702, "grad_norm": 0.1847040057182312, "learning_rate": 1.6456066846202994e-05, "loss": 0.5387, "step": 2928 }, { "epoch": 0.812257348863006, "grad_norm": 0.20353619754314423, "learning_rate": 1.6450828646467555e-05, "loss": 0.5454, "step": 2929 }, { "epoch": 0.812534664448142, "grad_norm": 0.2704160809516907, "learning_rate": 1.644558967582078e-05, "loss": 0.5598, "step": 2930 }, { "epoch": 0.8128119800332779, "grad_norm": 0.2357567995786667, "learning_rate": 1.6440349935284917e-05, "loss": 0.5724, "step": 2931 }, { "epoch": 0.8130892956184138, "grad_norm": 0.18530336022377014, "learning_rate": 1.6435109425882385e-05, "loss": 0.5365, "step": 2932 }, { "epoch": 0.8133666112035497, "grad_norm": 0.18962644040584564, "learning_rate": 1.6429868148635745e-05, "loss": 0.5429, "step": 2933 }, { "epoch": 0.8136439267886856, "grad_norm": 0.1827327460050583, "learning_rate": 1.6424626104567708e-05, "loss": 0.5368, "step": 2934 }, { "epoch": 0.8139212423738215, "grad_norm": 0.19838617742061615, "learning_rate": 1.641938329470114e-05, "loss": 0.5366, "step": 2935 }, { "epoch": 0.8141985579589573, "grad_norm": 0.18842142820358276, "learning_rate": 1.6414139720059045e-05, "loss": 0.546, "step": 2936 }, { "epoch": 0.8144758735440932, "grad_norm": 0.1957085281610489, "learning_rate": 1.6408895381664594e-05, "loss": 0.5144, "step": 2937 }, { "epoch": 0.8147531891292291, "grad_norm": 0.18413354456424713, "learning_rate": 1.6403650280541087e-05, "loss": 0.5441, "step": 2938 }, { "epoch": 0.815030504714365, "grad_norm": 0.18572686612606049, "learning_rate": 1.6398404417711984e-05, "loss": 0.543, "step": 2939 }, { "epoch": 0.8153078202995009, "grad_norm": 0.18513022363185883, "learning_rate": 1.639315779420089e-05, "loss": 0.5416, "step": 2940 }, { "epoch": 0.8155851358846368, "grad_norm": 0.1878947764635086, "learning_rate": 1.6387910411031564e-05, "loss": 0.5487, "step": 2941 }, { "epoch": 0.8158624514697727, "grad_norm": 0.20760974287986755, "learning_rate": 1.6382662269227912e-05, "loss": 0.5861, "step": 2942 }, { "epoch": 0.8161397670549085, "grad_norm": 0.1904260218143463, "learning_rate": 1.637741336981398e-05, "loss": 0.549, "step": 2943 }, { "epoch": 0.8164170826400444, "grad_norm": 0.1811297982931137, "learning_rate": 1.637216371381397e-05, "loss": 0.5556, "step": 2944 }, { "epoch": 0.8166943982251803, "grad_norm": 0.18820282816886902, "learning_rate": 1.6366913302252228e-05, "loss": 0.5262, "step": 2945 }, { "epoch": 0.8169717138103162, "grad_norm": 0.18562357127666473, "learning_rate": 1.636166213615325e-05, "loss": 0.5353, "step": 2946 }, { "epoch": 0.8172490293954521, "grad_norm": 0.19570721685886383, "learning_rate": 1.6356410216541675e-05, "loss": 0.524, "step": 2947 }, { "epoch": 0.817526344980588, "grad_norm": 0.19673757255077362, "learning_rate": 1.635115754444229e-05, "loss": 0.5532, "step": 2948 }, { "epoch": 0.8178036605657238, "grad_norm": 0.1866602897644043, "learning_rate": 1.6345904120880045e-05, "loss": 0.5407, "step": 2949 }, { "epoch": 0.8180809761508597, "grad_norm": 0.1872847080230713, "learning_rate": 1.634064994688e-05, "loss": 0.522, "step": 2950 }, { "epoch": 0.8183582917359956, "grad_norm": 0.18448910117149353, "learning_rate": 1.63353950234674e-05, "loss": 0.5506, "step": 2951 }, { "epoch": 0.8186356073211315, "grad_norm": 0.19117306172847748, "learning_rate": 1.6330139351667607e-05, "loss": 0.538, "step": 2952 }, { "epoch": 0.8189129229062674, "grad_norm": 0.19007255136966705, "learning_rate": 1.6324882932506152e-05, "loss": 0.545, "step": 2953 }, { "epoch": 0.8191902384914033, "grad_norm": 0.1907234787940979, "learning_rate": 1.63196257670087e-05, "loss": 0.5216, "step": 2954 }, { "epoch": 0.8194675540765392, "grad_norm": 0.19023658335208893, "learning_rate": 1.6314367856201063e-05, "loss": 0.528, "step": 2955 }, { "epoch": 0.819744869661675, "grad_norm": 0.19689738750457764, "learning_rate": 1.6309109201109197e-05, "loss": 0.5579, "step": 2956 }, { "epoch": 0.8200221852468109, "grad_norm": 0.18936869502067566, "learning_rate": 1.63038498027592e-05, "loss": 0.5171, "step": 2957 }, { "epoch": 0.8202995008319468, "grad_norm": 0.18959283828735352, "learning_rate": 1.6298589662177334e-05, "loss": 0.5109, "step": 2958 }, { "epoch": 0.8205768164170827, "grad_norm": 0.18975764513015747, "learning_rate": 1.6293328780389976e-05, "loss": 0.5241, "step": 2959 }, { "epoch": 0.8208541320022186, "grad_norm": 0.1963840126991272, "learning_rate": 1.6288067158423676e-05, "loss": 0.5311, "step": 2960 }, { "epoch": 0.8211314475873545, "grad_norm": 0.1869080811738968, "learning_rate": 1.6282804797305107e-05, "loss": 0.5268, "step": 2961 }, { "epoch": 0.8214087631724903, "grad_norm": 0.19208255410194397, "learning_rate": 1.62775416980611e-05, "loss": 0.5561, "step": 2962 }, { "epoch": 0.8216860787576262, "grad_norm": 0.21065904200077057, "learning_rate": 1.6272277861718622e-05, "loss": 0.5612, "step": 2963 }, { "epoch": 0.8219633943427621, "grad_norm": 0.19388340413570404, "learning_rate": 1.626701328930479e-05, "loss": 0.5448, "step": 2964 }, { "epoch": 0.822240709927898, "grad_norm": 0.19354204833507538, "learning_rate": 1.626174798184686e-05, "loss": 0.5521, "step": 2965 }, { "epoch": 0.8225180255130339, "grad_norm": 0.1918734908103943, "learning_rate": 1.6256481940372235e-05, "loss": 0.5737, "step": 2966 }, { "epoch": 0.8227953410981698, "grad_norm": 0.19950683414936066, "learning_rate": 1.625121516590845e-05, "loss": 0.5251, "step": 2967 }, { "epoch": 0.8230726566833056, "grad_norm": 0.1884751170873642, "learning_rate": 1.624594765948321e-05, "loss": 0.5487, "step": 2968 }, { "epoch": 0.8233499722684415, "grad_norm": 0.20038394629955292, "learning_rate": 1.624067942212433e-05, "loss": 0.5366, "step": 2969 }, { "epoch": 0.8236272878535774, "grad_norm": 0.18730241060256958, "learning_rate": 1.6235410454859784e-05, "loss": 0.5574, "step": 2970 }, { "epoch": 0.8239046034387133, "grad_norm": 0.1982334703207016, "learning_rate": 1.6230140758717692e-05, "loss": 0.5472, "step": 2971 }, { "epoch": 0.8241819190238492, "grad_norm": 0.19432491064071655, "learning_rate": 1.6224870334726315e-05, "loss": 0.54, "step": 2972 }, { "epoch": 0.8244592346089851, "grad_norm": 0.1818057745695114, "learning_rate": 1.6219599183914038e-05, "loss": 0.5594, "step": 2973 }, { "epoch": 0.824736550194121, "grad_norm": 0.18658170104026794, "learning_rate": 1.6214327307309417e-05, "loss": 0.537, "step": 2974 }, { "epoch": 0.8250138657792568, "grad_norm": 0.19775459170341492, "learning_rate": 1.620905470594113e-05, "loss": 0.5423, "step": 2975 }, { "epoch": 0.8252911813643927, "grad_norm": 0.1877543330192566, "learning_rate": 1.6203781380837997e-05, "loss": 0.5337, "step": 2976 }, { "epoch": 0.8255684969495286, "grad_norm": 0.1873265951871872, "learning_rate": 1.619850733302899e-05, "loss": 0.5153, "step": 2977 }, { "epoch": 0.8258458125346645, "grad_norm": 0.1832212209701538, "learning_rate": 1.619323256354321e-05, "loss": 0.5291, "step": 2978 }, { "epoch": 0.8261231281198004, "grad_norm": 0.1939922571182251, "learning_rate": 1.6187957073409907e-05, "loss": 0.5428, "step": 2979 }, { "epoch": 0.8264004437049363, "grad_norm": 0.2593255639076233, "learning_rate": 1.6182680863658468e-05, "loss": 0.5448, "step": 2980 }, { "epoch": 0.8266777592900721, "grad_norm": 0.18556508421897888, "learning_rate": 1.6177403935318422e-05, "loss": 0.529, "step": 2981 }, { "epoch": 0.826955074875208, "grad_norm": 0.20643781125545502, "learning_rate": 1.6172126289419437e-05, "loss": 0.535, "step": 2982 }, { "epoch": 0.8272323904603439, "grad_norm": 0.192849263548851, "learning_rate": 1.6166847926991324e-05, "loss": 0.5457, "step": 2983 }, { "epoch": 0.8275097060454798, "grad_norm": 0.19120796024799347, "learning_rate": 1.616156884906403e-05, "loss": 0.542, "step": 2984 }, { "epoch": 0.8277870216306157, "grad_norm": 0.20270869135856628, "learning_rate": 1.615628905666764e-05, "loss": 0.5621, "step": 2985 }, { "epoch": 0.8280643372157516, "grad_norm": 0.20787782967090607, "learning_rate": 1.6151008550832377e-05, "loss": 0.5007, "step": 2986 }, { "epoch": 0.8283416528008875, "grad_norm": 0.23962002992630005, "learning_rate": 1.6145727332588626e-05, "loss": 0.5351, "step": 2987 }, { "epoch": 0.8286189683860233, "grad_norm": 0.19550423324108124, "learning_rate": 1.614044540296687e-05, "loss": 0.5599, "step": 2988 }, { "epoch": 0.8288962839711592, "grad_norm": 0.20781289041042328, "learning_rate": 1.6135162762997776e-05, "loss": 0.5381, "step": 2989 }, { "epoch": 0.8291735995562951, "grad_norm": 0.1908697932958603, "learning_rate": 1.61298794137121e-05, "loss": 0.5109, "step": 2990 }, { "epoch": 0.829450915141431, "grad_norm": 0.19248178601264954, "learning_rate": 1.6124595356140794e-05, "loss": 0.5061, "step": 2991 }, { "epoch": 0.8297282307265669, "grad_norm": 0.4288308918476105, "learning_rate": 1.611931059131489e-05, "loss": 0.5418, "step": 2992 }, { "epoch": 0.8300055463117028, "grad_norm": 0.2553388178348541, "learning_rate": 1.6114025120265604e-05, "loss": 0.5308, "step": 2993 }, { "epoch": 0.8302828618968386, "grad_norm": 0.19662028551101685, "learning_rate": 1.6108738944024265e-05, "loss": 0.5647, "step": 2994 }, { "epoch": 0.8305601774819745, "grad_norm": 0.1986682265996933, "learning_rate": 1.6103452063622343e-05, "loss": 0.5594, "step": 2995 }, { "epoch": 0.8308374930671104, "grad_norm": 0.18815158307552338, "learning_rate": 1.6098164480091454e-05, "loss": 0.5347, "step": 2996 }, { "epoch": 0.8311148086522463, "grad_norm": 0.19016186892986298, "learning_rate": 1.6092876194463343e-05, "loss": 0.5022, "step": 2997 }, { "epoch": 0.8313921242373822, "grad_norm": 0.19305314123630524, "learning_rate": 1.6087587207769897e-05, "loss": 0.5561, "step": 2998 }, { "epoch": 0.8316694398225181, "grad_norm": 0.19363488256931305, "learning_rate": 1.6082297521043134e-05, "loss": 0.5704, "step": 2999 }, { "epoch": 0.831946755407654, "grad_norm": 0.2677402198314667, "learning_rate": 1.6077007135315212e-05, "loss": 0.5709, "step": 3000 }, { "epoch": 0.8322240709927898, "grad_norm": 0.18514001369476318, "learning_rate": 1.6071716051618426e-05, "loss": 0.5328, "step": 3001 }, { "epoch": 0.8325013865779257, "grad_norm": 0.20008349418640137, "learning_rate": 1.606642427098521e-05, "loss": 0.5535, "step": 3002 }, { "epoch": 0.8327787021630616, "grad_norm": 0.20995013415813446, "learning_rate": 1.606113179444813e-05, "loss": 0.5541, "step": 3003 }, { "epoch": 0.8330560177481975, "grad_norm": 0.21254870295524597, "learning_rate": 1.6055838623039886e-05, "loss": 0.5221, "step": 3004 }, { "epoch": 0.8333333333333334, "grad_norm": 0.1829461008310318, "learning_rate": 1.6050544757793312e-05, "loss": 0.5445, "step": 3005 }, { "epoch": 0.8336106489184693, "grad_norm": 0.20135165750980377, "learning_rate": 1.604525019974139e-05, "loss": 0.5337, "step": 3006 }, { "epoch": 0.8338879645036051, "grad_norm": 0.1872393935918808, "learning_rate": 1.6039954949917218e-05, "loss": 0.5506, "step": 3007 }, { "epoch": 0.834165280088741, "grad_norm": 0.17982006072998047, "learning_rate": 1.6034659009354055e-05, "loss": 0.5252, "step": 3008 }, { "epoch": 0.8344425956738769, "grad_norm": 0.19605563580989838, "learning_rate": 1.6029362379085264e-05, "loss": 0.527, "step": 3009 }, { "epoch": 0.8347199112590128, "grad_norm": 0.20329341292381287, "learning_rate": 1.602406506014437e-05, "loss": 0.5729, "step": 3010 }, { "epoch": 0.8349972268441487, "grad_norm": 0.18132315576076508, "learning_rate": 1.6018767053565008e-05, "loss": 0.5391, "step": 3011 }, { "epoch": 0.8352745424292846, "grad_norm": 0.19777770340442657, "learning_rate": 1.6013468360380966e-05, "loss": 0.5366, "step": 3012 }, { "epoch": 0.8355518580144204, "grad_norm": 0.19389207661151886, "learning_rate": 1.6008168981626164e-05, "loss": 0.5569, "step": 3013 }, { "epoch": 0.8358291735995563, "grad_norm": 0.18683470785617828, "learning_rate": 1.6002868918334647e-05, "loss": 0.5496, "step": 3014 }, { "epoch": 0.8361064891846922, "grad_norm": 0.18466855585575104, "learning_rate": 1.5997568171540594e-05, "loss": 0.5547, "step": 3015 }, { "epoch": 0.8363838047698281, "grad_norm": 0.19307468831539154, "learning_rate": 1.5992266742278322e-05, "loss": 0.5518, "step": 3016 }, { "epoch": 0.836661120354964, "grad_norm": 0.19225814938545227, "learning_rate": 1.5986964631582287e-05, "loss": 0.548, "step": 3017 }, { "epoch": 0.8369384359400999, "grad_norm": 0.19282346963882446, "learning_rate": 1.5981661840487063e-05, "loss": 0.5594, "step": 3018 }, { "epoch": 0.8372157515252358, "grad_norm": 0.18779483437538147, "learning_rate": 1.5976358370027373e-05, "loss": 0.5191, "step": 3019 }, { "epoch": 0.8374930671103716, "grad_norm": 0.18765175342559814, "learning_rate": 1.597105422123806e-05, "loss": 0.5401, "step": 3020 }, { "epoch": 0.8377703826955075, "grad_norm": 0.1844259798526764, "learning_rate": 1.5965749395154107e-05, "loss": 0.5459, "step": 3021 }, { "epoch": 0.8380476982806434, "grad_norm": 0.18752428889274597, "learning_rate": 1.5960443892810617e-05, "loss": 0.5339, "step": 3022 }, { "epoch": 0.8383250138657793, "grad_norm": 0.19530834257602692, "learning_rate": 1.5955137715242847e-05, "loss": 0.5479, "step": 3023 }, { "epoch": 0.8386023294509152, "grad_norm": 0.1960788518190384, "learning_rate": 1.5949830863486166e-05, "loss": 0.5511, "step": 3024 }, { "epoch": 0.8388796450360511, "grad_norm": 0.2669583261013031, "learning_rate": 1.594452333857608e-05, "loss": 0.5637, "step": 3025 }, { "epoch": 0.8391569606211869, "grad_norm": 0.18217816948890686, "learning_rate": 1.5939215141548224e-05, "loss": 0.4905, "step": 3026 }, { "epoch": 0.8394342762063228, "grad_norm": 0.19247345626354218, "learning_rate": 1.5933906273438383e-05, "loss": 0.5234, "step": 3027 }, { "epoch": 0.8397115917914587, "grad_norm": 0.1867692619562149, "learning_rate": 1.592859673528244e-05, "loss": 0.5166, "step": 3028 }, { "epoch": 0.8399889073765946, "grad_norm": 0.18724261224269867, "learning_rate": 1.5923286528116446e-05, "loss": 0.5524, "step": 3029 }, { "epoch": 0.8402662229617305, "grad_norm": 0.18874408304691315, "learning_rate": 1.5917975652976544e-05, "loss": 0.5283, "step": 3030 }, { "epoch": 0.8405435385468664, "grad_norm": 0.2242184728384018, "learning_rate": 1.5912664110899038e-05, "loss": 0.532, "step": 3031 }, { "epoch": 0.8408208541320022, "grad_norm": 0.20637090504169464, "learning_rate": 1.5907351902920346e-05, "loss": 0.5619, "step": 3032 }, { "epoch": 0.8410981697171381, "grad_norm": 0.20153935253620148, "learning_rate": 1.590203903007702e-05, "loss": 0.5704, "step": 3033 }, { "epoch": 0.841375485302274, "grad_norm": 0.1838448941707611, "learning_rate": 1.5896725493405746e-05, "loss": 0.5533, "step": 3034 }, { "epoch": 0.8416528008874099, "grad_norm": 0.187151700258255, "learning_rate": 1.589141129394333e-05, "loss": 0.5276, "step": 3035 }, { "epoch": 0.8419301164725458, "grad_norm": 0.19188909232616425, "learning_rate": 1.5886096432726723e-05, "loss": 0.5588, "step": 3036 }, { "epoch": 0.8422074320576817, "grad_norm": 0.18532563745975494, "learning_rate": 1.5880780910792984e-05, "loss": 0.5363, "step": 3037 }, { "epoch": 0.8424847476428176, "grad_norm": 0.18388155102729797, "learning_rate": 1.587546472917932e-05, "loss": 0.5544, "step": 3038 }, { "epoch": 0.8427620632279534, "grad_norm": 0.18780486285686493, "learning_rate": 1.5870147888923054e-05, "loss": 0.5501, "step": 3039 }, { "epoch": 0.8430393788130893, "grad_norm": 0.18718671798706055, "learning_rate": 1.5864830391061644e-05, "loss": 0.5491, "step": 3040 }, { "epoch": 0.8433166943982252, "grad_norm": 0.1841050237417221, "learning_rate": 1.585951223663268e-05, "loss": 0.539, "step": 3041 }, { "epoch": 0.8435940099833611, "grad_norm": 0.17960785329341888, "learning_rate": 1.5854193426673862e-05, "loss": 0.5134, "step": 3042 }, { "epoch": 0.843871325568497, "grad_norm": 0.1937013566493988, "learning_rate": 1.5848873962223044e-05, "loss": 0.5083, "step": 3043 }, { "epoch": 0.8441486411536329, "grad_norm": 0.18566550314426422, "learning_rate": 1.5843553844318193e-05, "loss": 0.5569, "step": 3044 }, { "epoch": 0.8444259567387687, "grad_norm": 0.1879187971353531, "learning_rate": 1.5838233073997395e-05, "loss": 0.5611, "step": 3045 }, { "epoch": 0.8447032723239046, "grad_norm": 0.1857539415359497, "learning_rate": 1.5832911652298882e-05, "loss": 0.5464, "step": 3046 }, { "epoch": 0.8449805879090405, "grad_norm": 0.1914960891008377, "learning_rate": 1.5827589580261e-05, "loss": 0.5547, "step": 3047 }, { "epoch": 0.8452579034941764, "grad_norm": 0.18924832344055176, "learning_rate": 1.582226685892223e-05, "loss": 0.5739, "step": 3048 }, { "epoch": 0.8455352190793123, "grad_norm": 0.18020758032798767, "learning_rate": 1.5816943489321174e-05, "loss": 0.5355, "step": 3049 }, { "epoch": 0.8458125346644482, "grad_norm": 0.22242438793182373, "learning_rate": 1.5811619472496562e-05, "loss": 0.5551, "step": 3050 }, { "epoch": 0.846089850249584, "grad_norm": 0.18375588953495026, "learning_rate": 1.5806294809487248e-05, "loss": 0.5393, "step": 3051 }, { "epoch": 0.8463671658347199, "grad_norm": 0.20009155571460724, "learning_rate": 1.5800969501332223e-05, "loss": 0.5555, "step": 3052 }, { "epoch": 0.8466444814198558, "grad_norm": 0.1867419183254242, "learning_rate": 1.5795643549070588e-05, "loss": 0.5541, "step": 3053 }, { "epoch": 0.8469217970049917, "grad_norm": 0.20315201580524445, "learning_rate": 1.5790316953741583e-05, "loss": 0.5302, "step": 3054 }, { "epoch": 0.8471991125901276, "grad_norm": 0.19646741449832916, "learning_rate": 1.578498971638456e-05, "loss": 0.5491, "step": 3055 }, { "epoch": 0.8474764281752635, "grad_norm": 0.1929592341184616, "learning_rate": 1.5779661838039013e-05, "loss": 0.554, "step": 3056 }, { "epoch": 0.8477537437603994, "grad_norm": 0.18516919016838074, "learning_rate": 1.577433331974455e-05, "loss": 0.5474, "step": 3057 }, { "epoch": 0.8480310593455352, "grad_norm": 0.17926859855651855, "learning_rate": 1.57690041625409e-05, "loss": 0.541, "step": 3058 }, { "epoch": 0.8483083749306711, "grad_norm": 0.19980867207050323, "learning_rate": 1.576367436746793e-05, "loss": 0.581, "step": 3059 }, { "epoch": 0.848585690515807, "grad_norm": 0.20209050178527832, "learning_rate": 1.575834393556562e-05, "loss": 0.5749, "step": 3060 }, { "epoch": 0.8488630061009429, "grad_norm": 0.19711333513259888, "learning_rate": 1.575301286787408e-05, "loss": 0.5537, "step": 3061 }, { "epoch": 0.8491403216860788, "grad_norm": 0.185336172580719, "learning_rate": 1.5747681165433544e-05, "loss": 0.5378, "step": 3062 }, { "epoch": 0.8494176372712147, "grad_norm": 0.2029864341020584, "learning_rate": 1.5742348829284366e-05, "loss": 0.5489, "step": 3063 }, { "epoch": 0.8496949528563505, "grad_norm": 0.18520487844944, "learning_rate": 1.5737015860467032e-05, "loss": 0.5589, "step": 3064 }, { "epoch": 0.8499722684414864, "grad_norm": 0.18894660472869873, "learning_rate": 1.573168226002213e-05, "loss": 0.5486, "step": 3065 }, { "epoch": 0.8502495840266223, "grad_norm": 0.2093021273612976, "learning_rate": 1.5726348028990404e-05, "loss": 0.528, "step": 3066 }, { "epoch": 0.8505268996117582, "grad_norm": 0.18530364334583282, "learning_rate": 1.5721013168412698e-05, "loss": 0.5244, "step": 3067 }, { "epoch": 0.8508042151968941, "grad_norm": 0.18845012784004211, "learning_rate": 1.5715677679329978e-05, "loss": 0.5211, "step": 3068 }, { "epoch": 0.85108153078203, "grad_norm": 0.20350565016269684, "learning_rate": 1.571034156278335e-05, "loss": 0.5711, "step": 3069 }, { "epoch": 0.8513588463671659, "grad_norm": 0.18500900268554688, "learning_rate": 1.5705004819814025e-05, "loss": 0.5116, "step": 3070 }, { "epoch": 0.8516361619523017, "grad_norm": 0.18262405693531036, "learning_rate": 1.5699667451463344e-05, "loss": 0.5485, "step": 3071 }, { "epoch": 0.8519134775374376, "grad_norm": 0.19544097781181335, "learning_rate": 1.5694329458772776e-05, "loss": 0.5465, "step": 3072 }, { "epoch": 0.8521907931225735, "grad_norm": 0.18556345999240875, "learning_rate": 1.5688990842783892e-05, "loss": 0.5409, "step": 3073 }, { "epoch": 0.8524681087077094, "grad_norm": 0.23834922909736633, "learning_rate": 1.5683651604538405e-05, "loss": 0.5192, "step": 3074 }, { "epoch": 0.8527454242928453, "grad_norm": 0.20461677014827728, "learning_rate": 1.5678311745078138e-05, "loss": 0.5674, "step": 3075 }, { "epoch": 0.8530227398779812, "grad_norm": 0.19039872288703918, "learning_rate": 1.5672971265445046e-05, "loss": 0.5433, "step": 3076 }, { "epoch": 0.853300055463117, "grad_norm": 0.19499389827251434, "learning_rate": 1.566763016668119e-05, "loss": 0.5577, "step": 3077 }, { "epoch": 0.8535773710482529, "grad_norm": 0.185109481215477, "learning_rate": 1.5662288449828767e-05, "loss": 0.5192, "step": 3078 }, { "epoch": 0.8538546866333888, "grad_norm": 0.2005089968442917, "learning_rate": 1.5656946115930084e-05, "loss": 0.5423, "step": 3079 }, { "epoch": 0.8541320022185247, "grad_norm": 0.19989848136901855, "learning_rate": 1.5651603166027574e-05, "loss": 0.5307, "step": 3080 }, { "epoch": 0.8544093178036606, "grad_norm": 0.18417179584503174, "learning_rate": 1.5646259601163783e-05, "loss": 0.5479, "step": 3081 }, { "epoch": 0.8546866333887965, "grad_norm": 0.19350256025791168, "learning_rate": 1.5640915422381387e-05, "loss": 0.547, "step": 3082 }, { "epoch": 0.8549639489739324, "grad_norm": 0.19599801301956177, "learning_rate": 1.5635570630723173e-05, "loss": 0.572, "step": 3083 }, { "epoch": 0.8552412645590682, "grad_norm": 0.18424679338932037, "learning_rate": 1.5630225227232055e-05, "loss": 0.5421, "step": 3084 }, { "epoch": 0.8555185801442041, "grad_norm": 0.18912889063358307, "learning_rate": 1.562487921295106e-05, "loss": 0.5503, "step": 3085 }, { "epoch": 0.85579589572934, "grad_norm": 0.1971350610256195, "learning_rate": 1.561953258892334e-05, "loss": 0.5493, "step": 3086 }, { "epoch": 0.8560732113144759, "grad_norm": 0.19339221715927124, "learning_rate": 1.5614185356192156e-05, "loss": 0.5249, "step": 3087 }, { "epoch": 0.8563505268996118, "grad_norm": 0.22969551384449005, "learning_rate": 1.5608837515800906e-05, "loss": 0.5398, "step": 3088 }, { "epoch": 0.8566278424847477, "grad_norm": 0.20485931634902954, "learning_rate": 1.560348906879309e-05, "loss": 0.5455, "step": 3089 }, { "epoch": 0.8569051580698835, "grad_norm": 0.2660558521747589, "learning_rate": 1.5598140016212328e-05, "loss": 0.5636, "step": 3090 }, { "epoch": 0.8571824736550194, "grad_norm": 0.19878970086574554, "learning_rate": 1.559279035910237e-05, "loss": 0.5513, "step": 3091 }, { "epoch": 0.8574597892401553, "grad_norm": 0.19739742577075958, "learning_rate": 1.5587440098507067e-05, "loss": 0.5521, "step": 3092 }, { "epoch": 0.8577371048252912, "grad_norm": 0.19214226305484772, "learning_rate": 1.55820892354704e-05, "loss": 0.5721, "step": 3093 }, { "epoch": 0.8580144204104271, "grad_norm": 0.19705668091773987, "learning_rate": 1.5576737771036464e-05, "loss": 0.555, "step": 3094 }, { "epoch": 0.858291735995563, "grad_norm": 0.17740927636623383, "learning_rate": 1.557138570624948e-05, "loss": 0.5478, "step": 3095 }, { "epoch": 0.8585690515806988, "grad_norm": 0.19050319492816925, "learning_rate": 1.556603304215376e-05, "loss": 0.5511, "step": 3096 }, { "epoch": 0.8588463671658347, "grad_norm": 0.1907372921705246, "learning_rate": 1.556067977979377e-05, "loss": 0.543, "step": 3097 }, { "epoch": 0.8591236827509706, "grad_norm": 0.18947333097457886, "learning_rate": 1.5555325920214055e-05, "loss": 0.56, "step": 3098 }, { "epoch": 0.8594009983361065, "grad_norm": 0.19369062781333923, "learning_rate": 1.5549971464459308e-05, "loss": 0.5506, "step": 3099 }, { "epoch": 0.8596783139212424, "grad_norm": 0.18818899989128113, "learning_rate": 1.554461641357432e-05, "loss": 0.5434, "step": 3100 }, { "epoch": 0.8599556295063783, "grad_norm": 0.19132985174655914, "learning_rate": 1.5539260768604e-05, "loss": 0.5378, "step": 3101 }, { "epoch": 0.8602329450915142, "grad_norm": 0.20558297634124756, "learning_rate": 1.5533904530593386e-05, "loss": 0.5428, "step": 3102 }, { "epoch": 0.86051026067665, "grad_norm": 0.19788722693920135, "learning_rate": 1.5528547700587616e-05, "loss": 0.5417, "step": 3103 }, { "epoch": 0.8607875762617859, "grad_norm": 0.20902171730995178, "learning_rate": 1.552319027963195e-05, "loss": 0.5426, "step": 3104 }, { "epoch": 0.8610648918469218, "grad_norm": 0.18526656925678253, "learning_rate": 1.5517832268771764e-05, "loss": 0.5342, "step": 3105 }, { "epoch": 0.8613422074320577, "grad_norm": 0.1860983669757843, "learning_rate": 1.551247366905254e-05, "loss": 0.5466, "step": 3106 }, { "epoch": 0.8616195230171936, "grad_norm": 0.18875092267990112, "learning_rate": 1.5507114481519895e-05, "loss": 0.5417, "step": 3107 }, { "epoch": 0.8618968386023295, "grad_norm": 0.1913326233625412, "learning_rate": 1.5501754707219536e-05, "loss": 0.5589, "step": 3108 }, { "epoch": 0.8621741541874653, "grad_norm": 0.19133377075195312, "learning_rate": 1.549639434719731e-05, "loss": 0.5345, "step": 3109 }, { "epoch": 0.8624514697726012, "grad_norm": 0.18826799094676971, "learning_rate": 1.549103340249916e-05, "loss": 0.527, "step": 3110 }, { "epoch": 0.8627287853577371, "grad_norm": 0.18173575401306152, "learning_rate": 1.548567187417114e-05, "loss": 0.5431, "step": 3111 }, { "epoch": 0.863006100942873, "grad_norm": 0.19065049290657043, "learning_rate": 1.548030976325944e-05, "loss": 0.5147, "step": 3112 }, { "epoch": 0.8632834165280089, "grad_norm": 0.2025771290063858, "learning_rate": 1.547494707081034e-05, "loss": 0.5635, "step": 3113 }, { "epoch": 0.8635607321131448, "grad_norm": 0.18411415815353394, "learning_rate": 1.546958379787025e-05, "loss": 0.5246, "step": 3114 }, { "epoch": 0.8638380476982807, "grad_norm": 0.19902820885181427, "learning_rate": 1.546421994548568e-05, "loss": 0.5564, "step": 3115 }, { "epoch": 0.8641153632834165, "grad_norm": 0.18202657997608185, "learning_rate": 1.5458855514703266e-05, "loss": 0.5364, "step": 3116 }, { "epoch": 0.8643926788685524, "grad_norm": 0.19231395423412323, "learning_rate": 1.545349050656974e-05, "loss": 0.5521, "step": 3117 }, { "epoch": 0.8646699944536883, "grad_norm": 0.18620963394641876, "learning_rate": 1.5448124922131974e-05, "loss": 0.5308, "step": 3118 }, { "epoch": 0.8649473100388242, "grad_norm": 0.18969541788101196, "learning_rate": 1.5442758762436923e-05, "loss": 0.5446, "step": 3119 }, { "epoch": 0.8652246256239601, "grad_norm": 0.19268915057182312, "learning_rate": 1.543739202853167e-05, "loss": 0.531, "step": 3120 }, { "epoch": 0.865501941209096, "grad_norm": 0.4672553837299347, "learning_rate": 1.5432024721463413e-05, "loss": 0.5564, "step": 3121 }, { "epoch": 0.8657792567942318, "grad_norm": 0.19262655079364777, "learning_rate": 1.5426656842279445e-05, "loss": 0.558, "step": 3122 }, { "epoch": 0.8660565723793677, "grad_norm": 0.20529362559318542, "learning_rate": 1.5421288392027185e-05, "loss": 0.5247, "step": 3123 }, { "epoch": 0.8663338879645036, "grad_norm": 0.19508616626262665, "learning_rate": 1.5415919371754166e-05, "loss": 0.5307, "step": 3124 }, { "epoch": 0.8666112035496395, "grad_norm": 0.18688172101974487, "learning_rate": 1.541054978250802e-05, "loss": 0.5422, "step": 3125 }, { "epoch": 0.8668885191347754, "grad_norm": 0.19961917400360107, "learning_rate": 1.5405179625336495e-05, "loss": 0.5598, "step": 3126 }, { "epoch": 0.8671658347199113, "grad_norm": 0.18617486953735352, "learning_rate": 1.5399808901287457e-05, "loss": 0.5221, "step": 3127 }, { "epoch": 0.8674431503050472, "grad_norm": 0.1895580142736435, "learning_rate": 1.5394437611408873e-05, "loss": 0.5515, "step": 3128 }, { "epoch": 0.867720465890183, "grad_norm": 0.19370147585868835, "learning_rate": 1.5389065756748826e-05, "loss": 0.5312, "step": 3129 }, { "epoch": 0.8679977814753189, "grad_norm": 0.19625920057296753, "learning_rate": 1.5383693338355504e-05, "loss": 0.526, "step": 3130 }, { "epoch": 0.8682750970604548, "grad_norm": 0.1878737509250641, "learning_rate": 1.537832035727721e-05, "loss": 0.5229, "step": 3131 }, { "epoch": 0.8685524126455907, "grad_norm": 0.19030463695526123, "learning_rate": 1.537294681456235e-05, "loss": 0.5197, "step": 3132 }, { "epoch": 0.8688297282307266, "grad_norm": 0.19420188665390015, "learning_rate": 1.536757271125946e-05, "loss": 0.5369, "step": 3133 }, { "epoch": 0.8691070438158625, "grad_norm": 0.2041894942522049, "learning_rate": 1.5362198048417147e-05, "loss": 0.5315, "step": 3134 }, { "epoch": 0.8693843594009983, "grad_norm": 0.1823713183403015, "learning_rate": 1.535682282708417e-05, "loss": 0.5358, "step": 3135 }, { "epoch": 0.8696616749861342, "grad_norm": 0.18803556263446808, "learning_rate": 1.5351447048309367e-05, "loss": 0.5234, "step": 3136 }, { "epoch": 0.8699389905712701, "grad_norm": 0.20315398275852203, "learning_rate": 1.5346070713141697e-05, "loss": 0.5523, "step": 3137 }, { "epoch": 0.870216306156406, "grad_norm": 0.18089507520198822, "learning_rate": 1.5340693822630224e-05, "loss": 0.5425, "step": 3138 }, { "epoch": 0.8704936217415419, "grad_norm": 0.19417926669120789, "learning_rate": 1.5335316377824127e-05, "loss": 0.5329, "step": 3139 }, { "epoch": 0.8707709373266778, "grad_norm": 0.19868281483650208, "learning_rate": 1.5329938379772685e-05, "loss": 0.5564, "step": 3140 }, { "epoch": 0.8710482529118136, "grad_norm": 0.186373770236969, "learning_rate": 1.5324559829525285e-05, "loss": 0.5498, "step": 3141 }, { "epoch": 0.8713255684969495, "grad_norm": 0.21897022426128387, "learning_rate": 1.531918072813143e-05, "loss": 0.5508, "step": 3142 }, { "epoch": 0.8716028840820854, "grad_norm": 0.19098646938800812, "learning_rate": 1.5313801076640715e-05, "loss": 0.5481, "step": 3143 }, { "epoch": 0.8718801996672213, "grad_norm": 0.1954183280467987, "learning_rate": 1.5308420876102863e-05, "loss": 0.5531, "step": 3144 }, { "epoch": 0.8721575152523572, "grad_norm": 0.18958479166030884, "learning_rate": 1.5303040127567694e-05, "loss": 0.5437, "step": 3145 }, { "epoch": 0.8724348308374931, "grad_norm": 0.17530956864356995, "learning_rate": 1.5297658832085126e-05, "loss": 0.5216, "step": 3146 }, { "epoch": 0.872712146422629, "grad_norm": 0.19810372591018677, "learning_rate": 1.5292276990705202e-05, "loss": 0.5623, "step": 3147 }, { "epoch": 0.8729894620077648, "grad_norm": 0.19809319078922272, "learning_rate": 1.5286894604478054e-05, "loss": 0.5226, "step": 3148 }, { "epoch": 0.8732667775929007, "grad_norm": 0.20191790163516998, "learning_rate": 1.528151167445393e-05, "loss": 0.5612, "step": 3149 }, { "epoch": 0.8735440931780366, "grad_norm": 0.18988439440727234, "learning_rate": 1.5276128201683187e-05, "loss": 0.5362, "step": 3150 }, { "epoch": 0.8738214087631725, "grad_norm": 0.181321382522583, "learning_rate": 1.5270744187216277e-05, "loss": 0.5082, "step": 3151 }, { "epoch": 0.8740987243483084, "grad_norm": 0.18067540228366852, "learning_rate": 1.526535963210377e-05, "loss": 0.529, "step": 3152 }, { "epoch": 0.8743760399334443, "grad_norm": 0.19349637627601624, "learning_rate": 1.5259974537396325e-05, "loss": 0.5495, "step": 3153 }, { "epoch": 0.8746533555185801, "grad_norm": 0.17970281839370728, "learning_rate": 1.5254588904144735e-05, "loss": 0.529, "step": 3154 }, { "epoch": 0.874930671103716, "grad_norm": 0.18023180961608887, "learning_rate": 1.5249202733399859e-05, "loss": 0.5638, "step": 3155 }, { "epoch": 0.8752079866888519, "grad_norm": 0.20077262818813324, "learning_rate": 1.5243816026212695e-05, "loss": 0.5346, "step": 3156 }, { "epoch": 0.8754853022739878, "grad_norm": 0.18828438222408295, "learning_rate": 1.5238428783634326e-05, "loss": 0.5721, "step": 3157 }, { "epoch": 0.8757626178591237, "grad_norm": 0.19122296571731567, "learning_rate": 1.5233041006715948e-05, "loss": 0.5158, "step": 3158 }, { "epoch": 0.8760399334442596, "grad_norm": 0.19371193647384644, "learning_rate": 1.5227652696508859e-05, "loss": 0.5758, "step": 3159 }, { "epoch": 0.8763172490293955, "grad_norm": 0.19302338361740112, "learning_rate": 1.5222263854064465e-05, "loss": 0.5643, "step": 3160 }, { "epoch": 0.8765945646145313, "grad_norm": 0.28244075179100037, "learning_rate": 1.5216874480434264e-05, "loss": 0.5462, "step": 3161 }, { "epoch": 0.8768718801996672, "grad_norm": 0.1919064074754715, "learning_rate": 1.521148457666987e-05, "loss": 0.5317, "step": 3162 }, { "epoch": 0.8771491957848031, "grad_norm": 0.18293742835521698, "learning_rate": 1.5206094143823e-05, "loss": 0.514, "step": 3163 }, { "epoch": 0.877426511369939, "grad_norm": 0.193914532661438, "learning_rate": 1.520070318294546e-05, "loss": 0.5488, "step": 3164 }, { "epoch": 0.8777038269550749, "grad_norm": 0.1869155466556549, "learning_rate": 1.5195311695089175e-05, "loss": 0.5591, "step": 3165 }, { "epoch": 0.8779811425402108, "grad_norm": 0.19056154787540436, "learning_rate": 1.5189919681306173e-05, "loss": 0.5584, "step": 3166 }, { "epoch": 0.8782584581253466, "grad_norm": 0.1936255842447281, "learning_rate": 1.5184527142648569e-05, "loss": 0.5477, "step": 3167 }, { "epoch": 0.8785357737104825, "grad_norm": 0.19368582963943481, "learning_rate": 1.5179134080168595e-05, "loss": 0.5568, "step": 3168 }, { "epoch": 0.8788130892956184, "grad_norm": 0.19679602980613708, "learning_rate": 1.517374049491858e-05, "loss": 0.5549, "step": 3169 }, { "epoch": 0.8790904048807543, "grad_norm": 0.20116961002349854, "learning_rate": 1.5168346387950955e-05, "loss": 0.5565, "step": 3170 }, { "epoch": 0.8793677204658902, "grad_norm": 0.1948871910572052, "learning_rate": 1.5162951760318256e-05, "loss": 0.5502, "step": 3171 }, { "epoch": 0.8796450360510261, "grad_norm": 0.19126254320144653, "learning_rate": 1.515755661307311e-05, "loss": 0.5574, "step": 3172 }, { "epoch": 0.879922351636162, "grad_norm": 0.18868835270404816, "learning_rate": 1.5152160947268262e-05, "loss": 0.5377, "step": 3173 }, { "epoch": 0.8801996672212978, "grad_norm": 0.20093926787376404, "learning_rate": 1.5146764763956542e-05, "loss": 0.5486, "step": 3174 }, { "epoch": 0.8804769828064337, "grad_norm": 0.20768702030181885, "learning_rate": 1.5141368064190897e-05, "loss": 0.5431, "step": 3175 }, { "epoch": 0.8807542983915696, "grad_norm": 0.19229131937026978, "learning_rate": 1.5135970849024356e-05, "loss": 0.5298, "step": 3176 }, { "epoch": 0.8810316139767055, "grad_norm": 0.18817956745624542, "learning_rate": 1.5130573119510064e-05, "loss": 0.5414, "step": 3177 }, { "epoch": 0.8813089295618414, "grad_norm": 0.19554218649864197, "learning_rate": 1.5125174876701262e-05, "loss": 0.5381, "step": 3178 }, { "epoch": 0.8815862451469773, "grad_norm": 0.1836453080177307, "learning_rate": 1.5119776121651288e-05, "loss": 0.508, "step": 3179 }, { "epoch": 0.8818635607321131, "grad_norm": 0.2512352764606476, "learning_rate": 1.5114376855413586e-05, "loss": 0.5146, "step": 3180 }, { "epoch": 0.882140876317249, "grad_norm": 0.19116099178791046, "learning_rate": 1.5108977079041692e-05, "loss": 0.5332, "step": 3181 }, { "epoch": 0.8824181919023849, "grad_norm": 0.19376236200332642, "learning_rate": 1.5103576793589244e-05, "loss": 0.5188, "step": 3182 }, { "epoch": 0.8826955074875208, "grad_norm": 0.2322767972946167, "learning_rate": 1.5098176000109984e-05, "loss": 0.5468, "step": 3183 }, { "epoch": 0.8829728230726567, "grad_norm": 0.19449107348918915, "learning_rate": 1.5092774699657747e-05, "loss": 0.5223, "step": 3184 }, { "epoch": 0.8832501386577926, "grad_norm": 0.19483071565628052, "learning_rate": 1.5087372893286475e-05, "loss": 0.5526, "step": 3185 }, { "epoch": 0.8835274542429284, "grad_norm": 0.18979991972446442, "learning_rate": 1.5081970582050201e-05, "loss": 0.5145, "step": 3186 }, { "epoch": 0.8838047698280643, "grad_norm": 0.18646521866321564, "learning_rate": 1.5076567767003056e-05, "loss": 0.5308, "step": 3187 }, { "epoch": 0.8840820854132002, "grad_norm": 0.1952681541442871, "learning_rate": 1.5071164449199277e-05, "loss": 0.5384, "step": 3188 }, { "epoch": 0.8843594009983361, "grad_norm": 0.1863500475883484, "learning_rate": 1.506576062969319e-05, "loss": 0.5721, "step": 3189 }, { "epoch": 0.884636716583472, "grad_norm": 0.19182690978050232, "learning_rate": 1.5060356309539226e-05, "loss": 0.5171, "step": 3190 }, { "epoch": 0.8849140321686079, "grad_norm": 0.18404962122440338, "learning_rate": 1.5054951489791908e-05, "loss": 0.5362, "step": 3191 }, { "epoch": 0.8851913477537438, "grad_norm": 0.19122952222824097, "learning_rate": 1.5049546171505869e-05, "loss": 0.5424, "step": 3192 }, { "epoch": 0.8854686633388796, "grad_norm": 0.1863165944814682, "learning_rate": 1.5044140355735816e-05, "loss": 0.53, "step": 3193 }, { "epoch": 0.8857459789240155, "grad_norm": 0.19876129925251007, "learning_rate": 1.5038734043536582e-05, "loss": 0.5484, "step": 3194 }, { "epoch": 0.8860232945091514, "grad_norm": 0.18133728206157684, "learning_rate": 1.5033327235963065e-05, "loss": 0.5036, "step": 3195 }, { "epoch": 0.8863006100942873, "grad_norm": 0.19382363557815552, "learning_rate": 1.5027919934070291e-05, "loss": 0.5558, "step": 3196 }, { "epoch": 0.8865779256794232, "grad_norm": 0.2071426957845688, "learning_rate": 1.5022512138913358e-05, "loss": 0.539, "step": 3197 }, { "epoch": 0.8868552412645591, "grad_norm": 0.19322386384010315, "learning_rate": 1.5017103851547476e-05, "loss": 0.551, "step": 3198 }, { "epoch": 0.8871325568496949, "grad_norm": 0.18517249822616577, "learning_rate": 1.5011695073027942e-05, "loss": 0.5171, "step": 3199 }, { "epoch": 0.8874098724348308, "grad_norm": 0.18865883350372314, "learning_rate": 1.5006285804410156e-05, "loss": 0.5206, "step": 3200 }, { "epoch": 0.8876871880199667, "grad_norm": 0.19007782638072968, "learning_rate": 1.5000876046749603e-05, "loss": 0.5194, "step": 3201 }, { "epoch": 0.8879645036051026, "grad_norm": 0.18567977845668793, "learning_rate": 1.4995465801101877e-05, "loss": 0.535, "step": 3202 }, { "epoch": 0.8882418191902385, "grad_norm": 0.19124850630760193, "learning_rate": 1.4990055068522654e-05, "loss": 0.5304, "step": 3203 }, { "epoch": 0.8885191347753744, "grad_norm": 0.19194400310516357, "learning_rate": 1.4984643850067717e-05, "loss": 0.549, "step": 3204 }, { "epoch": 0.8887964503605102, "grad_norm": 0.18528962135314941, "learning_rate": 1.4979232146792936e-05, "loss": 0.5053, "step": 3205 }, { "epoch": 0.8890737659456461, "grad_norm": 0.1908206045627594, "learning_rate": 1.4973819959754273e-05, "loss": 0.5427, "step": 3206 }, { "epoch": 0.889351081530782, "grad_norm": 0.18456414341926575, "learning_rate": 1.4968407290007796e-05, "loss": 0.567, "step": 3207 }, { "epoch": 0.8896283971159179, "grad_norm": 0.19162026047706604, "learning_rate": 1.496299413860966e-05, "loss": 0.5609, "step": 3208 }, { "epoch": 0.8899057127010538, "grad_norm": 0.18303340673446655, "learning_rate": 1.4957580506616109e-05, "loss": 0.5281, "step": 3209 }, { "epoch": 0.8901830282861897, "grad_norm": 0.19368955492973328, "learning_rate": 1.4952166395083486e-05, "loss": 0.5348, "step": 3210 }, { "epoch": 0.8904603438713256, "grad_norm": 0.1844678521156311, "learning_rate": 1.4946751805068238e-05, "loss": 0.542, "step": 3211 }, { "epoch": 0.8907376594564614, "grad_norm": 0.18950358033180237, "learning_rate": 1.4941336737626879e-05, "loss": 0.5558, "step": 3212 }, { "epoch": 0.8910149750415973, "grad_norm": 0.19502434134483337, "learning_rate": 1.4935921193816046e-05, "loss": 0.5109, "step": 3213 }, { "epoch": 0.8912922906267332, "grad_norm": 0.18396303057670593, "learning_rate": 1.4930505174692447e-05, "loss": 0.5152, "step": 3214 }, { "epoch": 0.8915696062118691, "grad_norm": 0.20252270996570587, "learning_rate": 1.4925088681312895e-05, "loss": 0.5219, "step": 3215 }, { "epoch": 0.891846921797005, "grad_norm": 0.18550090491771698, "learning_rate": 1.4919671714734288e-05, "loss": 0.5321, "step": 3216 }, { "epoch": 0.8921242373821409, "grad_norm": 0.19548344612121582, "learning_rate": 1.4914254276013622e-05, "loss": 0.559, "step": 3217 }, { "epoch": 0.8924015529672767, "grad_norm": 0.1747296005487442, "learning_rate": 1.4908836366207985e-05, "loss": 0.5218, "step": 3218 }, { "epoch": 0.8926788685524126, "grad_norm": 0.18561501801013947, "learning_rate": 1.4903417986374548e-05, "loss": 0.5615, "step": 3219 }, { "epoch": 0.8929561841375485, "grad_norm": 0.19387111067771912, "learning_rate": 1.4897999137570586e-05, "loss": 0.5437, "step": 3220 }, { "epoch": 0.8932334997226844, "grad_norm": 0.19557684659957886, "learning_rate": 1.4892579820853459e-05, "loss": 0.5387, "step": 3221 }, { "epoch": 0.8935108153078203, "grad_norm": 0.1842055469751358, "learning_rate": 1.488716003728062e-05, "loss": 0.5277, "step": 3222 }, { "epoch": 0.8937881308929562, "grad_norm": 0.1835155189037323, "learning_rate": 1.4881739787909607e-05, "loss": 0.5342, "step": 3223 }, { "epoch": 0.894065446478092, "grad_norm": 0.1914055347442627, "learning_rate": 1.4876319073798061e-05, "loss": 0.548, "step": 3224 }, { "epoch": 0.8943427620632279, "grad_norm": 0.18737882375717163, "learning_rate": 1.4870897896003705e-05, "loss": 0.5281, "step": 3225 }, { "epoch": 0.8946200776483638, "grad_norm": 0.20303812623023987, "learning_rate": 1.4865476255584351e-05, "loss": 0.5603, "step": 3226 }, { "epoch": 0.8948973932334997, "grad_norm": 0.20020204782485962, "learning_rate": 1.486005415359791e-05, "loss": 0.5434, "step": 3227 }, { "epoch": 0.8951747088186356, "grad_norm": 0.1958005726337433, "learning_rate": 1.4854631591102374e-05, "loss": 0.522, "step": 3228 }, { "epoch": 0.8954520244037715, "grad_norm": 0.23392035067081451, "learning_rate": 1.4849208569155829e-05, "loss": 0.5574, "step": 3229 }, { "epoch": 0.8957293399889074, "grad_norm": 0.1955118626356125, "learning_rate": 1.4843785088816455e-05, "loss": 0.5327, "step": 3230 }, { "epoch": 0.8960066555740432, "grad_norm": 0.18367242813110352, "learning_rate": 1.4838361151142511e-05, "loss": 0.5166, "step": 3231 }, { "epoch": 0.8962839711591791, "grad_norm": 0.19902175664901733, "learning_rate": 1.4832936757192354e-05, "loss": 0.5283, "step": 3232 }, { "epoch": 0.896561286744315, "grad_norm": 0.19180850684642792, "learning_rate": 1.4827511908024419e-05, "loss": 0.5277, "step": 3233 }, { "epoch": 0.8968386023294509, "grad_norm": 0.18553797900676727, "learning_rate": 1.4822086604697253e-05, "loss": 0.5325, "step": 3234 }, { "epoch": 0.8971159179145868, "grad_norm": 0.1895224153995514, "learning_rate": 1.4816660848269462e-05, "loss": 0.552, "step": 3235 }, { "epoch": 0.8973932334997227, "grad_norm": 0.18854071199893951, "learning_rate": 1.4811234639799761e-05, "loss": 0.5471, "step": 3236 }, { "epoch": 0.8976705490848585, "grad_norm": 0.17953041195869446, "learning_rate": 1.480580798034695e-05, "loss": 0.5556, "step": 3237 }, { "epoch": 0.8979478646699944, "grad_norm": 0.1898965984582901, "learning_rate": 1.480038087096991e-05, "loss": 0.5282, "step": 3238 }, { "epoch": 0.8982251802551303, "grad_norm": 0.19018866121768951, "learning_rate": 1.4794953312727613e-05, "loss": 0.5301, "step": 3239 }, { "epoch": 0.8985024958402662, "grad_norm": 0.1814391165971756, "learning_rate": 1.4789525306679122e-05, "loss": 0.5526, "step": 3240 }, { "epoch": 0.8987798114254021, "grad_norm": 0.19814236462116241, "learning_rate": 1.4784096853883586e-05, "loss": 0.5741, "step": 3241 }, { "epoch": 0.899057127010538, "grad_norm": 0.19339510798454285, "learning_rate": 1.4778667955400233e-05, "loss": 0.5375, "step": 3242 }, { "epoch": 0.8993344425956739, "grad_norm": 0.29308080673217773, "learning_rate": 1.4773238612288393e-05, "loss": 0.5239, "step": 3243 }, { "epoch": 0.8996117581808097, "grad_norm": 0.1853429526090622, "learning_rate": 1.476780882560747e-05, "loss": 0.5436, "step": 3244 }, { "epoch": 0.8998890737659456, "grad_norm": 0.18454696238040924, "learning_rate": 1.4762378596416961e-05, "loss": 0.5032, "step": 3245 }, { "epoch": 0.9001663893510815, "grad_norm": 0.19088363647460938, "learning_rate": 1.4756947925776448e-05, "loss": 0.5369, "step": 3246 }, { "epoch": 0.9004437049362174, "grad_norm": 0.18263565003871918, "learning_rate": 1.4751516814745598e-05, "loss": 0.5313, "step": 3247 }, { "epoch": 0.9007210205213533, "grad_norm": 0.1870230883359909, "learning_rate": 1.4746085264384165e-05, "loss": 0.551, "step": 3248 }, { "epoch": 0.9009983361064892, "grad_norm": 0.18567577004432678, "learning_rate": 1.4740653275751987e-05, "loss": 0.5585, "step": 3249 }, { "epoch": 0.901275651691625, "grad_norm": 0.20058301091194153, "learning_rate": 1.4735220849908987e-05, "loss": 0.5031, "step": 3250 }, { "epoch": 0.9015529672767609, "grad_norm": 0.18694059550762177, "learning_rate": 1.4729787987915186e-05, "loss": 0.5334, "step": 3251 }, { "epoch": 0.9018302828618968, "grad_norm": 0.18202227354049683, "learning_rate": 1.4724354690830663e-05, "loss": 0.5553, "step": 3252 }, { "epoch": 0.9021075984470327, "grad_norm": 0.19465142488479614, "learning_rate": 1.4718920959715616e-05, "loss": 0.5115, "step": 3253 }, { "epoch": 0.9023849140321686, "grad_norm": 0.18764632940292358, "learning_rate": 1.4713486795630291e-05, "loss": 0.5546, "step": 3254 }, { "epoch": 0.9026622296173045, "grad_norm": 0.2006525844335556, "learning_rate": 1.4708052199635053e-05, "loss": 0.5239, "step": 3255 }, { "epoch": 0.9029395452024404, "grad_norm": 0.18893550336360931, "learning_rate": 1.4702617172790325e-05, "loss": 0.5246, "step": 3256 }, { "epoch": 0.9032168607875762, "grad_norm": 0.2028273344039917, "learning_rate": 1.4697181716156633e-05, "loss": 0.5548, "step": 3257 }, { "epoch": 0.9034941763727121, "grad_norm": 0.18957914412021637, "learning_rate": 1.4691745830794574e-05, "loss": 0.5261, "step": 3258 }, { "epoch": 0.903771491957848, "grad_norm": 0.19126209616661072, "learning_rate": 1.4686309517764835e-05, "loss": 0.5479, "step": 3259 }, { "epoch": 0.9040488075429839, "grad_norm": 0.1924603134393692, "learning_rate": 1.4680872778128183e-05, "loss": 0.5456, "step": 3260 }, { "epoch": 0.9043261231281198, "grad_norm": 0.1945699006319046, "learning_rate": 1.4675435612945468e-05, "loss": 0.5437, "step": 3261 }, { "epoch": 0.9046034387132557, "grad_norm": 0.18470460176467896, "learning_rate": 1.466999802327763e-05, "loss": 0.5342, "step": 3262 }, { "epoch": 0.9048807542983915, "grad_norm": 0.18286101520061493, "learning_rate": 1.4664560010185685e-05, "loss": 0.5007, "step": 3263 }, { "epoch": 0.9051580698835274, "grad_norm": 0.20376376807689667, "learning_rate": 1.4659121574730736e-05, "loss": 0.5517, "step": 3264 }, { "epoch": 0.9054353854686633, "grad_norm": 0.18289178609848022, "learning_rate": 1.465368271797396e-05, "loss": 0.5254, "step": 3265 }, { "epoch": 0.9057127010537992, "grad_norm": 0.18297207355499268, "learning_rate": 1.4648243440976625e-05, "loss": 0.5287, "step": 3266 }, { "epoch": 0.9059900166389351, "grad_norm": 0.19243334233760834, "learning_rate": 1.464280374480008e-05, "loss": 0.5408, "step": 3267 }, { "epoch": 0.906267332224071, "grad_norm": 0.19214653968811035, "learning_rate": 1.463736363050575e-05, "loss": 0.5455, "step": 3268 }, { "epoch": 0.9065446478092068, "grad_norm": 0.1994084268808365, "learning_rate": 1.4631923099155143e-05, "loss": 0.546, "step": 3269 }, { "epoch": 0.9068219633943427, "grad_norm": 0.19682733714580536, "learning_rate": 1.4626482151809865e-05, "loss": 0.5031, "step": 3270 }, { "epoch": 0.9070992789794786, "grad_norm": 0.3178056478500366, "learning_rate": 1.462104078953157e-05, "loss": 0.5183, "step": 3271 }, { "epoch": 0.9073765945646145, "grad_norm": 0.1968078464269638, "learning_rate": 1.4615599013382028e-05, "loss": 0.5475, "step": 3272 }, { "epoch": 0.9076539101497504, "grad_norm": 0.18579484522342682, "learning_rate": 1.461015682442306e-05, "loss": 0.5296, "step": 3273 }, { "epoch": 0.9079312257348863, "grad_norm": 0.1962941437959671, "learning_rate": 1.4604714223716595e-05, "loss": 0.5565, "step": 3274 }, { "epoch": 0.9082085413200222, "grad_norm": 0.19378679990768433, "learning_rate": 1.4599271212324617e-05, "loss": 0.5506, "step": 3275 }, { "epoch": 0.908485856905158, "grad_norm": 0.22947950661182404, "learning_rate": 1.4593827791309206e-05, "loss": 0.5625, "step": 3276 }, { "epoch": 0.9087631724902939, "grad_norm": 0.1949935257434845, "learning_rate": 1.458838396173252e-05, "loss": 0.5327, "step": 3277 }, { "epoch": 0.9090404880754298, "grad_norm": 0.19015999138355255, "learning_rate": 1.458293972465679e-05, "loss": 0.5168, "step": 3278 }, { "epoch": 0.9093178036605657, "grad_norm": 0.19812412559986115, "learning_rate": 1.4577495081144337e-05, "loss": 0.5389, "step": 3279 }, { "epoch": 0.9095951192457016, "grad_norm": 0.19743192195892334, "learning_rate": 1.4572050032257548e-05, "loss": 0.5492, "step": 3280 }, { "epoch": 0.9098724348308375, "grad_norm": 0.17543548345565796, "learning_rate": 1.4566604579058904e-05, "loss": 0.5064, "step": 3281 }, { "epoch": 0.9101497504159733, "grad_norm": 0.20693279802799225, "learning_rate": 1.4561158722610948e-05, "loss": 0.5312, "step": 3282 }, { "epoch": 0.9104270660011092, "grad_norm": 0.19201938807964325, "learning_rate": 1.4555712463976318e-05, "loss": 0.518, "step": 3283 }, { "epoch": 0.9107043815862451, "grad_norm": 0.1977842152118683, "learning_rate": 1.4550265804217722e-05, "loss": 0.5263, "step": 3284 }, { "epoch": 0.910981697171381, "grad_norm": 0.18007364869117737, "learning_rate": 1.4544818744397947e-05, "loss": 0.521, "step": 3285 }, { "epoch": 0.9112590127565169, "grad_norm": 0.19621910154819489, "learning_rate": 1.453937128557986e-05, "loss": 0.5701, "step": 3286 }, { "epoch": 0.9115363283416528, "grad_norm": 0.1956057995557785, "learning_rate": 1.4533923428826399e-05, "loss": 0.5542, "step": 3287 }, { "epoch": 0.9118136439267887, "grad_norm": 0.20553717017173767, "learning_rate": 1.452847517520059e-05, "loss": 0.5379, "step": 3288 }, { "epoch": 0.9120909595119245, "grad_norm": 0.19297057390213013, "learning_rate": 1.4523026525765532e-05, "loss": 0.5562, "step": 3289 }, { "epoch": 0.9123682750970604, "grad_norm": 0.19896887242794037, "learning_rate": 1.4517577481584399e-05, "loss": 0.5502, "step": 3290 }, { "epoch": 0.9126455906821963, "grad_norm": 0.18490025401115417, "learning_rate": 1.4512128043720447e-05, "loss": 0.5426, "step": 3291 }, { "epoch": 0.9129229062673322, "grad_norm": 0.19716937839984894, "learning_rate": 1.4506678213236998e-05, "loss": 0.5433, "step": 3292 }, { "epoch": 0.9132002218524681, "grad_norm": 0.19106687605381012, "learning_rate": 1.4501227991197472e-05, "loss": 0.5437, "step": 3293 }, { "epoch": 0.913477537437604, "grad_norm": 0.18353629112243652, "learning_rate": 1.4495777378665337e-05, "loss": 0.5374, "step": 3294 }, { "epoch": 0.9137548530227398, "grad_norm": 0.19178996980190277, "learning_rate": 1.4490326376704161e-05, "loss": 0.5471, "step": 3295 }, { "epoch": 0.9140321686078757, "grad_norm": 0.18615961074829102, "learning_rate": 1.4484874986377573e-05, "loss": 0.5503, "step": 3296 }, { "epoch": 0.9143094841930116, "grad_norm": 0.19387958943843842, "learning_rate": 1.447942320874929e-05, "loss": 0.5174, "step": 3297 }, { "epoch": 0.9145867997781475, "grad_norm": 0.18303687870502472, "learning_rate": 1.4473971044883095e-05, "loss": 0.5335, "step": 3298 }, { "epoch": 0.9148641153632834, "grad_norm": 0.9202612638473511, "learning_rate": 1.4468518495842848e-05, "loss": 0.547, "step": 3299 }, { "epoch": 0.9151414309484193, "grad_norm": 0.1895207315683365, "learning_rate": 1.446306556269249e-05, "loss": 0.5561, "step": 3300 }, { "epoch": 0.9154187465335551, "grad_norm": 0.18821243941783905, "learning_rate": 1.4457612246496027e-05, "loss": 0.5318, "step": 3301 }, { "epoch": 0.915696062118691, "grad_norm": 0.1886141002178192, "learning_rate": 1.4452158548317551e-05, "loss": 0.5517, "step": 3302 }, { "epoch": 0.9159733777038269, "grad_norm": 0.1940373033285141, "learning_rate": 1.444670446922122e-05, "loss": 0.5617, "step": 3303 }, { "epoch": 0.9162506932889628, "grad_norm": 0.18342509865760803, "learning_rate": 1.444125001027127e-05, "loss": 0.5438, "step": 3304 }, { "epoch": 0.9165280088740987, "grad_norm": 0.19425570964813232, "learning_rate": 1.4435795172532014e-05, "loss": 0.5413, "step": 3305 }, { "epoch": 0.9168053244592346, "grad_norm": 0.19505858421325684, "learning_rate": 1.4430339957067826e-05, "loss": 0.5666, "step": 3306 }, { "epoch": 0.9170826400443705, "grad_norm": 0.19822010397911072, "learning_rate": 1.4424884364943172e-05, "loss": 0.521, "step": 3307 }, { "epoch": 0.9173599556295063, "grad_norm": 0.1916007101535797, "learning_rate": 1.4419428397222582e-05, "loss": 0.5649, "step": 3308 }, { "epoch": 0.9176372712146422, "grad_norm": 0.19743779301643372, "learning_rate": 1.441397205497065e-05, "loss": 0.5337, "step": 3309 }, { "epoch": 0.9179145867997781, "grad_norm": 0.19250613451004028, "learning_rate": 1.4408515339252068e-05, "loss": 0.5477, "step": 3310 }, { "epoch": 0.918191902384914, "grad_norm": 0.18895046412944794, "learning_rate": 1.4403058251131574e-05, "loss": 0.5782, "step": 3311 }, { "epoch": 0.9184692179700499, "grad_norm": 0.1895778477191925, "learning_rate": 1.4397600791673999e-05, "loss": 0.5344, "step": 3312 }, { "epoch": 0.9187465335551858, "grad_norm": 0.1935378462076187, "learning_rate": 1.4392142961944228e-05, "loss": 0.5568, "step": 3313 }, { "epoch": 0.9190238491403216, "grad_norm": 0.18773426115512848, "learning_rate": 1.4386684763007235e-05, "loss": 0.546, "step": 3314 }, { "epoch": 0.9193011647254575, "grad_norm": 0.18107493221759796, "learning_rate": 1.438122619592806e-05, "loss": 0.5007, "step": 3315 }, { "epoch": 0.9195784803105934, "grad_norm": 0.19908292591571808, "learning_rate": 1.4375767261771814e-05, "loss": 0.545, "step": 3316 }, { "epoch": 0.9198557958957293, "grad_norm": 0.19010509550571442, "learning_rate": 1.4370307961603673e-05, "loss": 0.5593, "step": 3317 }, { "epoch": 0.9201331114808652, "grad_norm": 0.19176799058914185, "learning_rate": 1.4364848296488897e-05, "loss": 0.5676, "step": 3318 }, { "epoch": 0.9204104270660011, "grad_norm": 0.1841881275177002, "learning_rate": 1.4359388267492812e-05, "loss": 0.5361, "step": 3319 }, { "epoch": 0.920687742651137, "grad_norm": 0.19731801748275757, "learning_rate": 1.4353927875680808e-05, "loss": 0.5213, "step": 3320 }, { "epoch": 0.9209650582362728, "grad_norm": 0.18500731885433197, "learning_rate": 1.4348467122118364e-05, "loss": 0.5417, "step": 3321 }, { "epoch": 0.9212423738214087, "grad_norm": 0.2047429382801056, "learning_rate": 1.4343006007871004e-05, "loss": 0.5333, "step": 3322 }, { "epoch": 0.9215196894065446, "grad_norm": 0.2538173198699951, "learning_rate": 1.4337544534004346e-05, "loss": 0.5529, "step": 3323 }, { "epoch": 0.9217970049916805, "grad_norm": 0.1965561956167221, "learning_rate": 1.4332082701584063e-05, "loss": 0.5432, "step": 3324 }, { "epoch": 0.9220743205768164, "grad_norm": 0.1974562108516693, "learning_rate": 1.4326620511675906e-05, "loss": 0.5159, "step": 3325 }, { "epoch": 0.9223516361619523, "grad_norm": 0.19231897592544556, "learning_rate": 1.4321157965345688e-05, "loss": 0.5674, "step": 3326 }, { "epoch": 0.9226289517470881, "grad_norm": 0.19817808270454407, "learning_rate": 1.4315695063659304e-05, "loss": 0.5496, "step": 3327 }, { "epoch": 0.922906267332224, "grad_norm": 0.1902833878993988, "learning_rate": 1.4310231807682706e-05, "loss": 0.5342, "step": 3328 }, { "epoch": 0.9231835829173599, "grad_norm": 0.19699136912822723, "learning_rate": 1.4304768198481923e-05, "loss": 0.5417, "step": 3329 }, { "epoch": 0.9234608985024958, "grad_norm": 0.19054579734802246, "learning_rate": 1.4299304237123043e-05, "loss": 0.5556, "step": 3330 }, { "epoch": 0.9237382140876317, "grad_norm": 0.191939577460289, "learning_rate": 1.4293839924672242e-05, "loss": 0.5304, "step": 3331 }, { "epoch": 0.9240155296727676, "grad_norm": 0.22973594069480896, "learning_rate": 1.4288375262195739e-05, "loss": 0.5337, "step": 3332 }, { "epoch": 0.9242928452579035, "grad_norm": 0.1892794668674469, "learning_rate": 1.4282910250759843e-05, "loss": 0.5527, "step": 3333 }, { "epoch": 0.9245701608430393, "grad_norm": 0.19066397845745087, "learning_rate": 1.427744489143092e-05, "loss": 0.5446, "step": 3334 }, { "epoch": 0.9248474764281752, "grad_norm": 0.18730609118938446, "learning_rate": 1.4271979185275402e-05, "loss": 0.5425, "step": 3335 }, { "epoch": 0.9251247920133111, "grad_norm": 0.18019746243953705, "learning_rate": 1.4266513133359802e-05, "loss": 0.5263, "step": 3336 }, { "epoch": 0.925402107598447, "grad_norm": 0.19604218006134033, "learning_rate": 1.4261046736750686e-05, "loss": 0.5552, "step": 3337 }, { "epoch": 0.9256794231835829, "grad_norm": 0.1902369260787964, "learning_rate": 1.4255579996514693e-05, "loss": 0.5531, "step": 3338 }, { "epoch": 0.9259567387687188, "grad_norm": 0.19522936642169952, "learning_rate": 1.4250112913718525e-05, "loss": 0.545, "step": 3339 }, { "epoch": 0.9262340543538546, "grad_norm": 0.21855325996875763, "learning_rate": 1.4244645489428968e-05, "loss": 0.5133, "step": 3340 }, { "epoch": 0.9265113699389905, "grad_norm": 0.20022931694984436, "learning_rate": 1.4239177724712843e-05, "loss": 0.5392, "step": 3341 }, { "epoch": 0.9267886855241264, "grad_norm": 0.19183357059955597, "learning_rate": 1.423370962063707e-05, "loss": 0.5338, "step": 3342 }, { "epoch": 0.9270660011092623, "grad_norm": 0.18884657323360443, "learning_rate": 1.4228241178268617e-05, "loss": 0.5143, "step": 3343 }, { "epoch": 0.9273433166943982, "grad_norm": 0.19082637131214142, "learning_rate": 1.4222772398674522e-05, "loss": 0.5281, "step": 3344 }, { "epoch": 0.9276206322795341, "grad_norm": 0.20209652185440063, "learning_rate": 1.4217303282921888e-05, "loss": 0.526, "step": 3345 }, { "epoch": 0.92789794786467, "grad_norm": 0.18856458365917206, "learning_rate": 1.4211833832077881e-05, "loss": 0.52, "step": 3346 }, { "epoch": 0.9281752634498058, "grad_norm": 0.1941954344511032, "learning_rate": 1.4206364047209742e-05, "loss": 0.5401, "step": 3347 }, { "epoch": 0.9284525790349417, "grad_norm": 0.18749526143074036, "learning_rate": 1.4200893929384767e-05, "loss": 0.5449, "step": 3348 }, { "epoch": 0.9287298946200776, "grad_norm": 0.18534240126609802, "learning_rate": 1.4195423479670319e-05, "loss": 0.5039, "step": 3349 }, { "epoch": 0.9290072102052135, "grad_norm": 0.18905483186244965, "learning_rate": 1.4189952699133837e-05, "loss": 0.5632, "step": 3350 }, { "epoch": 0.9292845257903494, "grad_norm": 0.20442554354667664, "learning_rate": 1.4184481588842805e-05, "loss": 0.5339, "step": 3351 }, { "epoch": 0.9295618413754853, "grad_norm": 0.1810925155878067, "learning_rate": 1.4179010149864785e-05, "loss": 0.512, "step": 3352 }, { "epoch": 0.9298391569606211, "grad_norm": 0.2010018676519394, "learning_rate": 1.4173538383267404e-05, "loss": 0.5514, "step": 3353 }, { "epoch": 0.930116472545757, "grad_norm": 0.18728989362716675, "learning_rate": 1.4168066290118342e-05, "loss": 0.5161, "step": 3354 }, { "epoch": 0.9303937881308929, "grad_norm": 0.18747897446155548, "learning_rate": 1.4162593871485352e-05, "loss": 0.5549, "step": 3355 }, { "epoch": 0.9306711037160288, "grad_norm": 0.20961935818195343, "learning_rate": 1.415712112843625e-05, "loss": 0.5422, "step": 3356 }, { "epoch": 0.9309484193011647, "grad_norm": 0.1834210753440857, "learning_rate": 1.415164806203891e-05, "loss": 0.5074, "step": 3357 }, { "epoch": 0.9312257348863006, "grad_norm": 0.1954893171787262, "learning_rate": 1.4146174673361273e-05, "loss": 0.5722, "step": 3358 }, { "epoch": 0.9315030504714364, "grad_norm": 0.1870308816432953, "learning_rate": 1.4140700963471346e-05, "loss": 0.5381, "step": 3359 }, { "epoch": 0.9317803660565723, "grad_norm": 0.19276247918605804, "learning_rate": 1.4135226933437185e-05, "loss": 0.5517, "step": 3360 }, { "epoch": 0.9320576816417082, "grad_norm": 0.18969060480594635, "learning_rate": 1.412975258432693e-05, "loss": 0.5347, "step": 3361 }, { "epoch": 0.9323349972268441, "grad_norm": 0.19431856274604797, "learning_rate": 1.4124277917208765e-05, "loss": 0.5077, "step": 3362 }, { "epoch": 0.93261231281198, "grad_norm": 0.1918652504682541, "learning_rate": 1.4118802933150943e-05, "loss": 0.526, "step": 3363 }, { "epoch": 0.9328896283971159, "grad_norm": 0.19903786480426788, "learning_rate": 1.4113327633221782e-05, "loss": 0.5312, "step": 3364 }, { "epoch": 0.9331669439822518, "grad_norm": 0.21233876049518585, "learning_rate": 1.4107852018489653e-05, "loss": 0.5256, "step": 3365 }, { "epoch": 0.9334442595673876, "grad_norm": 0.1878969967365265, "learning_rate": 1.4102376090022997e-05, "loss": 0.5439, "step": 3366 }, { "epoch": 0.9337215751525235, "grad_norm": 0.194586381316185, "learning_rate": 1.409689984889031e-05, "loss": 0.5392, "step": 3367 }, { "epoch": 0.9339988907376594, "grad_norm": 0.1912042200565338, "learning_rate": 1.4091423296160152e-05, "loss": 0.5269, "step": 3368 }, { "epoch": 0.9342762063227953, "grad_norm": 0.18847908079624176, "learning_rate": 1.4085946432901154e-05, "loss": 0.5175, "step": 3369 }, { "epoch": 0.9345535219079312, "grad_norm": 0.19912898540496826, "learning_rate": 1.4080469260181977e-05, "loss": 0.5336, "step": 3370 }, { "epoch": 0.9348308374930671, "grad_norm": 0.1928989291191101, "learning_rate": 1.4074991779071378e-05, "loss": 0.5847, "step": 3371 }, { "epoch": 0.9351081530782029, "grad_norm": 0.20281600952148438, "learning_rate": 1.4069513990638156e-05, "loss": 0.5772, "step": 3372 }, { "epoch": 0.9353854686633388, "grad_norm": 0.21058966219425201, "learning_rate": 1.4064035895951169e-05, "loss": 0.5201, "step": 3373 }, { "epoch": 0.9356627842484747, "grad_norm": 0.190606027841568, "learning_rate": 1.4058557496079342e-05, "loss": 0.5226, "step": 3374 }, { "epoch": 0.9359400998336106, "grad_norm": 0.1862584948539734, "learning_rate": 1.4053078792091654e-05, "loss": 0.5275, "step": 3375 }, { "epoch": 0.9362174154187465, "grad_norm": 0.1864033192396164, "learning_rate": 1.4047599785057144e-05, "loss": 0.515, "step": 3376 }, { "epoch": 0.9364947310038824, "grad_norm": 0.21961943805217743, "learning_rate": 1.4042120476044912e-05, "loss": 0.5344, "step": 3377 }, { "epoch": 0.9367720465890182, "grad_norm": 0.18101632595062256, "learning_rate": 1.4036640866124123e-05, "loss": 0.547, "step": 3378 }, { "epoch": 0.9370493621741541, "grad_norm": 0.21359553933143616, "learning_rate": 1.4031160956363982e-05, "loss": 0.5392, "step": 3379 }, { "epoch": 0.93732667775929, "grad_norm": 0.1979496330022812, "learning_rate": 1.4025680747833775e-05, "loss": 0.5188, "step": 3380 }, { "epoch": 0.937603993344426, "grad_norm": 0.18914289772510529, "learning_rate": 1.402020024160283e-05, "loss": 0.5291, "step": 3381 }, { "epoch": 0.9378813089295619, "grad_norm": 0.18374024331569672, "learning_rate": 1.4014719438740543e-05, "loss": 0.5368, "step": 3382 }, { "epoch": 0.9381586245146978, "grad_norm": 0.201919287443161, "learning_rate": 1.4009238340316358e-05, "loss": 0.5429, "step": 3383 }, { "epoch": 0.9384359400998337, "grad_norm": 0.19644081592559814, "learning_rate": 1.400375694739979e-05, "loss": 0.5151, "step": 3384 }, { "epoch": 0.9387132556849695, "grad_norm": 0.18970987200737, "learning_rate": 1.39982752610604e-05, "loss": 0.5852, "step": 3385 }, { "epoch": 0.9389905712701054, "grad_norm": 0.19986121356487274, "learning_rate": 1.3992793282367808e-05, "loss": 0.5209, "step": 3386 }, { "epoch": 0.9392678868552413, "grad_norm": 0.19541729986667633, "learning_rate": 1.3987311012391698e-05, "loss": 0.5315, "step": 3387 }, { "epoch": 0.9395452024403772, "grad_norm": 0.1904657930135727, "learning_rate": 1.3981828452201804e-05, "loss": 0.5483, "step": 3388 }, { "epoch": 0.9398225180255131, "grad_norm": 0.19447685778141022, "learning_rate": 1.3976345602867916e-05, "loss": 0.5446, "step": 3389 }, { "epoch": 0.940099833610649, "grad_norm": 0.1892615556716919, "learning_rate": 1.3970862465459891e-05, "loss": 0.5366, "step": 3390 }, { "epoch": 0.9403771491957849, "grad_norm": 0.1949482262134552, "learning_rate": 1.3965379041047624e-05, "loss": 0.5577, "step": 3391 }, { "epoch": 0.9406544647809207, "grad_norm": 0.19417253136634827, "learning_rate": 1.3959895330701083e-05, "loss": 0.5431, "step": 3392 }, { "epoch": 0.9409317803660566, "grad_norm": 0.19106963276863098, "learning_rate": 1.3954411335490284e-05, "loss": 0.5308, "step": 3393 }, { "epoch": 0.9412090959511925, "grad_norm": 0.19582101702690125, "learning_rate": 1.39489270564853e-05, "loss": 0.5221, "step": 3394 }, { "epoch": 0.9414864115363284, "grad_norm": 0.1944214105606079, "learning_rate": 1.3943442494756259e-05, "loss": 0.5428, "step": 3395 }, { "epoch": 0.9417637271214643, "grad_norm": 0.19008556008338928, "learning_rate": 1.3937957651373342e-05, "loss": 0.5458, "step": 3396 }, { "epoch": 0.9420410427066002, "grad_norm": 0.186477929353714, "learning_rate": 1.3932472527406792e-05, "loss": 0.5315, "step": 3397 }, { "epoch": 0.942318358291736, "grad_norm": 0.20386064052581787, "learning_rate": 1.3926987123926897e-05, "loss": 0.5309, "step": 3398 }, { "epoch": 0.9425956738768719, "grad_norm": 0.18348954617977142, "learning_rate": 1.3921501442004011e-05, "loss": 0.5333, "step": 3399 }, { "epoch": 0.9428729894620078, "grad_norm": 0.19923634827136993, "learning_rate": 1.3916015482708528e-05, "loss": 0.5301, "step": 3400 }, { "epoch": 0.9431503050471437, "grad_norm": 0.19134169816970825, "learning_rate": 1.3910529247110906e-05, "loss": 0.519, "step": 3401 }, { "epoch": 0.9434276206322796, "grad_norm": 0.18783889710903168, "learning_rate": 1.390504273628166e-05, "loss": 0.5207, "step": 3402 }, { "epoch": 0.9437049362174155, "grad_norm": 0.18133123219013214, "learning_rate": 1.3899555951291348e-05, "loss": 0.5244, "step": 3403 }, { "epoch": 0.9439822518025514, "grad_norm": 0.17967382073402405, "learning_rate": 1.3894068893210594e-05, "loss": 0.558, "step": 3404 }, { "epoch": 0.9442595673876872, "grad_norm": 0.1942291408777237, "learning_rate": 1.3888581563110059e-05, "loss": 0.5307, "step": 3405 }, { "epoch": 0.9445368829728231, "grad_norm": 0.22371730208396912, "learning_rate": 1.3883093962060472e-05, "loss": 0.5386, "step": 3406 }, { "epoch": 0.944814198557959, "grad_norm": 0.18771930038928986, "learning_rate": 1.387760609113261e-05, "loss": 0.5267, "step": 3407 }, { "epoch": 0.9450915141430949, "grad_norm": 0.19484716653823853, "learning_rate": 1.3872117951397298e-05, "loss": 0.5578, "step": 3408 }, { "epoch": 0.9453688297282308, "grad_norm": 0.190118208527565, "learning_rate": 1.3866629543925424e-05, "loss": 0.4886, "step": 3409 }, { "epoch": 0.9456461453133667, "grad_norm": 0.18805035948753357, "learning_rate": 1.3861140869787914e-05, "loss": 0.5393, "step": 3410 }, { "epoch": 0.9459234608985025, "grad_norm": 0.1881994605064392, "learning_rate": 1.385565193005576e-05, "loss": 0.5368, "step": 3411 }, { "epoch": 0.9462007764836384, "grad_norm": 0.18705572187900543, "learning_rate": 1.3850162725799997e-05, "loss": 0.5706, "step": 3412 }, { "epoch": 0.9464780920687743, "grad_norm": 0.23857946693897247, "learning_rate": 1.3844673258091714e-05, "loss": 0.5151, "step": 3413 }, { "epoch": 0.9467554076539102, "grad_norm": 0.190442755818367, "learning_rate": 1.383918352800205e-05, "loss": 0.5287, "step": 3414 }, { "epoch": 0.9470327232390461, "grad_norm": 0.19661828875541687, "learning_rate": 1.38336935366022e-05, "loss": 0.5474, "step": 3415 }, { "epoch": 0.947310038824182, "grad_norm": 0.1797836571931839, "learning_rate": 1.3828203284963409e-05, "loss": 0.5416, "step": 3416 }, { "epoch": 0.9475873544093179, "grad_norm": 0.23815502226352692, "learning_rate": 1.382271277415696e-05, "loss": 0.5501, "step": 3417 }, { "epoch": 0.9478646699944537, "grad_norm": 0.19180168211460114, "learning_rate": 1.381722200525421e-05, "loss": 0.5254, "step": 3418 }, { "epoch": 0.9481419855795896, "grad_norm": 0.2042774260044098, "learning_rate": 1.3811730979326545e-05, "loss": 0.5437, "step": 3419 }, { "epoch": 0.9484193011647255, "grad_norm": 0.1914202868938446, "learning_rate": 1.3806239697445414e-05, "loss": 0.5602, "step": 3420 }, { "epoch": 0.9486966167498614, "grad_norm": 0.2009340077638626, "learning_rate": 1.3800748160682309e-05, "loss": 0.5497, "step": 3421 }, { "epoch": 0.9489739323349973, "grad_norm": 0.19518032670021057, "learning_rate": 1.3795256370108776e-05, "loss": 0.5149, "step": 3422 }, { "epoch": 0.9492512479201332, "grad_norm": 0.19419553875923157, "learning_rate": 1.3789764326796407e-05, "loss": 0.5407, "step": 3423 }, { "epoch": 0.949528563505269, "grad_norm": 0.19669625163078308, "learning_rate": 1.3784272031816844e-05, "loss": 0.537, "step": 3424 }, { "epoch": 0.9498058790904049, "grad_norm": 0.1969965547323227, "learning_rate": 1.3778779486241786e-05, "loss": 0.5324, "step": 3425 }, { "epoch": 0.9500831946755408, "grad_norm": 0.1851070374250412, "learning_rate": 1.3773286691142966e-05, "loss": 0.5533, "step": 3426 }, { "epoch": 0.9503605102606767, "grad_norm": 0.18586984276771545, "learning_rate": 1.3767793647592175e-05, "loss": 0.5347, "step": 3427 }, { "epoch": 0.9506378258458126, "grad_norm": 0.1796058714389801, "learning_rate": 1.3762300356661261e-05, "loss": 0.5223, "step": 3428 }, { "epoch": 0.9509151414309485, "grad_norm": 0.1968797892332077, "learning_rate": 1.3756806819422097e-05, "loss": 0.5296, "step": 3429 }, { "epoch": 0.9511924570160843, "grad_norm": 0.19946354627609253, "learning_rate": 1.3751313036946627e-05, "loss": 0.555, "step": 3430 }, { "epoch": 0.9514697726012202, "grad_norm": 0.19412177801132202, "learning_rate": 1.3745819010306832e-05, "loss": 0.5367, "step": 3431 }, { "epoch": 0.9517470881863561, "grad_norm": 0.1967850923538208, "learning_rate": 1.374032474057474e-05, "loss": 0.5355, "step": 3432 }, { "epoch": 0.952024403771492, "grad_norm": 0.26605215668678284, "learning_rate": 1.3734830228822428e-05, "loss": 0.5161, "step": 3433 }, { "epoch": 0.9523017193566279, "grad_norm": 0.21650773286819458, "learning_rate": 1.372933547612202e-05, "loss": 0.5398, "step": 3434 }, { "epoch": 0.9525790349417638, "grad_norm": 0.20562243461608887, "learning_rate": 1.3723840483545697e-05, "loss": 0.5509, "step": 3435 }, { "epoch": 0.9528563505268997, "grad_norm": 0.19407188892364502, "learning_rate": 1.3718345252165663e-05, "loss": 0.5373, "step": 3436 }, { "epoch": 0.9531336661120355, "grad_norm": 0.19095157086849213, "learning_rate": 1.3712849783054197e-05, "loss": 0.54, "step": 3437 }, { "epoch": 0.9534109816971714, "grad_norm": 0.1831715852022171, "learning_rate": 1.3707354077283599e-05, "loss": 0.5451, "step": 3438 }, { "epoch": 0.9536882972823073, "grad_norm": 0.19795599579811096, "learning_rate": 1.3701858135926238e-05, "loss": 0.5329, "step": 3439 }, { "epoch": 0.9539656128674432, "grad_norm": 0.19685760140419006, "learning_rate": 1.3696361960054506e-05, "loss": 0.5525, "step": 3440 }, { "epoch": 0.9542429284525791, "grad_norm": 0.1895620971918106, "learning_rate": 1.3690865550740864e-05, "loss": 0.539, "step": 3441 }, { "epoch": 0.954520244037715, "grad_norm": 0.1906086504459381, "learning_rate": 1.3685368909057799e-05, "loss": 0.5449, "step": 3442 }, { "epoch": 0.9547975596228508, "grad_norm": 0.19026583433151245, "learning_rate": 1.3679872036077853e-05, "loss": 0.5303, "step": 3443 }, { "epoch": 0.9550748752079867, "grad_norm": 0.1858745813369751, "learning_rate": 1.3674374932873615e-05, "loss": 0.5364, "step": 3444 }, { "epoch": 0.9553521907931226, "grad_norm": 0.19552922248840332, "learning_rate": 1.3668877600517712e-05, "loss": 0.5367, "step": 3445 }, { "epoch": 0.9556295063782585, "grad_norm": 0.1855076402425766, "learning_rate": 1.3663380040082821e-05, "loss": 0.532, "step": 3446 }, { "epoch": 0.9559068219633944, "grad_norm": 0.19864481687545776, "learning_rate": 1.365788225264166e-05, "loss": 0.5252, "step": 3447 }, { "epoch": 0.9561841375485303, "grad_norm": 0.18544144928455353, "learning_rate": 1.3652384239266993e-05, "loss": 0.5359, "step": 3448 }, { "epoch": 0.9564614531336662, "grad_norm": 0.18177370727062225, "learning_rate": 1.364688600103163e-05, "loss": 0.5516, "step": 3449 }, { "epoch": 0.956738768718802, "grad_norm": 0.2049761265516281, "learning_rate": 1.3641387539008424e-05, "loss": 0.5446, "step": 3450 }, { "epoch": 0.9570160843039379, "grad_norm": 0.18312260508537292, "learning_rate": 1.3635888854270268e-05, "loss": 0.5292, "step": 3451 }, { "epoch": 0.9572933998890738, "grad_norm": 0.18809424340724945, "learning_rate": 1.36303899478901e-05, "loss": 0.5296, "step": 3452 }, { "epoch": 0.9575707154742097, "grad_norm": 0.183831587433815, "learning_rate": 1.3624890820940902e-05, "loss": 0.5213, "step": 3453 }, { "epoch": 0.9578480310593456, "grad_norm": 0.18347223103046417, "learning_rate": 1.3619391474495708e-05, "loss": 0.5516, "step": 3454 }, { "epoch": 0.9581253466444815, "grad_norm": 0.1897597759962082, "learning_rate": 1.3613891909627575e-05, "loss": 0.5539, "step": 3455 }, { "epoch": 0.9584026622296173, "grad_norm": 0.19610610604286194, "learning_rate": 1.360839212740962e-05, "loss": 0.5581, "step": 3456 }, { "epoch": 0.9586799778147532, "grad_norm": 0.20589366555213928, "learning_rate": 1.3602892128914992e-05, "loss": 0.5266, "step": 3457 }, { "epoch": 0.9589572933998891, "grad_norm": 0.18203216791152954, "learning_rate": 1.3597391915216896e-05, "loss": 0.5222, "step": 3458 }, { "epoch": 0.959234608985025, "grad_norm": 0.19360937178134918, "learning_rate": 1.3591891487388553e-05, "loss": 0.5271, "step": 3459 }, { "epoch": 0.9595119245701609, "grad_norm": 0.2998809814453125, "learning_rate": 1.3586390846503259e-05, "loss": 0.5281, "step": 3460 }, { "epoch": 0.9597892401552968, "grad_norm": 0.18753725290298462, "learning_rate": 1.3580889993634322e-05, "loss": 0.5553, "step": 3461 }, { "epoch": 0.9600665557404326, "grad_norm": 0.19862516224384308, "learning_rate": 1.3575388929855112e-05, "loss": 0.4975, "step": 3462 }, { "epoch": 0.9603438713255685, "grad_norm": 0.18844319880008698, "learning_rate": 1.3569887656239033e-05, "loss": 0.4977, "step": 3463 }, { "epoch": 0.9606211869107044, "grad_norm": 0.2365204095840454, "learning_rate": 1.3564386173859523e-05, "loss": 0.5243, "step": 3464 }, { "epoch": 0.9608985024958403, "grad_norm": 0.19346970319747925, "learning_rate": 1.3558884483790072e-05, "loss": 0.5504, "step": 3465 }, { "epoch": 0.9611758180809762, "grad_norm": 0.19397403299808502, "learning_rate": 1.3553382587104201e-05, "loss": 0.5448, "step": 3466 }, { "epoch": 0.9614531336661121, "grad_norm": 0.19531095027923584, "learning_rate": 1.3547880484875477e-05, "loss": 0.5614, "step": 3467 }, { "epoch": 0.961730449251248, "grad_norm": 0.20053645968437195, "learning_rate": 1.354237817817751e-05, "loss": 0.5412, "step": 3468 }, { "epoch": 0.9620077648363838, "grad_norm": 0.19779855012893677, "learning_rate": 1.3536875668083943e-05, "loss": 0.5675, "step": 3469 }, { "epoch": 0.9622850804215197, "grad_norm": 0.19257789850234985, "learning_rate": 1.3531372955668462e-05, "loss": 0.5128, "step": 3470 }, { "epoch": 0.9625623960066556, "grad_norm": 0.19003498554229736, "learning_rate": 1.352587004200479e-05, "loss": 0.545, "step": 3471 }, { "epoch": 0.9628397115917915, "grad_norm": 0.1935829222202301, "learning_rate": 1.3520366928166695e-05, "loss": 0.5568, "step": 3472 }, { "epoch": 0.9631170271769274, "grad_norm": 0.19856667518615723, "learning_rate": 1.3514863615227979e-05, "loss": 0.5247, "step": 3473 }, { "epoch": 0.9633943427620633, "grad_norm": 0.18320922553539276, "learning_rate": 1.3509360104262478e-05, "loss": 0.5358, "step": 3474 }, { "epoch": 0.9636716583471991, "grad_norm": 0.20552265644073486, "learning_rate": 1.3503856396344086e-05, "loss": 0.5631, "step": 3475 }, { "epoch": 0.963948973932335, "grad_norm": 0.19140848517417908, "learning_rate": 1.3498352492546706e-05, "loss": 0.5241, "step": 3476 }, { "epoch": 0.9642262895174709, "grad_norm": 0.18474581837654114, "learning_rate": 1.3492848393944312e-05, "loss": 0.4961, "step": 3477 }, { "epoch": 0.9645036051026068, "grad_norm": 0.1950991153717041, "learning_rate": 1.3487344101610885e-05, "loss": 0.5223, "step": 3478 }, { "epoch": 0.9647809206877427, "grad_norm": 0.19947074353694916, "learning_rate": 1.348183961662047e-05, "loss": 0.5232, "step": 3479 }, { "epoch": 0.9650582362728786, "grad_norm": 0.19454912841320038, "learning_rate": 1.3476334940047127e-05, "loss": 0.5611, "step": 3480 }, { "epoch": 0.9653355518580145, "grad_norm": 0.19796130061149597, "learning_rate": 1.3470830072964973e-05, "loss": 0.5459, "step": 3481 }, { "epoch": 0.9656128674431503, "grad_norm": 0.1934625804424286, "learning_rate": 1.346532501644815e-05, "loss": 0.5142, "step": 3482 }, { "epoch": 0.9658901830282862, "grad_norm": 0.18265816569328308, "learning_rate": 1.345981977157084e-05, "loss": 0.5416, "step": 3483 }, { "epoch": 0.9661674986134221, "grad_norm": 0.19438108801841736, "learning_rate": 1.3454314339407262e-05, "loss": 0.5264, "step": 3484 }, { "epoch": 0.966444814198558, "grad_norm": 0.20060043036937714, "learning_rate": 1.3448808721031673e-05, "loss": 0.5472, "step": 3485 }, { "epoch": 0.9667221297836939, "grad_norm": 0.20769962668418884, "learning_rate": 1.3443302917518361e-05, "loss": 0.5334, "step": 3486 }, { "epoch": 0.9669994453688298, "grad_norm": 0.19313625991344452, "learning_rate": 1.3437796929941661e-05, "loss": 0.5565, "step": 3487 }, { "epoch": 0.9672767609539656, "grad_norm": 0.20908023416996002, "learning_rate": 1.3432290759375935e-05, "loss": 0.5485, "step": 3488 }, { "epoch": 0.9675540765391015, "grad_norm": 0.1994648575782776, "learning_rate": 1.342678440689558e-05, "loss": 0.5566, "step": 3489 }, { "epoch": 0.9678313921242374, "grad_norm": 0.18501751124858856, "learning_rate": 1.342127787357503e-05, "loss": 0.5345, "step": 3490 }, { "epoch": 0.9681087077093733, "grad_norm": 0.17410939931869507, "learning_rate": 1.341577116048876e-05, "loss": 0.5523, "step": 3491 }, { "epoch": 0.9683860232945092, "grad_norm": 0.1948653906583786, "learning_rate": 1.3410264268711276e-05, "loss": 0.5419, "step": 3492 }, { "epoch": 0.9686633388796451, "grad_norm": 0.1910163313150406, "learning_rate": 1.3404757199317108e-05, "loss": 0.5215, "step": 3493 }, { "epoch": 0.968940654464781, "grad_norm": 0.21614839136600494, "learning_rate": 1.3399249953380849e-05, "loss": 0.5235, "step": 3494 }, { "epoch": 0.9692179700499168, "grad_norm": 0.19417038559913635, "learning_rate": 1.3393742531977094e-05, "loss": 0.5504, "step": 3495 }, { "epoch": 0.9694952856350527, "grad_norm": 0.18813247978687286, "learning_rate": 1.3388234936180493e-05, "loss": 0.5334, "step": 3496 }, { "epoch": 0.9697726012201886, "grad_norm": 0.18560314178466797, "learning_rate": 1.3382727167065723e-05, "loss": 0.5295, "step": 3497 }, { "epoch": 0.9700499168053245, "grad_norm": 0.19894826412200928, "learning_rate": 1.3377219225707495e-05, "loss": 0.5235, "step": 3498 }, { "epoch": 0.9703272323904604, "grad_norm": 0.18726693093776703, "learning_rate": 1.3371711113180552e-05, "loss": 0.5032, "step": 3499 }, { "epoch": 0.9706045479755963, "grad_norm": 0.20481140911579132, "learning_rate": 1.3366202830559679e-05, "loss": 0.5465, "step": 3500 }, { "epoch": 0.9708818635607321, "grad_norm": 0.18740220367908478, "learning_rate": 1.3360694378919683e-05, "loss": 0.5052, "step": 3501 }, { "epoch": 0.971159179145868, "grad_norm": 0.2005304992198944, "learning_rate": 1.3355185759335409e-05, "loss": 0.5565, "step": 3502 }, { "epoch": 0.9714364947310039, "grad_norm": 0.1884390115737915, "learning_rate": 1.3349676972881736e-05, "loss": 0.5513, "step": 3503 }, { "epoch": 0.9717138103161398, "grad_norm": 0.19259166717529297, "learning_rate": 1.3344168020633574e-05, "loss": 0.5136, "step": 3504 }, { "epoch": 0.9719911259012757, "grad_norm": 0.18772351741790771, "learning_rate": 1.3338658903665868e-05, "loss": 0.5418, "step": 3505 }, { "epoch": 0.9722684414864116, "grad_norm": 0.19562283158302307, "learning_rate": 1.3333149623053584e-05, "loss": 0.5291, "step": 3506 }, { "epoch": 0.9725457570715474, "grad_norm": 0.18652015924453735, "learning_rate": 1.332764017987174e-05, "loss": 0.5406, "step": 3507 }, { "epoch": 0.9728230726566833, "grad_norm": 0.19781257212162018, "learning_rate": 1.3322130575195366e-05, "loss": 0.5253, "step": 3508 }, { "epoch": 0.9731003882418192, "grad_norm": 0.20048947632312775, "learning_rate": 1.3316620810099536e-05, "loss": 0.5339, "step": 3509 }, { "epoch": 0.9733777038269551, "grad_norm": 0.1913914531469345, "learning_rate": 1.331111088565935e-05, "loss": 0.5399, "step": 3510 }, { "epoch": 0.973655019412091, "grad_norm": 0.18670813739299774, "learning_rate": 1.3305600802949941e-05, "loss": 0.4995, "step": 3511 }, { "epoch": 0.9739323349972269, "grad_norm": 0.1913759857416153, "learning_rate": 1.3300090563046472e-05, "loss": 0.5324, "step": 3512 }, { "epoch": 0.9742096505823628, "grad_norm": 0.18199758231639862, "learning_rate": 1.3294580167024135e-05, "loss": 0.535, "step": 3513 }, { "epoch": 0.9744869661674986, "grad_norm": 0.18627969920635223, "learning_rate": 1.328906961595815e-05, "loss": 0.5282, "step": 3514 }, { "epoch": 0.9747642817526345, "grad_norm": 0.1850520372390747, "learning_rate": 1.3283558910923785e-05, "loss": 0.5126, "step": 3515 }, { "epoch": 0.9750415973377704, "grad_norm": 0.19907443225383759, "learning_rate": 1.327804805299631e-05, "loss": 0.5228, "step": 3516 }, { "epoch": 0.9753189129229063, "grad_norm": 0.19155220687389374, "learning_rate": 1.3272537043251054e-05, "loss": 0.5289, "step": 3517 }, { "epoch": 0.9755962285080422, "grad_norm": 0.19737032055854797, "learning_rate": 1.3267025882763345e-05, "loss": 0.5342, "step": 3518 }, { "epoch": 0.9758735440931781, "grad_norm": 0.186650350689888, "learning_rate": 1.3261514572608569e-05, "loss": 0.5542, "step": 3519 }, { "epoch": 0.9761508596783139, "grad_norm": 0.1934208869934082, "learning_rate": 1.3256003113862122e-05, "loss": 0.53, "step": 3520 }, { "epoch": 0.9764281752634498, "grad_norm": 0.1932929903268814, "learning_rate": 1.3250491507599439e-05, "loss": 0.5322, "step": 3521 }, { "epoch": 0.9767054908485857, "grad_norm": 0.20356619358062744, "learning_rate": 1.3244979754895978e-05, "loss": 0.5443, "step": 3522 }, { "epoch": 0.9769828064337216, "grad_norm": 0.19713589549064636, "learning_rate": 1.3239467856827229e-05, "loss": 0.5335, "step": 3523 }, { "epoch": 0.9772601220188575, "grad_norm": 0.18841132521629333, "learning_rate": 1.323395581446871e-05, "loss": 0.5614, "step": 3524 }, { "epoch": 0.9775374376039934, "grad_norm": 0.18878857791423798, "learning_rate": 1.3228443628895962e-05, "loss": 0.536, "step": 3525 }, { "epoch": 0.9778147531891292, "grad_norm": 0.1925961673259735, "learning_rate": 1.3222931301184565e-05, "loss": 0.5743, "step": 3526 }, { "epoch": 0.9780920687742651, "grad_norm": 0.2000453919172287, "learning_rate": 1.321741883241012e-05, "loss": 0.5144, "step": 3527 }, { "epoch": 0.978369384359401, "grad_norm": 0.1939268261194229, "learning_rate": 1.3211906223648251e-05, "loss": 0.5379, "step": 3528 }, { "epoch": 0.9786466999445369, "grad_norm": 0.19331765174865723, "learning_rate": 1.3206393475974615e-05, "loss": 0.5428, "step": 3529 }, { "epoch": 0.9789240155296728, "grad_norm": 0.20399914681911469, "learning_rate": 1.3200880590464898e-05, "loss": 0.5323, "step": 3530 }, { "epoch": 0.9792013311148087, "grad_norm": 0.2036525458097458, "learning_rate": 1.3195367568194807e-05, "loss": 0.541, "step": 3531 }, { "epoch": 0.9794786466999446, "grad_norm": 0.19523896276950836, "learning_rate": 1.3189854410240082e-05, "loss": 0.5479, "step": 3532 }, { "epoch": 0.9797559622850804, "grad_norm": 0.20129665732383728, "learning_rate": 1.318434111767648e-05, "loss": 0.5565, "step": 3533 }, { "epoch": 0.9800332778702163, "grad_norm": 0.19564275443553925, "learning_rate": 1.3178827691579801e-05, "loss": 0.5781, "step": 3534 }, { "epoch": 0.9803105934553522, "grad_norm": 0.2035827934741974, "learning_rate": 1.317331413302585e-05, "loss": 0.554, "step": 3535 }, { "epoch": 0.9805879090404881, "grad_norm": 0.1899930238723755, "learning_rate": 1.3167800443090475e-05, "loss": 0.5187, "step": 3536 }, { "epoch": 0.980865224625624, "grad_norm": 0.1863166242837906, "learning_rate": 1.3162286622849538e-05, "loss": 0.5199, "step": 3537 }, { "epoch": 0.9811425402107599, "grad_norm": 0.20462384819984436, "learning_rate": 1.3156772673378936e-05, "loss": 0.5479, "step": 3538 }, { "epoch": 0.9814198557958957, "grad_norm": 0.19956259429454803, "learning_rate": 1.3151258595754581e-05, "loss": 0.5492, "step": 3539 }, { "epoch": 0.9816971713810316, "grad_norm": 0.2239493578672409, "learning_rate": 1.3145744391052422e-05, "loss": 0.563, "step": 3540 }, { "epoch": 0.9819744869661675, "grad_norm": 0.20383387804031372, "learning_rate": 1.3140230060348425e-05, "loss": 0.5309, "step": 3541 }, { "epoch": 0.9822518025513034, "grad_norm": 0.1931990385055542, "learning_rate": 1.3134715604718579e-05, "loss": 0.5233, "step": 3542 }, { "epoch": 0.9825291181364393, "grad_norm": 0.19594894349575043, "learning_rate": 1.3129201025238902e-05, "loss": 0.5366, "step": 3543 }, { "epoch": 0.9828064337215752, "grad_norm": 0.3933532238006592, "learning_rate": 1.3123686322985434e-05, "loss": 0.5762, "step": 3544 }, { "epoch": 0.983083749306711, "grad_norm": 0.19356705248355865, "learning_rate": 1.311817149903424e-05, "loss": 0.5247, "step": 3545 }, { "epoch": 0.9833610648918469, "grad_norm": 0.19310222566127777, "learning_rate": 1.3112656554461405e-05, "loss": 0.5501, "step": 3546 }, { "epoch": 0.9836383804769828, "grad_norm": 0.1923639476299286, "learning_rate": 1.310714149034305e-05, "loss": 0.5265, "step": 3547 }, { "epoch": 0.9839156960621187, "grad_norm": 0.19612760841846466, "learning_rate": 1.3101626307755303e-05, "loss": 0.5292, "step": 3548 }, { "epoch": 0.9841930116472546, "grad_norm": 0.19919681549072266, "learning_rate": 1.3096111007774322e-05, "loss": 0.565, "step": 3549 }, { "epoch": 0.9844703272323905, "grad_norm": 0.20053736865520477, "learning_rate": 1.3090595591476293e-05, "loss": 0.5669, "step": 3550 }, { "epoch": 0.9847476428175264, "grad_norm": 0.18242114782333374, "learning_rate": 1.3085080059937413e-05, "loss": 0.531, "step": 3551 }, { "epoch": 0.9850249584026622, "grad_norm": 0.18836815655231476, "learning_rate": 1.3079564414233912e-05, "loss": 0.5238, "step": 3552 }, { "epoch": 0.9853022739877981, "grad_norm": 0.185248464345932, "learning_rate": 1.3074048655442042e-05, "loss": 0.5294, "step": 3553 }, { "epoch": 0.985579589572934, "grad_norm": 0.18509770929813385, "learning_rate": 1.3068532784638065e-05, "loss": 0.5492, "step": 3554 }, { "epoch": 0.9858569051580699, "grad_norm": 0.1903439611196518, "learning_rate": 1.3063016802898288e-05, "loss": 0.5463, "step": 3555 }, { "epoch": 0.9861342207432058, "grad_norm": 0.19315816462039948, "learning_rate": 1.3057500711299006e-05, "loss": 0.533, "step": 3556 }, { "epoch": 0.9864115363283417, "grad_norm": 0.19946327805519104, "learning_rate": 1.305198451091657e-05, "loss": 0.5282, "step": 3557 }, { "epoch": 0.9866888519134775, "grad_norm": 0.1908363550901413, "learning_rate": 1.3046468202827328e-05, "loss": 0.5208, "step": 3558 }, { "epoch": 0.9869661674986134, "grad_norm": 0.1873141974210739, "learning_rate": 1.304095178810766e-05, "loss": 0.5146, "step": 3559 }, { "epoch": 0.9872434830837493, "grad_norm": 0.18219764530658722, "learning_rate": 1.303543526783397e-05, "loss": 0.5109, "step": 3560 }, { "epoch": 0.9875207986688852, "grad_norm": 0.1835818737745285, "learning_rate": 1.3029918643082673e-05, "loss": 0.5352, "step": 3561 }, { "epoch": 0.9877981142540211, "grad_norm": 0.1892389953136444, "learning_rate": 1.3024401914930207e-05, "loss": 0.5209, "step": 3562 }, { "epoch": 0.988075429839157, "grad_norm": 0.18364709615707397, "learning_rate": 1.3018885084453036e-05, "loss": 0.5213, "step": 3563 }, { "epoch": 0.9883527454242929, "grad_norm": 0.18862204253673553, "learning_rate": 1.3013368152727634e-05, "loss": 0.5151, "step": 3564 }, { "epoch": 0.9886300610094287, "grad_norm": 0.1929624229669571, "learning_rate": 1.3007851120830506e-05, "loss": 0.5347, "step": 3565 }, { "epoch": 0.9889073765945646, "grad_norm": 0.19009216129779816, "learning_rate": 1.3002333989838167e-05, "loss": 0.5589, "step": 3566 }, { "epoch": 0.9891846921797005, "grad_norm": 0.1797810047864914, "learning_rate": 1.299681676082716e-05, "loss": 0.5186, "step": 3567 }, { "epoch": 0.9894620077648364, "grad_norm": 0.1861737072467804, "learning_rate": 1.2991299434874038e-05, "loss": 0.5292, "step": 3568 }, { "epoch": 0.9897393233499723, "grad_norm": 0.186946839094162, "learning_rate": 1.298578201305538e-05, "loss": 0.5393, "step": 3569 }, { "epoch": 0.9900166389351082, "grad_norm": 0.20192833244800568, "learning_rate": 1.2980264496447784e-05, "loss": 0.5487, "step": 3570 }, { "epoch": 0.990293954520244, "grad_norm": 0.1849672794342041, "learning_rate": 1.2974746886127858e-05, "loss": 0.5342, "step": 3571 }, { "epoch": 0.9905712701053799, "grad_norm": 0.18184737861156464, "learning_rate": 1.2969229183172236e-05, "loss": 0.5387, "step": 3572 }, { "epoch": 0.9908485856905158, "grad_norm": 0.19623394310474396, "learning_rate": 1.2963711388657566e-05, "loss": 0.5588, "step": 3573 }, { "epoch": 0.9911259012756517, "grad_norm": 0.19016727805137634, "learning_rate": 1.2958193503660524e-05, "loss": 0.5393, "step": 3574 }, { "epoch": 0.9914032168607876, "grad_norm": 0.19216102361679077, "learning_rate": 1.2952675529257785e-05, "loss": 0.5383, "step": 3575 }, { "epoch": 0.9916805324459235, "grad_norm": 0.21044804155826569, "learning_rate": 1.2947157466526062e-05, "loss": 0.5453, "step": 3576 }, { "epoch": 0.9919578480310594, "grad_norm": 0.19791410863399506, "learning_rate": 1.2941639316542062e-05, "loss": 0.5562, "step": 3577 }, { "epoch": 0.9922351636161952, "grad_norm": 0.18991726636886597, "learning_rate": 1.2936121080382534e-05, "loss": 0.4977, "step": 3578 }, { "epoch": 0.9925124792013311, "grad_norm": 0.19211836159229279, "learning_rate": 1.293060275912423e-05, "loss": 0.5227, "step": 3579 }, { "epoch": 0.992789794786467, "grad_norm": 0.19635222852230072, "learning_rate": 1.292508435384392e-05, "loss": 0.5454, "step": 3580 }, { "epoch": 0.9930671103716029, "grad_norm": 0.18526218831539154, "learning_rate": 1.2919565865618388e-05, "loss": 0.5429, "step": 3581 }, { "epoch": 0.9933444259567388, "grad_norm": 0.1954220086336136, "learning_rate": 1.291404729552444e-05, "loss": 0.5479, "step": 3582 }, { "epoch": 0.9936217415418747, "grad_norm": 0.1860000044107437, "learning_rate": 1.2908528644638895e-05, "loss": 0.5291, "step": 3583 }, { "epoch": 0.9938990571270105, "grad_norm": 0.19651706516742706, "learning_rate": 1.2903009914038586e-05, "loss": 0.5345, "step": 3584 }, { "epoch": 0.9941763727121464, "grad_norm": 0.18839724361896515, "learning_rate": 1.2897491104800366e-05, "loss": 0.5624, "step": 3585 }, { "epoch": 0.9944536882972823, "grad_norm": 0.1895817518234253, "learning_rate": 1.28919722180011e-05, "loss": 0.5577, "step": 3586 }, { "epoch": 0.9947310038824182, "grad_norm": 0.18223224580287933, "learning_rate": 1.288645325471767e-05, "loss": 0.5511, "step": 3587 }, { "epoch": 0.9950083194675541, "grad_norm": 0.18843968212604523, "learning_rate": 1.2880934216026971e-05, "loss": 0.5416, "step": 3588 }, { "epoch": 0.99528563505269, "grad_norm": 0.18690787255764008, "learning_rate": 1.2875415103005915e-05, "loss": 0.5348, "step": 3589 }, { "epoch": 0.9955629506378258, "grad_norm": 0.18256263434886932, "learning_rate": 1.2869895916731426e-05, "loss": 0.5147, "step": 3590 }, { "epoch": 0.9958402662229617, "grad_norm": 0.18710075318813324, "learning_rate": 1.2864376658280441e-05, "loss": 0.5245, "step": 3591 }, { "epoch": 0.9961175818080976, "grad_norm": 0.20847736299037933, "learning_rate": 1.2858857328729915e-05, "loss": 0.5432, "step": 3592 }, { "epoch": 0.9963948973932335, "grad_norm": 0.18452630937099457, "learning_rate": 1.2853337929156822e-05, "loss": 0.5279, "step": 3593 }, { "epoch": 0.9966722129783694, "grad_norm": 0.19052648544311523, "learning_rate": 1.2847818460638131e-05, "loss": 0.5312, "step": 3594 }, { "epoch": 0.9969495285635053, "grad_norm": 0.19351086020469666, "learning_rate": 1.2842298924250848e-05, "loss": 0.525, "step": 3595 }, { "epoch": 0.9972268441486412, "grad_norm": 0.17867478728294373, "learning_rate": 1.2836779321071974e-05, "loss": 0.5493, "step": 3596 }, { "epoch": 0.997504159733777, "grad_norm": 0.18988832831382751, "learning_rate": 1.2831259652178532e-05, "loss": 0.5449, "step": 3597 }, { "epoch": 0.9977814753189129, "grad_norm": 0.18697360157966614, "learning_rate": 1.2825739918647553e-05, "loss": 0.5146, "step": 3598 }, { "epoch": 0.9980587909040488, "grad_norm": 0.19430890679359436, "learning_rate": 1.2820220121556087e-05, "loss": 0.5423, "step": 3599 }, { "epoch": 0.9983361064891847, "grad_norm": 0.19263558089733124, "learning_rate": 1.2814700261981195e-05, "loss": 0.5393, "step": 3600 }, { "epoch": 0.9986134220743206, "grad_norm": 0.1892475187778473, "learning_rate": 1.2809180340999938e-05, "loss": 0.5205, "step": 3601 }, { "epoch": 0.9988907376594565, "grad_norm": 0.19061142206192017, "learning_rate": 1.280366035968941e-05, "loss": 0.5195, "step": 3602 }, { "epoch": 0.9991680532445923, "grad_norm": 0.1829683780670166, "learning_rate": 1.2798140319126695e-05, "loss": 0.5111, "step": 3603 }, { "epoch": 0.9994453688297282, "grad_norm": 0.18301549553871155, "learning_rate": 1.279262022038891e-05, "loss": 0.5393, "step": 3604 }, { "epoch": 0.9997226844148641, "grad_norm": 0.18907521665096283, "learning_rate": 1.2787100064553162e-05, "loss": 0.534, "step": 3605 }, { "epoch": 1.0, "grad_norm": 0.20034904778003693, "learning_rate": 1.2781579852696588e-05, "loss": 0.5388, "step": 3606 }, { "epoch": 1.0, "eval_loss": 0.8209076523780823, "eval_runtime": 415.5978, "eval_samples_per_second": 98.684, "eval_steps_per_second": 1.542, "step": 3606 } ], "logging_steps": 1, "max_steps": 7212, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.7655222359700695e+20, "train_batch_size": 8, "trial_name": null, "trial_params": null }