{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 460, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004347826086956522, "grad_norm": 209.3180389404297, "learning_rate": 0.0, "loss": 5.8188, "step": 1 }, { "epoch": 0.008695652173913044, "grad_norm": 215.69874572753906, "learning_rate": 4.347826086956522e-06, "loss": 5.9259, "step": 2 }, { "epoch": 0.013043478260869565, "grad_norm": 62.712825775146484, "learning_rate": 8.695652173913044e-06, "loss": 5.4202, "step": 3 }, { "epoch": 0.017391304347826087, "grad_norm": 85.59194946289062, "learning_rate": 1.3043478260869566e-05, "loss": 5.3079, "step": 4 }, { "epoch": 0.021739130434782608, "grad_norm": 22.901897430419922, "learning_rate": 1.739130434782609e-05, "loss": 5.0196, "step": 5 }, { "epoch": 0.02608695652173913, "grad_norm": 22.081829071044922, "learning_rate": 2.173913043478261e-05, "loss": 4.8222, "step": 6 }, { "epoch": 0.030434782608695653, "grad_norm": 11.022245407104492, "learning_rate": 2.608695652173913e-05, "loss": 4.4617, "step": 7 }, { "epoch": 0.034782608695652174, "grad_norm": 7.274469375610352, "learning_rate": 3.0434782608695656e-05, "loss": 4.335, "step": 8 }, { "epoch": 0.0391304347826087, "grad_norm": 3.8645834922790527, "learning_rate": 3.478260869565218e-05, "loss": 4.0476, "step": 9 }, { "epoch": 0.043478260869565216, "grad_norm": 2.6724016666412354, "learning_rate": 3.91304347826087e-05, "loss": 3.8387, "step": 10 }, { "epoch": 0.04782608695652174, "grad_norm": 2.258195161819458, "learning_rate": 4.347826086956522e-05, "loss": 3.8144, "step": 11 }, { "epoch": 0.05217391304347826, "grad_norm": 1.8822625875473022, "learning_rate": 4.782608695652174e-05, "loss": 3.4008, "step": 12 }, { "epoch": 0.05652173913043478, "grad_norm": 2.047840118408203, "learning_rate": 5.217391304347826e-05, "loss": 3.2554, "step": 13 }, { "epoch": 0.06086956521739131, "grad_norm": 1.8671568632125854, "learning_rate": 5.652173913043478e-05, "loss": 3.2461, "step": 14 }, { "epoch": 0.06521739130434782, "grad_norm": 1.6069483757019043, "learning_rate": 6.086956521739131e-05, "loss": 2.9738, "step": 15 }, { "epoch": 0.06956521739130435, "grad_norm": 1.3096915483474731, "learning_rate": 6.521739130434783e-05, "loss": 2.7823, "step": 16 }, { "epoch": 0.07391304347826087, "grad_norm": 1.3594956398010254, "learning_rate": 6.956521739130436e-05, "loss": 2.6255, "step": 17 }, { "epoch": 0.0782608695652174, "grad_norm": 1.0210895538330078, "learning_rate": 7.391304347826086e-05, "loss": 2.4501, "step": 18 }, { "epoch": 0.08260869565217391, "grad_norm": 0.8942164182662964, "learning_rate": 7.82608695652174e-05, "loss": 2.2934, "step": 19 }, { "epoch": 0.08695652173913043, "grad_norm": 0.8361735343933105, "learning_rate": 8.260869565217392e-05, "loss": 2.2029, "step": 20 }, { "epoch": 0.09130434782608696, "grad_norm": 0.794482409954071, "learning_rate": 8.695652173913044e-05, "loss": 2.0223, "step": 21 }, { "epoch": 0.09565217391304348, "grad_norm": 0.7513137459754944, "learning_rate": 9.130434782608696e-05, "loss": 1.8504, "step": 22 }, { "epoch": 0.1, "grad_norm": 0.76312655210495, "learning_rate": 9.565217391304348e-05, "loss": 1.6577, "step": 23 }, { "epoch": 0.10434782608695652, "grad_norm": 0.8560758829116821, "learning_rate": 0.0001, "loss": 1.5565, "step": 24 }, { "epoch": 0.10869565217391304, "grad_norm": 0.7479954957962036, "learning_rate": 0.00010434782608695653, "loss": 1.4364, "step": 25 }, { "epoch": 0.11304347826086956, "grad_norm": 0.5951140522956848, "learning_rate": 0.00010869565217391305, "loss": 1.2957, "step": 26 }, { "epoch": 0.11739130434782609, "grad_norm": 0.503224790096283, "learning_rate": 0.00011304347826086956, "loss": 1.1799, "step": 27 }, { "epoch": 0.12173913043478261, "grad_norm": 0.47480374574661255, "learning_rate": 0.0001173913043478261, "loss": 1.1277, "step": 28 }, { "epoch": 0.12608695652173912, "grad_norm": 0.38552260398864746, "learning_rate": 0.00012173913043478263, "loss": 1.0744, "step": 29 }, { "epoch": 0.13043478260869565, "grad_norm": 0.35596558451652527, "learning_rate": 0.00012608695652173915, "loss": 1.0023, "step": 30 }, { "epoch": 0.13478260869565217, "grad_norm": 0.32971665263175964, "learning_rate": 0.00013043478260869567, "loss": 0.9691, "step": 31 }, { "epoch": 0.1391304347826087, "grad_norm": 0.37770169973373413, "learning_rate": 0.0001347826086956522, "loss": 0.9116, "step": 32 }, { "epoch": 0.14347826086956522, "grad_norm": 0.22640736401081085, "learning_rate": 0.0001391304347826087, "loss": 0.8613, "step": 33 }, { "epoch": 0.14782608695652175, "grad_norm": 0.20925410091876984, "learning_rate": 0.0001434782608695652, "loss": 0.8836, "step": 34 }, { "epoch": 0.15217391304347827, "grad_norm": 0.20542123913764954, "learning_rate": 0.00014782608695652173, "loss": 0.8502, "step": 35 }, { "epoch": 0.1565217391304348, "grad_norm": 0.16715222597122192, "learning_rate": 0.00015217391304347827, "loss": 0.8292, "step": 36 }, { "epoch": 0.1608695652173913, "grad_norm": 0.1648133248090744, "learning_rate": 0.0001565217391304348, "loss": 0.8189, "step": 37 }, { "epoch": 0.16521739130434782, "grad_norm": 0.13562779128551483, "learning_rate": 0.00016086956521739132, "loss": 0.8078, "step": 38 }, { "epoch": 0.16956521739130434, "grad_norm": 0.1290610432624817, "learning_rate": 0.00016521739130434784, "loss": 0.7712, "step": 39 }, { "epoch": 0.17391304347826086, "grad_norm": 0.11024343967437744, "learning_rate": 0.00016956521739130436, "loss": 0.7448, "step": 40 }, { "epoch": 0.1782608695652174, "grad_norm": 0.12418993562459946, "learning_rate": 0.00017391304347826088, "loss": 0.7633, "step": 41 }, { "epoch": 0.1826086956521739, "grad_norm": 0.10319849103689194, "learning_rate": 0.0001782608695652174, "loss": 0.7463, "step": 42 }, { "epoch": 0.18695652173913044, "grad_norm": 0.10371455550193787, "learning_rate": 0.00018260869565217392, "loss": 0.7516, "step": 43 }, { "epoch": 0.19130434782608696, "grad_norm": 0.09219090640544891, "learning_rate": 0.00018695652173913045, "loss": 0.7265, "step": 44 }, { "epoch": 0.1956521739130435, "grad_norm": 0.09577666968107224, "learning_rate": 0.00019130434782608697, "loss": 0.7382, "step": 45 }, { "epoch": 0.2, "grad_norm": 0.08755916357040405, "learning_rate": 0.0001956521739130435, "loss": 0.7392, "step": 46 }, { "epoch": 0.20434782608695654, "grad_norm": 0.08335893601179123, "learning_rate": 0.0002, "loss": 0.7182, "step": 47 }, { "epoch": 0.20869565217391303, "grad_norm": 0.08622466027736664, "learning_rate": 0.00019999712083215463, "loss": 0.7196, "step": 48 }, { "epoch": 0.21304347826086956, "grad_norm": 0.07222707569599152, "learning_rate": 0.00019998848349441062, "loss": 0.7014, "step": 49 }, { "epoch": 0.21739130434782608, "grad_norm": 0.07286012172698975, "learning_rate": 0.00019997408848413493, "loss": 0.6986, "step": 50 }, { "epoch": 0.2217391304347826, "grad_norm": 0.07811558246612549, "learning_rate": 0.00019995393663024054, "loss": 0.6922, "step": 51 }, { "epoch": 0.22608695652173913, "grad_norm": 0.07095416635274887, "learning_rate": 0.0001999280290931388, "loss": 0.7188, "step": 52 }, { "epoch": 0.23043478260869565, "grad_norm": 0.0705651044845581, "learning_rate": 0.00019989636736467278, "loss": 0.7135, "step": 53 }, { "epoch": 0.23478260869565218, "grad_norm": 0.0649741142988205, "learning_rate": 0.00019985895326803097, "loss": 0.6833, "step": 54 }, { "epoch": 0.2391304347826087, "grad_norm": 0.07023416459560394, "learning_rate": 0.00019981578895764273, "loss": 0.6902, "step": 55 }, { "epoch": 0.24347826086956523, "grad_norm": 0.065043605864048, "learning_rate": 0.00019976687691905393, "loss": 0.6933, "step": 56 }, { "epoch": 0.24782608695652175, "grad_norm": 0.0647321566939354, "learning_rate": 0.00019971221996878394, "loss": 0.6946, "step": 57 }, { "epoch": 0.25217391304347825, "grad_norm": 0.08214448392391205, "learning_rate": 0.0001996518212541634, "loss": 0.6789, "step": 58 }, { "epoch": 0.2565217391304348, "grad_norm": 0.06106014922261238, "learning_rate": 0.00019958568425315314, "loss": 0.6826, "step": 59 }, { "epoch": 0.2608695652173913, "grad_norm": 0.06052952632308006, "learning_rate": 0.0001995138127741436, "loss": 0.6706, "step": 60 }, { "epoch": 0.26521739130434785, "grad_norm": 0.06265316903591156, "learning_rate": 0.00019943621095573586, "loss": 0.6809, "step": 61 }, { "epoch": 0.26956521739130435, "grad_norm": 0.0603368878364563, "learning_rate": 0.00019935288326650312, "loss": 0.6728, "step": 62 }, { "epoch": 0.27391304347826084, "grad_norm": 0.06611189991235733, "learning_rate": 0.00019926383450473344, "loss": 0.6499, "step": 63 }, { "epoch": 0.2782608695652174, "grad_norm": 0.06278355419635773, "learning_rate": 0.00019916906979815347, "loss": 0.6561, "step": 64 }, { "epoch": 0.2826086956521739, "grad_norm": 0.07379094511270523, "learning_rate": 0.00019906859460363307, "loss": 0.6786, "step": 65 }, { "epoch": 0.28695652173913044, "grad_norm": 0.09574166685342789, "learning_rate": 0.0001989624147068713, "loss": 0.6625, "step": 66 }, { "epoch": 0.29130434782608694, "grad_norm": 0.08743462711572647, "learning_rate": 0.00019885053622206304, "loss": 0.648, "step": 67 }, { "epoch": 0.2956521739130435, "grad_norm": 0.08914034813642502, "learning_rate": 0.00019873296559154698, "loss": 0.6561, "step": 68 }, { "epoch": 0.3, "grad_norm": 0.06804706901311874, "learning_rate": 0.0001986097095854347, "loss": 0.658, "step": 69 }, { "epoch": 0.30434782608695654, "grad_norm": 0.09893489629030228, "learning_rate": 0.00019848077530122083, "loss": 0.6708, "step": 70 }, { "epoch": 0.30869565217391304, "grad_norm": 0.07928409427404404, "learning_rate": 0.0001983461701633742, "loss": 0.6407, "step": 71 }, { "epoch": 0.3130434782608696, "grad_norm": 0.07455449551343918, "learning_rate": 0.0001982059019229106, "loss": 0.676, "step": 72 }, { "epoch": 0.3173913043478261, "grad_norm": 0.0770968496799469, "learning_rate": 0.00019805997865694614, "loss": 0.6639, "step": 73 }, { "epoch": 0.3217391304347826, "grad_norm": 0.06771919876337051, "learning_rate": 0.00019790840876823232, "loss": 0.6486, "step": 74 }, { "epoch": 0.32608695652173914, "grad_norm": 0.07457810640335083, "learning_rate": 0.0001977512009846721, "loss": 0.6681, "step": 75 }, { "epoch": 0.33043478260869563, "grad_norm": 0.0826922208070755, "learning_rate": 0.00019758836435881746, "loss": 0.6356, "step": 76 }, { "epoch": 0.3347826086956522, "grad_norm": 0.07923886179924011, "learning_rate": 0.00019741990826734794, "loss": 0.6682, "step": 77 }, { "epoch": 0.3391304347826087, "grad_norm": 0.11045071482658386, "learning_rate": 0.0001972458424105307, "loss": 0.6203, "step": 78 }, { "epoch": 0.34347826086956523, "grad_norm": 0.11731227487325668, "learning_rate": 0.00019706617681166218, "loss": 0.66, "step": 79 }, { "epoch": 0.34782608695652173, "grad_norm": 0.12649305164813995, "learning_rate": 0.00019688092181649065, "loss": 0.6613, "step": 80 }, { "epoch": 0.3521739130434783, "grad_norm": 0.1144268661737442, "learning_rate": 0.00019669008809262062, "loss": 0.6606, "step": 81 }, { "epoch": 0.3565217391304348, "grad_norm": 0.11361440271139145, "learning_rate": 0.00019649368662889855, "loss": 0.629, "step": 82 }, { "epoch": 0.36086956521739133, "grad_norm": 0.12539249658584595, "learning_rate": 0.00019629172873477995, "loss": 0.6676, "step": 83 }, { "epoch": 0.3652173913043478, "grad_norm": 0.11141279339790344, "learning_rate": 0.00019608422603967836, "loss": 0.6376, "step": 84 }, { "epoch": 0.3695652173913043, "grad_norm": 0.09837634861469269, "learning_rate": 0.00019587119049229557, "loss": 0.6503, "step": 85 }, { "epoch": 0.3739130434782609, "grad_norm": 0.15677575767040253, "learning_rate": 0.0001956526343599335, "loss": 0.6638, "step": 86 }, { "epoch": 0.3782608695652174, "grad_norm": 0.252825528383255, "learning_rate": 0.0001954285702277879, "loss": 0.6713, "step": 87 }, { "epoch": 0.3826086956521739, "grad_norm": 0.3602813482284546, "learning_rate": 0.00019519901099822372, "loss": 0.6596, "step": 88 }, { "epoch": 0.3869565217391304, "grad_norm": 0.3970949053764343, "learning_rate": 0.00019496396989003193, "loss": 0.6617, "step": 89 }, { "epoch": 0.391304347826087, "grad_norm": 0.284343421459198, "learning_rate": 0.00019472346043766865, "loss": 0.6229, "step": 90 }, { "epoch": 0.39565217391304347, "grad_norm": 0.19832171499729156, "learning_rate": 0.00019447749649047542, "loss": 0.6665, "step": 91 }, { "epoch": 0.4, "grad_norm": 0.24541743099689484, "learning_rate": 0.00019422609221188207, "loss": 0.6585, "step": 92 }, { "epoch": 0.4043478260869565, "grad_norm": 0.1915537267923355, "learning_rate": 0.00019396926207859084, "loss": 0.6343, "step": 93 }, { "epoch": 0.40869565217391307, "grad_norm": 0.20492875576019287, "learning_rate": 0.00019370702087974302, "loss": 0.6438, "step": 94 }, { "epoch": 0.41304347826086957, "grad_norm": 0.25835996866226196, "learning_rate": 0.00019343938371606712, "loss": 0.6502, "step": 95 }, { "epoch": 0.41739130434782606, "grad_norm": 0.2585464417934418, "learning_rate": 0.00019316636599900946, "loss": 0.6393, "step": 96 }, { "epoch": 0.4217391304347826, "grad_norm": 0.2317182868719101, "learning_rate": 0.00019288798344984672, "loss": 0.6275, "step": 97 }, { "epoch": 0.4260869565217391, "grad_norm": 0.23632416129112244, "learning_rate": 0.00019260425209878052, "loss": 0.6414, "step": 98 }, { "epoch": 0.43043478260869567, "grad_norm": 0.1801244169473648, "learning_rate": 0.00019231518828401458, "loss": 0.6491, "step": 99 }, { "epoch": 0.43478260869565216, "grad_norm": 0.24871514737606049, "learning_rate": 0.00019202080865081368, "loss": 0.6581, "step": 100 }, { "epoch": 0.4391304347826087, "grad_norm": 0.26276353001594543, "learning_rate": 0.00019172113015054532, "loss": 0.644, "step": 101 }, { "epoch": 0.4434782608695652, "grad_norm": 0.19743724167346954, "learning_rate": 0.0001914161700397035, "loss": 0.6519, "step": 102 }, { "epoch": 0.44782608695652176, "grad_norm": 0.31385916471481323, "learning_rate": 0.00019110594587891519, "loss": 0.6462, "step": 103 }, { "epoch": 0.45217391304347826, "grad_norm": 0.2689647674560547, "learning_rate": 0.0001907904755319289, "loss": 0.6517, "step": 104 }, { "epoch": 0.45652173913043476, "grad_norm": 0.17245543003082275, "learning_rate": 0.00019046977716458626, "loss": 0.6245, "step": 105 }, { "epoch": 0.4608695652173913, "grad_norm": 0.4380849003791809, "learning_rate": 0.00019014386924377582, "loss": 0.6519, "step": 106 }, { "epoch": 0.4652173913043478, "grad_norm": 0.305043488740921, "learning_rate": 0.0001898127705363696, "loss": 0.6606, "step": 107 }, { "epoch": 0.46956521739130436, "grad_norm": 0.20340269804000854, "learning_rate": 0.0001894765001081428, "loss": 0.6359, "step": 108 }, { "epoch": 0.47391304347826085, "grad_norm": 0.15703125298023224, "learning_rate": 0.0001891350773226754, "loss": 0.6461, "step": 109 }, { "epoch": 0.4782608695652174, "grad_norm": 0.16932646930217743, "learning_rate": 0.0001887885218402375, "loss": 0.6413, "step": 110 }, { "epoch": 0.4826086956521739, "grad_norm": 0.1790553480386734, "learning_rate": 0.00018843685361665723, "loss": 0.6378, "step": 111 }, { "epoch": 0.48695652173913045, "grad_norm": 0.24903282523155212, "learning_rate": 0.00018808009290217136, "loss": 0.6308, "step": 112 }, { "epoch": 0.49130434782608695, "grad_norm": 0.20529182255268097, "learning_rate": 0.00018771826024025946, "loss": 0.6315, "step": 113 }, { "epoch": 0.4956521739130435, "grad_norm": 0.18206629157066345, "learning_rate": 0.00018735137646646078, "loss": 0.6409, "step": 114 }, { "epoch": 0.5, "grad_norm": 0.22906547784805298, "learning_rate": 0.00018697946270717467, "loss": 0.6522, "step": 115 }, { "epoch": 0.5043478260869565, "grad_norm": 0.23560722172260284, "learning_rate": 0.00018660254037844388, "loss": 0.6424, "step": 116 }, { "epoch": 0.508695652173913, "grad_norm": 0.3479248881340027, "learning_rate": 0.00018622063118472134, "loss": 0.6591, "step": 117 }, { "epoch": 0.5130434782608696, "grad_norm": 0.48405924439430237, "learning_rate": 0.00018583375711762052, "loss": 0.6312, "step": 118 }, { "epoch": 0.5173913043478261, "grad_norm": 0.6660999655723572, "learning_rate": 0.00018544194045464886, "loss": 0.6492, "step": 119 }, { "epoch": 0.5217391304347826, "grad_norm": 0.6070662140846252, "learning_rate": 0.0001850452037579251, "loss": 0.631, "step": 120 }, { "epoch": 0.5260869565217391, "grad_norm": 0.2432556301355362, "learning_rate": 0.00018464356987288013, "loss": 0.6192, "step": 121 }, { "epoch": 0.5304347826086957, "grad_norm": 0.4718700647354126, "learning_rate": 0.00018423706192694116, "loss": 0.6385, "step": 122 }, { "epoch": 0.5347826086956522, "grad_norm": 0.41220200061798096, "learning_rate": 0.00018382570332820043, "loss": 0.6362, "step": 123 }, { "epoch": 0.5391304347826087, "grad_norm": 0.24313992261886597, "learning_rate": 0.00018340951776406694, "loss": 0.659, "step": 124 }, { "epoch": 0.5434782608695652, "grad_norm": 0.42307668924331665, "learning_rate": 0.00018298852919990252, "loss": 0.6484, "step": 125 }, { "epoch": 0.5478260869565217, "grad_norm": 0.2858572006225586, "learning_rate": 0.00018256276187764197, "loss": 0.6437, "step": 126 }, { "epoch": 0.5521739130434783, "grad_norm": 0.2318851351737976, "learning_rate": 0.0001821322403143969, "loss": 0.6191, "step": 127 }, { "epoch": 0.5565217391304348, "grad_norm": 0.3861188292503357, "learning_rate": 0.0001816969893010442, "loss": 0.639, "step": 128 }, { "epoch": 0.5608695652173913, "grad_norm": 0.2969801127910614, "learning_rate": 0.0001812570339007983, "loss": 0.6624, "step": 129 }, { "epoch": 0.5652173913043478, "grad_norm": 0.29341548681259155, "learning_rate": 0.00018081239944776805, "loss": 0.639, "step": 130 }, { "epoch": 0.5695652173913044, "grad_norm": 0.43678849935531616, "learning_rate": 0.00018036311154549784, "loss": 0.6384, "step": 131 }, { "epoch": 0.5739130434782609, "grad_norm": 0.5248069167137146, "learning_rate": 0.00017990919606549328, "loss": 0.6451, "step": 132 }, { "epoch": 0.5782608695652174, "grad_norm": 0.5387030243873596, "learning_rate": 0.00017945067914573146, "loss": 0.6198, "step": 133 }, { "epoch": 0.5826086956521739, "grad_norm": 0.55666184425354, "learning_rate": 0.00017898758718915586, "loss": 0.6391, "step": 134 }, { "epoch": 0.5869565217391305, "grad_norm": 0.4839560389518738, "learning_rate": 0.0001785199468621559, "loss": 0.6411, "step": 135 }, { "epoch": 0.591304347826087, "grad_norm": 0.5173195004463196, "learning_rate": 0.00017804778509303138, "loss": 0.6318, "step": 136 }, { "epoch": 0.5956521739130435, "grad_norm": 0.341448038816452, "learning_rate": 0.000177571129070442, "loss": 0.6427, "step": 137 }, { "epoch": 0.6, "grad_norm": 0.2654604911804199, "learning_rate": 0.00017709000624184162, "loss": 0.616, "step": 138 }, { "epoch": 0.6043478260869565, "grad_norm": 0.4000408351421356, "learning_rate": 0.0001766044443118978, "loss": 0.611, "step": 139 }, { "epoch": 0.6086956521739131, "grad_norm": 0.2812383770942688, "learning_rate": 0.00017611447124089649, "loss": 0.6508, "step": 140 }, { "epoch": 0.6130434782608696, "grad_norm": 0.30483949184417725, "learning_rate": 0.00017562011524313185, "loss": 0.6628, "step": 141 }, { "epoch": 0.6173913043478261, "grad_norm": 0.4457907974720001, "learning_rate": 0.0001751214047852818, "loss": 0.6274, "step": 142 }, { "epoch": 0.6217391304347826, "grad_norm": 0.38395488262176514, "learning_rate": 0.00017461836858476856, "loss": 0.6528, "step": 143 }, { "epoch": 0.6260869565217392, "grad_norm": 0.573344886302948, "learning_rate": 0.00017411103560810526, "loss": 0.6504, "step": 144 }, { "epoch": 0.6304347826086957, "grad_norm": 0.5133661031723022, "learning_rate": 0.00017359943506922774, "loss": 0.6334, "step": 145 }, { "epoch": 0.6347826086956522, "grad_norm": 0.2995568513870239, "learning_rate": 0.00017308359642781242, "loss": 0.6328, "step": 146 }, { "epoch": 0.6391304347826087, "grad_norm": 0.5677820444107056, "learning_rate": 0.0001725635493875799, "loss": 0.639, "step": 147 }, { "epoch": 0.6434782608695652, "grad_norm": 0.4751092791557312, "learning_rate": 0.00017203932389458454, "loss": 0.6229, "step": 148 }, { "epoch": 0.6478260869565218, "grad_norm": 0.4374710023403168, "learning_rate": 0.00017151095013548994, "loss": 0.6377, "step": 149 }, { "epoch": 0.6521739130434783, "grad_norm": 0.4172927439212799, "learning_rate": 0.0001709784585358309, "loss": 0.6277, "step": 150 }, { "epoch": 0.6565217391304348, "grad_norm": 0.3994798958301544, "learning_rate": 0.00017044187975826124, "loss": 0.637, "step": 151 }, { "epoch": 0.6608695652173913, "grad_norm": 0.34366917610168457, "learning_rate": 0.00016990124470078822, "loss": 0.6556, "step": 152 }, { "epoch": 0.6652173913043479, "grad_norm": 0.533347487449646, "learning_rate": 0.0001693565844949933, "loss": 0.6073, "step": 153 }, { "epoch": 0.6695652173913044, "grad_norm": 0.4292946457862854, "learning_rate": 0.0001688079305042395, "loss": 0.6548, "step": 154 }, { "epoch": 0.6739130434782609, "grad_norm": 0.2770076394081116, "learning_rate": 0.00016825531432186543, "loss": 0.6014, "step": 155 }, { "epoch": 0.6782608695652174, "grad_norm": 0.377838134765625, "learning_rate": 0.0001676987677693659, "loss": 0.6406, "step": 156 }, { "epoch": 0.6826086956521739, "grad_norm": 0.421268492937088, "learning_rate": 0.0001671383228945597, "loss": 0.6288, "step": 157 }, { "epoch": 0.6869565217391305, "grad_norm": 0.4219221770763397, "learning_rate": 0.00016657401196974405, "loss": 0.647, "step": 158 }, { "epoch": 0.691304347826087, "grad_norm": 0.3563760221004486, "learning_rate": 0.00016600586748983641, "loss": 0.6307, "step": 159 }, { "epoch": 0.6956521739130435, "grad_norm": 0.39387866854667664, "learning_rate": 0.00016543392217050314, "loss": 0.631, "step": 160 }, { "epoch": 0.7, "grad_norm": 0.36268243193626404, "learning_rate": 0.0001648582089462756, "loss": 0.6429, "step": 161 }, { "epoch": 0.7043478260869566, "grad_norm": 0.3702019155025482, "learning_rate": 0.00016427876096865394, "loss": 0.6338, "step": 162 }, { "epoch": 0.7086956521739131, "grad_norm": 0.44408297538757324, "learning_rate": 0.00016369561160419784, "loss": 0.6416, "step": 163 }, { "epoch": 0.7130434782608696, "grad_norm": 0.5986080765724182, "learning_rate": 0.00016310879443260528, "loss": 0.6187, "step": 164 }, { "epoch": 0.717391304347826, "grad_norm": 0.7963016629219055, "learning_rate": 0.0001625183432447789, "loss": 0.6365, "step": 165 }, { "epoch": 0.7217391304347827, "grad_norm": 1.2156025171279907, "learning_rate": 0.0001619242920408802, "loss": 0.6625, "step": 166 }, { "epoch": 0.7260869565217392, "grad_norm": 0.7924716472625732, "learning_rate": 0.00016132667502837165, "loss": 0.6276, "step": 167 }, { "epoch": 0.7304347826086957, "grad_norm": 0.29551273584365845, "learning_rate": 0.00016072552662004696, "loss": 0.6159, "step": 168 }, { "epoch": 0.7347826086956522, "grad_norm": 0.7566269040107727, "learning_rate": 0.00016012088143204953, "loss": 0.6485, "step": 169 }, { "epoch": 0.7391304347826086, "grad_norm": 1.001354455947876, "learning_rate": 0.00015951277428187898, "loss": 0.6323, "step": 170 }, { "epoch": 0.7434782608695653, "grad_norm": 0.9103027582168579, "learning_rate": 0.00015890124018638638, "loss": 0.6255, "step": 171 }, { "epoch": 0.7478260869565218, "grad_norm": 0.3885137736797333, "learning_rate": 0.00015828631435975784, "loss": 0.6323, "step": 172 }, { "epoch": 0.7521739130434782, "grad_norm": 0.6141281723976135, "learning_rate": 0.00015766803221148673, "loss": 0.6504, "step": 173 }, { "epoch": 0.7565217391304347, "grad_norm": 0.8024821281433105, "learning_rate": 0.0001570464293443346, "loss": 0.641, "step": 174 }, { "epoch": 0.7608695652173914, "grad_norm": 0.43333736062049866, "learning_rate": 0.00015642154155228122, "loss": 0.627, "step": 175 }, { "epoch": 0.7652173913043478, "grad_norm": 0.649389922618866, "learning_rate": 0.00015579340481846336, "loss": 0.6483, "step": 176 }, { "epoch": 0.7695652173913043, "grad_norm": 1.0359424352645874, "learning_rate": 0.00015516205531310273, "loss": 0.6332, "step": 177 }, { "epoch": 0.7739130434782608, "grad_norm": 0.7209396362304688, "learning_rate": 0.00015452752939142328, "loss": 0.6524, "step": 178 }, { "epoch": 0.7782608695652173, "grad_norm": 0.6178513169288635, "learning_rate": 0.00015388986359155758, "loss": 0.645, "step": 179 }, { "epoch": 0.782608695652174, "grad_norm": 0.9886595606803894, "learning_rate": 0.00015324909463244296, "loss": 0.6642, "step": 180 }, { "epoch": 0.7869565217391304, "grad_norm": 0.7466373443603516, "learning_rate": 0.00015260525941170712, "loss": 0.6315, "step": 181 }, { "epoch": 0.7913043478260869, "grad_norm": 0.5552679896354675, "learning_rate": 0.00015195839500354335, "loss": 0.6207, "step": 182 }, { "epoch": 0.7956521739130434, "grad_norm": 0.5576688647270203, "learning_rate": 0.0001513085386565758, "loss": 0.6421, "step": 183 }, { "epoch": 0.8, "grad_norm": 0.4000707268714905, "learning_rate": 0.00015065572779171432, "loss": 0.6398, "step": 184 }, { "epoch": 0.8043478260869565, "grad_norm": 0.4978863298892975, "learning_rate": 0.00015000000000000001, "loss": 0.6456, "step": 185 }, { "epoch": 0.808695652173913, "grad_norm": 0.4530424177646637, "learning_rate": 0.00014934139304044033, "loss": 0.6453, "step": 186 }, { "epoch": 0.8130434782608695, "grad_norm": 0.29163071513175964, "learning_rate": 0.00014867994483783485, "loss": 0.6558, "step": 187 }, { "epoch": 0.8173913043478261, "grad_norm": 0.33445900678634644, "learning_rate": 0.00014801569348059157, "loss": 0.6291, "step": 188 }, { "epoch": 0.8217391304347826, "grad_norm": 0.3891032934188843, "learning_rate": 0.0001473486772185334, "loss": 0.6458, "step": 189 }, { "epoch": 0.8260869565217391, "grad_norm": 0.4320944845676422, "learning_rate": 0.00014667893446069588, "loss": 0.6275, "step": 190 }, { "epoch": 0.8304347826086956, "grad_norm": 0.3652418553829193, "learning_rate": 0.00014600650377311522, "loss": 0.6434, "step": 191 }, { "epoch": 0.8347826086956521, "grad_norm": 0.2939096689224243, "learning_rate": 0.00014533142387660773, "loss": 0.6462, "step": 192 }, { "epoch": 0.8391304347826087, "grad_norm": 0.36094796657562256, "learning_rate": 0.00014465373364454001, "loss": 0.6259, "step": 193 }, { "epoch": 0.8434782608695652, "grad_norm": 0.503746747970581, "learning_rate": 0.00014397347210059057, "loss": 0.6565, "step": 194 }, { "epoch": 0.8478260869565217, "grad_norm": 0.501377522945404, "learning_rate": 0.00014329067841650274, "loss": 0.6358, "step": 195 }, { "epoch": 0.8521739130434782, "grad_norm": 0.40720251202583313, "learning_rate": 0.00014260539190982886, "loss": 0.636, "step": 196 }, { "epoch": 0.8565217391304348, "grad_norm": 0.3170947730541229, "learning_rate": 0.00014191765204166643, "loss": 0.6343, "step": 197 }, { "epoch": 0.8608695652173913, "grad_norm": 0.43554455041885376, "learning_rate": 0.00014122749841438575, "loss": 0.6319, "step": 198 }, { "epoch": 0.8652173913043478, "grad_norm": 0.5128415822982788, "learning_rate": 0.00014053497076934948, "loss": 0.6326, "step": 199 }, { "epoch": 0.8695652173913043, "grad_norm": 0.44992515444755554, "learning_rate": 0.00013984010898462416, "loss": 0.6343, "step": 200 }, { "epoch": 0.8739130434782608, "grad_norm": 0.506968080997467, "learning_rate": 0.00013914295307268396, "loss": 0.6472, "step": 201 }, { "epoch": 0.8782608695652174, "grad_norm": 0.6257392764091492, "learning_rate": 0.0001384435431781065, "loss": 0.6535, "step": 202 }, { "epoch": 0.8826086956521739, "grad_norm": 0.9480230808258057, "learning_rate": 0.00013774191957526143, "loss": 0.6628, "step": 203 }, { "epoch": 0.8869565217391304, "grad_norm": 1.2171893119812012, "learning_rate": 0.00013703812266599113, "loss": 0.6585, "step": 204 }, { "epoch": 0.8913043478260869, "grad_norm": 0.3134421110153198, "learning_rate": 0.00013633219297728416, "loss": 0.6629, "step": 205 }, { "epoch": 0.8956521739130435, "grad_norm": 1.003349781036377, "learning_rate": 0.00013562417115894172, "loss": 0.6516, "step": 206 }, { "epoch": 0.9, "grad_norm": 1.246419906616211, "learning_rate": 0.00013491409798123687, "loss": 0.6418, "step": 207 }, { "epoch": 0.9043478260869565, "grad_norm": 0.46948862075805664, "learning_rate": 0.00013420201433256689, "loss": 0.6441, "step": 208 }, { "epoch": 0.908695652173913, "grad_norm": 1.628340244293213, "learning_rate": 0.00013348796121709862, "loss": 0.6661, "step": 209 }, { "epoch": 0.9130434782608695, "grad_norm": 0.4027623236179352, "learning_rate": 0.0001327719797524075, "loss": 0.6342, "step": 210 }, { "epoch": 0.9173913043478261, "grad_norm": 1.3196384906768799, "learning_rate": 0.00013205411116710972, "loss": 0.6724, "step": 211 }, { "epoch": 0.9217391304347826, "grad_norm": 0.561631977558136, "learning_rate": 0.00013133439679848823, "loss": 0.6541, "step": 212 }, { "epoch": 0.9260869565217391, "grad_norm": 0.7715569734573364, "learning_rate": 0.00013061287809011242, "loss": 0.6419, "step": 213 }, { "epoch": 0.9304347826086956, "grad_norm": 0.8591257333755493, "learning_rate": 0.0001298895965894516, "loss": 0.6197, "step": 214 }, { "epoch": 0.9347826086956522, "grad_norm": 0.4229847192764282, "learning_rate": 0.0001291645939454825, "loss": 0.6472, "step": 215 }, { "epoch": 0.9391304347826087, "grad_norm": 0.7943733930587769, "learning_rate": 0.0001284379119062912, "loss": 0.6576, "step": 216 }, { "epoch": 0.9434782608695652, "grad_norm": 0.7454273104667664, "learning_rate": 0.0001277095923166689, "loss": 0.6245, "step": 217 }, { "epoch": 0.9478260869565217, "grad_norm": 0.4976602792739868, "learning_rate": 0.00012697967711570242, "loss": 0.644, "step": 218 }, { "epoch": 0.9521739130434783, "grad_norm": 0.6845293641090393, "learning_rate": 0.00012624820833435937, "loss": 0.6412, "step": 219 }, { "epoch": 0.9565217391304348, "grad_norm": 0.7265484929084778, "learning_rate": 0.0001255152280930676, "loss": 0.6438, "step": 220 }, { "epoch": 0.9608695652173913, "grad_norm": 0.4346272647380829, "learning_rate": 0.00012478077859929, "loss": 0.6116, "step": 221 }, { "epoch": 0.9652173913043478, "grad_norm": 0.5768253803253174, "learning_rate": 0.00012404490214509386, "loss": 0.6242, "step": 222 }, { "epoch": 0.9695652173913043, "grad_norm": 0.688556969165802, "learning_rate": 0.00012330764110471566, "loss": 0.6546, "step": 223 }, { "epoch": 0.9739130434782609, "grad_norm": 0.6147114634513855, "learning_rate": 0.00012256903793212107, "loss": 0.6286, "step": 224 }, { "epoch": 0.9782608695652174, "grad_norm": 0.6598117351531982, "learning_rate": 0.00012182913515856015, "loss": 0.65, "step": 225 }, { "epoch": 0.9826086956521739, "grad_norm": 0.6232290863990784, "learning_rate": 0.00012108797539011847, "loss": 0.6465, "step": 226 }, { "epoch": 0.9869565217391304, "grad_norm": 0.3764599561691284, "learning_rate": 0.0001203456013052634, "loss": 0.6397, "step": 227 }, { "epoch": 0.991304347826087, "grad_norm": 0.4177006781101227, "learning_rate": 0.00011960205565238684, "loss": 0.6324, "step": 228 }, { "epoch": 0.9956521739130435, "grad_norm": 0.6632861495018005, "learning_rate": 0.00011885738124734358, "loss": 0.6394, "step": 229 }, { "epoch": 1.0, "grad_norm": 0.8406037092208862, "learning_rate": 0.00011811162097098558, "loss": 0.6563, "step": 230 }, { "epoch": 1.0043478260869565, "grad_norm": 0.5756528973579407, "learning_rate": 0.00011736481776669306, "loss": 0.6087, "step": 231 }, { "epoch": 1.008695652173913, "grad_norm": 0.4710526466369629, "learning_rate": 0.00011661701463790142, "loss": 0.632, "step": 232 }, { "epoch": 1.0130434782608695, "grad_norm": 0.6560250520706177, "learning_rate": 0.00011586825464562514, "loss": 0.6305, "step": 233 }, { "epoch": 1.017391304347826, "grad_norm": 0.7808629870414734, "learning_rate": 0.0001151185809059781, "loss": 0.6374, "step": 234 }, { "epoch": 1.0217391304347827, "grad_norm": 0.7576888799667358, "learning_rate": 0.00011436803658769082, "loss": 0.6436, "step": 235 }, { "epoch": 1.0260869565217392, "grad_norm": 0.5998823046684265, "learning_rate": 0.00011361666490962468, "loss": 0.6245, "step": 236 }, { "epoch": 1.0304347826086957, "grad_norm": 0.4165184795856476, "learning_rate": 0.00011286450913828312, "loss": 0.6307, "step": 237 }, { "epoch": 1.0347826086956522, "grad_norm": 0.7513805031776428, "learning_rate": 0.00011211161258532041, "loss": 0.6327, "step": 238 }, { "epoch": 1.0391304347826087, "grad_norm": 0.6765947341918945, "learning_rate": 0.00011135801860504749, "loss": 0.6412, "step": 239 }, { "epoch": 1.0434782608695652, "grad_norm": 0.6011433005332947, "learning_rate": 0.00011060377059193547, "loss": 0.6521, "step": 240 }, { "epoch": 1.0478260869565217, "grad_norm": 0.4051482677459717, "learning_rate": 0.00010984891197811687, "loss": 0.6419, "step": 241 }, { "epoch": 1.0521739130434782, "grad_norm": 0.7668951153755188, "learning_rate": 0.0001090934862308847, "loss": 0.6264, "step": 242 }, { "epoch": 1.0565217391304347, "grad_norm": 0.6456301212310791, "learning_rate": 0.00010833753685018935, "loss": 0.6509, "step": 243 }, { "epoch": 1.0608695652173914, "grad_norm": 0.3792930841445923, "learning_rate": 0.00010758110736613385, "loss": 0.6132, "step": 244 }, { "epoch": 1.065217391304348, "grad_norm": 0.47149112820625305, "learning_rate": 0.0001068242413364671, "loss": 0.6346, "step": 245 }, { "epoch": 1.0695652173913044, "grad_norm": 0.5918865203857422, "learning_rate": 0.00010606698234407586, "loss": 0.6365, "step": 246 }, { "epoch": 1.0739130434782609, "grad_norm": 0.4131185710430145, "learning_rate": 0.00010530937399447496, "loss": 0.6262, "step": 247 }, { "epoch": 1.0782608695652174, "grad_norm": 0.6192177534103394, "learning_rate": 0.00010455145991329638, "loss": 0.6267, "step": 248 }, { "epoch": 1.0826086956521739, "grad_norm": 0.7231637835502625, "learning_rate": 0.00010379328374377715, "loss": 0.6524, "step": 249 }, { "epoch": 1.0869565217391304, "grad_norm": 0.5949220061302185, "learning_rate": 0.00010303488914424624, "loss": 0.6465, "step": 250 }, { "epoch": 1.0913043478260869, "grad_norm": 0.4544646441936493, "learning_rate": 0.00010227631978561056, "loss": 0.6383, "step": 251 }, { "epoch": 1.0956521739130434, "grad_norm": 0.3706333041191101, "learning_rate": 0.00010151761934884028, "loss": 0.6277, "step": 252 }, { "epoch": 1.1, "grad_norm": 0.4875901937484741, "learning_rate": 0.00010075883152245334, "loss": 0.6119, "step": 253 }, { "epoch": 1.1043478260869566, "grad_norm": 0.3684738576412201, "learning_rate": 0.0001, "loss": 0.6264, "step": 254 }, { "epoch": 1.108695652173913, "grad_norm": 0.42785608768463135, "learning_rate": 9.92411684775467e-05, "loss": 0.6272, "step": 255 }, { "epoch": 1.1130434782608696, "grad_norm": 0.3924098014831543, "learning_rate": 9.848238065115975e-05, "loss": 0.65, "step": 256 }, { "epoch": 1.117391304347826, "grad_norm": 0.3814132809638977, "learning_rate": 9.772368021438943e-05, "loss": 0.6103, "step": 257 }, { "epoch": 1.1217391304347826, "grad_norm": 0.3904854655265808, "learning_rate": 9.696511085575377e-05, "loss": 0.6422, "step": 258 }, { "epoch": 1.126086956521739, "grad_norm": 0.27673909068107605, "learning_rate": 9.620671625622288e-05, "loss": 0.637, "step": 259 }, { "epoch": 1.1304347826086956, "grad_norm": 0.3034502863883972, "learning_rate": 9.544854008670367e-05, "loss": 0.6286, "step": 260 }, { "epoch": 1.134782608695652, "grad_norm": 0.3358616828918457, "learning_rate": 9.469062600552509e-05, "loss": 0.6427, "step": 261 }, { "epoch": 1.1391304347826088, "grad_norm": 0.245226189494133, "learning_rate": 9.393301765592415e-05, "loss": 0.6269, "step": 262 }, { "epoch": 1.1434782608695653, "grad_norm": 0.3370216488838196, "learning_rate": 9.317575866353292e-05, "loss": 0.6093, "step": 263 }, { "epoch": 1.1478260869565218, "grad_norm": 0.27349230647087097, "learning_rate": 9.241889263386618e-05, "loss": 0.6495, "step": 264 }, { "epoch": 1.1521739130434783, "grad_norm": 0.3180980086326599, "learning_rate": 9.166246314981066e-05, "loss": 0.6336, "step": 265 }, { "epoch": 1.1565217391304348, "grad_norm": 0.32506948709487915, "learning_rate": 9.09065137691153e-05, "loss": 0.6264, "step": 266 }, { "epoch": 1.1608695652173913, "grad_norm": 0.30634960532188416, "learning_rate": 9.015108802188313e-05, "loss": 0.6269, "step": 267 }, { "epoch": 1.1652173913043478, "grad_norm": 0.36996349692344666, "learning_rate": 8.939622940806455e-05, "loss": 0.6454, "step": 268 }, { "epoch": 1.1695652173913043, "grad_norm": 0.2220568060874939, "learning_rate": 8.86419813949525e-05, "loss": 0.6106, "step": 269 }, { "epoch": 1.1739130434782608, "grad_norm": 0.26350730657577515, "learning_rate": 8.788838741467962e-05, "loss": 0.5946, "step": 270 }, { "epoch": 1.1782608695652175, "grad_norm": 0.22457756102085114, "learning_rate": 8.713549086171691e-05, "loss": 0.617, "step": 271 }, { "epoch": 1.182608695652174, "grad_norm": 0.28281551599502563, "learning_rate": 8.638333509037536e-05, "loss": 0.618, "step": 272 }, { "epoch": 1.1869565217391305, "grad_norm": 0.3269217908382416, "learning_rate": 8.563196341230919e-05, "loss": 0.6261, "step": 273 }, { "epoch": 1.191304347826087, "grad_norm": 0.2878088355064392, "learning_rate": 8.488141909402191e-05, "loss": 0.6104, "step": 274 }, { "epoch": 1.1956521739130435, "grad_norm": 0.23815755546092987, "learning_rate": 8.413174535437487e-05, "loss": 0.6325, "step": 275 }, { "epoch": 1.2, "grad_norm": 0.2873881459236145, "learning_rate": 8.33829853620986e-05, "loss": 0.6353, "step": 276 }, { "epoch": 1.2043478260869565, "grad_norm": 0.24071787297725677, "learning_rate": 8.263518223330697e-05, "loss": 0.621, "step": 277 }, { "epoch": 1.208695652173913, "grad_norm": 0.2793138921260834, "learning_rate": 8.188837902901442e-05, "loss": 0.6279, "step": 278 }, { "epoch": 1.2130434782608694, "grad_norm": 0.4142613112926483, "learning_rate": 8.114261875265643e-05, "loss": 0.6164, "step": 279 }, { "epoch": 1.2173913043478262, "grad_norm": 0.3821130692958832, "learning_rate": 8.039794434761318e-05, "loss": 0.6191, "step": 280 }, { "epoch": 1.2217391304347827, "grad_norm": 0.24204838275909424, "learning_rate": 7.965439869473664e-05, "loss": 0.6149, "step": 281 }, { "epoch": 1.2260869565217392, "grad_norm": 0.2838297188282013, "learning_rate": 7.891202460988158e-05, "loss": 0.6478, "step": 282 }, { "epoch": 1.2304347826086957, "grad_norm": 0.35510286688804626, "learning_rate": 7.817086484143986e-05, "loss": 0.6418, "step": 283 }, { "epoch": 1.2347826086956522, "grad_norm": 0.37343931198120117, "learning_rate": 7.743096206787894e-05, "loss": 0.6126, "step": 284 }, { "epoch": 1.2391304347826086, "grad_norm": 0.264636754989624, "learning_rate": 7.669235889528436e-05, "loss": 0.6231, "step": 285 }, { "epoch": 1.2434782608695651, "grad_norm": 0.2996392548084259, "learning_rate": 7.595509785490617e-05, "loss": 0.6343, "step": 286 }, { "epoch": 1.2478260869565219, "grad_norm": 0.38490474224090576, "learning_rate": 7.521922140071002e-05, "loss": 0.64, "step": 287 }, { "epoch": 1.2521739130434781, "grad_norm": 0.297821044921875, "learning_rate": 7.448477190693238e-05, "loss": 0.6197, "step": 288 }, { "epoch": 1.2565217391304349, "grad_norm": 0.2390124499797821, "learning_rate": 7.375179166564063e-05, "loss": 0.6283, "step": 289 }, { "epoch": 1.2608695652173914, "grad_norm": 0.27224546670913696, "learning_rate": 7.302032288429756e-05, "loss": 0.6197, "step": 290 }, { "epoch": 1.2652173913043478, "grad_norm": 0.3059544563293457, "learning_rate": 7.229040768333115e-05, "loss": 0.6281, "step": 291 }, { "epoch": 1.2695652173913043, "grad_norm": 0.23116622865200043, "learning_rate": 7.156208809370883e-05, "loss": 0.6235, "step": 292 }, { "epoch": 1.2739130434782608, "grad_norm": 0.28964969515800476, "learning_rate": 7.08354060545175e-05, "loss": 0.6032, "step": 293 }, { "epoch": 1.2782608695652173, "grad_norm": 0.23806549608707428, "learning_rate": 7.011040341054845e-05, "loss": 0.6113, "step": 294 }, { "epoch": 1.2826086956521738, "grad_norm": 0.23752138018608093, "learning_rate": 6.93871219098876e-05, "loss": 0.6359, "step": 295 }, { "epoch": 1.2869565217391306, "grad_norm": 0.23858202993869781, "learning_rate": 6.866560320151179e-05, "loss": 0.6207, "step": 296 }, { "epoch": 1.2913043478260868, "grad_norm": 0.28167223930358887, "learning_rate": 6.79458888328903e-05, "loss": 0.6055, "step": 297 }, { "epoch": 1.2956521739130435, "grad_norm": 0.24841302633285522, "learning_rate": 6.722802024759252e-05, "loss": 0.62, "step": 298 }, { "epoch": 1.3, "grad_norm": 0.2754172682762146, "learning_rate": 6.651203878290139e-05, "loss": 0.6177, "step": 299 }, { "epoch": 1.3043478260869565, "grad_norm": 0.3091265857219696, "learning_rate": 6.579798566743314e-05, "loss": 0.6295, "step": 300 }, { "epoch": 1.308695652173913, "grad_norm": 0.2213958352804184, "learning_rate": 6.508590201876317e-05, "loss": 0.6017, "step": 301 }, { "epoch": 1.3130434782608695, "grad_norm": 0.30063438415527344, "learning_rate": 6.437582884105835e-05, "loss": 0.6455, "step": 302 }, { "epoch": 1.317391304347826, "grad_norm": 0.3458832800388336, "learning_rate": 6.366780702271589e-05, "loss": 0.6326, "step": 303 }, { "epoch": 1.3217391304347825, "grad_norm": 0.24162618815898895, "learning_rate": 6.29618773340089e-05, "loss": 0.6155, "step": 304 }, { "epoch": 1.3260869565217392, "grad_norm": 0.32919561862945557, "learning_rate": 6.225808042473858e-05, "loss": 0.6354, "step": 305 }, { "epoch": 1.3304347826086955, "grad_norm": 0.3182857036590576, "learning_rate": 6.155645682189351e-05, "loss": 0.6007, "step": 306 }, { "epoch": 1.3347826086956522, "grad_norm": 0.2895689308643341, "learning_rate": 6.085704692731609e-05, "loss": 0.6382, "step": 307 }, { "epoch": 1.3391304347826087, "grad_norm": 0.2976822555065155, "learning_rate": 6.015989101537586e-05, "loss": 0.5889, "step": 308 }, { "epoch": 1.3434782608695652, "grad_norm": 0.2826453745365143, "learning_rate": 5.9465029230650534e-05, "loss": 0.6329, "step": 309 }, { "epoch": 1.3478260869565217, "grad_norm": 0.3266195058822632, "learning_rate": 5.877250158561425e-05, "loss": 0.6359, "step": 310 }, { "epoch": 1.3521739130434782, "grad_norm": 0.2917690873146057, "learning_rate": 5.8082347958333625e-05, "loss": 0.6301, "step": 311 }, { "epoch": 1.3565217391304347, "grad_norm": 0.25518476963043213, "learning_rate": 5.73946080901712e-05, "loss": 0.5977, "step": 312 }, { "epoch": 1.3608695652173912, "grad_norm": 0.2891990542411804, "learning_rate": 5.670932158349731e-05, "loss": 0.6394, "step": 313 }, { "epoch": 1.365217391304348, "grad_norm": 0.24494178593158722, "learning_rate": 5.602652789940941e-05, "loss": 0.6104, "step": 314 }, { "epoch": 1.3695652173913042, "grad_norm": 0.2637173533439636, "learning_rate": 5.5346266355459995e-05, "loss": 0.6252, "step": 315 }, { "epoch": 1.373913043478261, "grad_norm": 0.30065053701400757, "learning_rate": 5.466857612339229e-05, "loss": 0.638, "step": 316 }, { "epoch": 1.3782608695652174, "grad_norm": 0.2335294783115387, "learning_rate": 5.399349622688479e-05, "loss": 0.6433, "step": 317 }, { "epoch": 1.382608695652174, "grad_norm": 0.28032875061035156, "learning_rate": 5.332106553930414e-05, "loss": 0.6301, "step": 318 }, { "epoch": 1.3869565217391304, "grad_norm": 0.3166956603527069, "learning_rate": 5.26513227814666e-05, "loss": 0.63, "step": 319 }, { "epoch": 1.391304347826087, "grad_norm": 0.24381506443023682, "learning_rate": 5.1984306519408456e-05, "loss": 0.5954, "step": 320 }, { "epoch": 1.3956521739130434, "grad_norm": 0.38320308923721313, "learning_rate": 5.1320055162165115e-05, "loss": 0.6464, "step": 321 }, { "epoch": 1.4, "grad_norm": 0.30799657106399536, "learning_rate": 5.065860695955971e-05, "loss": 0.6356, "step": 322 }, { "epoch": 1.4043478260869566, "grad_norm": 0.5910504460334778, "learning_rate": 5.000000000000002e-05, "loss": 0.6058, "step": 323 }, { "epoch": 1.4086956521739131, "grad_norm": 0.35114794969558716, "learning_rate": 4.934427220828571e-05, "loss": 0.6172, "step": 324 }, { "epoch": 1.4130434782608696, "grad_norm": 0.27586087584495544, "learning_rate": 4.869146134342426e-05, "loss": 0.6249, "step": 325 }, { "epoch": 1.4173913043478261, "grad_norm": 0.2799661457538605, "learning_rate": 4.804160499645667e-05, "loss": 0.6144, "step": 326 }, { "epoch": 1.4217391304347826, "grad_norm": 0.29662200808525085, "learning_rate": 4.739474058829289e-05, "loss": 0.6042, "step": 327 }, { "epoch": 1.4260869565217391, "grad_norm": 0.26697418093681335, "learning_rate": 4.675090536755705e-05, "loss": 0.6164, "step": 328 }, { "epoch": 1.4304347826086956, "grad_norm": 0.2900712490081787, "learning_rate": 4.611013640844245e-05, "loss": 0.6298, "step": 329 }, { "epoch": 1.434782608695652, "grad_norm": 0.26294711232185364, "learning_rate": 4.547247060857675e-05, "loss": 0.6363, "step": 330 }, { "epoch": 1.4391304347826086, "grad_norm": 0.22578993439674377, "learning_rate": 4.483794468689728e-05, "loss": 0.615, "step": 331 }, { "epoch": 1.4434782608695653, "grad_norm": 0.2953335642814636, "learning_rate": 4.420659518153667e-05, "loss": 0.6278, "step": 332 }, { "epoch": 1.4478260869565218, "grad_norm": 0.27388015389442444, "learning_rate": 4.357845844771881e-05, "loss": 0.6222, "step": 333 }, { "epoch": 1.4521739130434783, "grad_norm": 0.23460474610328674, "learning_rate": 4.295357065566543e-05, "loss": 0.6313, "step": 334 }, { "epoch": 1.4565217391304348, "grad_norm": 0.3273029923439026, "learning_rate": 4.2331967788513295e-05, "loss": 0.6038, "step": 335 }, { "epoch": 1.4608695652173913, "grad_norm": 0.27219584584236145, "learning_rate": 4.1713685640242165e-05, "loss": 0.6285, "step": 336 }, { "epoch": 1.4652173913043478, "grad_norm": 0.3417372703552246, "learning_rate": 4.109875981361363e-05, "loss": 0.6361, "step": 337 }, { "epoch": 1.4695652173913043, "grad_norm": 0.26904726028442383, "learning_rate": 4.048722571812105e-05, "loss": 0.6143, "step": 338 }, { "epoch": 1.4739130434782608, "grad_norm": 0.3139411211013794, "learning_rate": 3.987911856795047e-05, "loss": 0.6209, "step": 339 }, { "epoch": 1.4782608695652173, "grad_norm": 0.2561061978340149, "learning_rate": 3.927447337995304e-05, "loss": 0.614, "step": 340 }, { "epoch": 1.482608695652174, "grad_norm": 0.2412693351507187, "learning_rate": 3.8673324971628357e-05, "loss": 0.6154, "step": 341 }, { "epoch": 1.4869565217391305, "grad_norm": 0.20282143354415894, "learning_rate": 3.8075707959119846e-05, "loss": 0.6052, "step": 342 }, { "epoch": 1.491304347826087, "grad_norm": 0.25691401958465576, "learning_rate": 3.7481656755221125e-05, "loss": 0.6081, "step": 343 }, { "epoch": 1.4956521739130435, "grad_norm": 0.26478344202041626, "learning_rate": 3.689120556739475e-05, "loss": 0.6191, "step": 344 }, { "epoch": 1.5, "grad_norm": 0.21863462030887604, "learning_rate": 3.630438839580217e-05, "loss": 0.6319, "step": 345 }, { "epoch": 1.5043478260869565, "grad_norm": 0.22426074743270874, "learning_rate": 3.5721239031346066e-05, "loss": 0.6178, "step": 346 }, { "epoch": 1.508695652173913, "grad_norm": 0.21831241250038147, "learning_rate": 3.5141791053724405e-05, "loss": 0.6303, "step": 347 }, { "epoch": 1.5130434782608697, "grad_norm": 0.21894365549087524, "learning_rate": 3.456607782949689e-05, "loss": 0.6044, "step": 348 }, { "epoch": 1.517391304347826, "grad_norm": 0.25870388746261597, "learning_rate": 3.399413251016359e-05, "loss": 0.6181, "step": 349 }, { "epoch": 1.5217391304347827, "grad_norm": 0.2224799394607544, "learning_rate": 3.342598803025595e-05, "loss": 0.6028, "step": 350 }, { "epoch": 1.526086956521739, "grad_norm": 0.25613147020339966, "learning_rate": 3.2861677105440336e-05, "loss": 0.5982, "step": 351 }, { "epoch": 1.5304347826086957, "grad_norm": 0.24120664596557617, "learning_rate": 3.2301232230634104e-05, "loss": 0.6159, "step": 352 }, { "epoch": 1.5347826086956522, "grad_norm": 0.22223018109798431, "learning_rate": 3.174468567813461e-05, "loss": 0.6113, "step": 353 }, { "epoch": 1.5391304347826087, "grad_norm": 0.25855186581611633, "learning_rate": 3.119206949576052e-05, "loss": 0.6315, "step": 354 }, { "epoch": 1.5434782608695652, "grad_norm": 0.21754255890846252, "learning_rate": 3.0643415505006735e-05, "loss": 0.6202, "step": 355 }, { "epoch": 1.5478260869565217, "grad_norm": 0.2552974224090576, "learning_rate": 3.009875529921181e-05, "loss": 0.6195, "step": 356 }, { "epoch": 1.5521739130434784, "grad_norm": 0.19017580151557922, "learning_rate": 2.9558120241738784e-05, "loss": 0.5913, "step": 357 }, { "epoch": 1.5565217391304347, "grad_norm": 0.2394818663597107, "learning_rate": 2.90215414641691e-05, "loss": 0.6091, "step": 358 }, { "epoch": 1.5608695652173914, "grad_norm": 0.29089704155921936, "learning_rate": 2.8489049864510054e-05, "loss": 0.639, "step": 359 }, { "epoch": 1.5652173913043477, "grad_norm": 0.21274344623088837, "learning_rate": 2.7960676105415472e-05, "loss": 0.6182, "step": 360 }, { "epoch": 1.5695652173913044, "grad_norm": 0.2907910645008087, "learning_rate": 2.7436450612420095e-05, "loss": 0.6111, "step": 361 }, { "epoch": 1.5739130434782609, "grad_norm": 0.19825702905654907, "learning_rate": 2.691640357218759e-05, "loss": 0.6175, "step": 362 }, { "epoch": 1.5782608695652174, "grad_norm": 0.2254411280155182, "learning_rate": 2.640056493077231e-05, "loss": 0.5938, "step": 363 }, { "epoch": 1.5826086956521739, "grad_norm": 0.2379319965839386, "learning_rate": 2.5888964391894766e-05, "loss": 0.6105, "step": 364 }, { "epoch": 1.5869565217391304, "grad_norm": 0.1910097748041153, "learning_rate": 2.5381631415231454e-05, "loss": 0.6088, "step": 365 }, { "epoch": 1.591304347826087, "grad_norm": 0.2428610920906067, "learning_rate": 2.4878595214718236e-05, "loss": 0.5999, "step": 366 }, { "epoch": 1.5956521739130434, "grad_norm": 0.21029497683048248, "learning_rate": 2.4379884756868167e-05, "loss": 0.617, "step": 367 }, { "epoch": 1.6, "grad_norm": 0.22931547462940216, "learning_rate": 2.3885528759103538e-05, "loss": 0.5912, "step": 368 }, { "epoch": 1.6043478260869564, "grad_norm": 0.19771426916122437, "learning_rate": 2.339555568810221e-05, "loss": 0.58, "step": 369 }, { "epoch": 1.608695652173913, "grad_norm": 0.25638020038604736, "learning_rate": 2.2909993758158412e-05, "loss": 0.6217, "step": 370 }, { "epoch": 1.6130434782608696, "grad_norm": 0.2538512945175171, "learning_rate": 2.242887092955801e-05, "loss": 0.6372, "step": 371 }, { "epoch": 1.617391304347826, "grad_norm": 0.19667693972587585, "learning_rate": 2.1952214906968627e-05, "loss": 0.5964, "step": 372 }, { "epoch": 1.6217391304347826, "grad_norm": 0.2506635785102844, "learning_rate": 2.1480053137844115e-05, "loss": 0.6196, "step": 373 }, { "epoch": 1.626086956521739, "grad_norm": 0.19938236474990845, "learning_rate": 2.101241281084416e-05, "loss": 0.6214, "step": 374 }, { "epoch": 1.6304347826086958, "grad_norm": 0.20109523832798004, "learning_rate": 2.054932085426856e-05, "loss": 0.6072, "step": 375 }, { "epoch": 1.634782608695652, "grad_norm": 0.21539539098739624, "learning_rate": 2.0090803934506764e-05, "loss": 0.6062, "step": 376 }, { "epoch": 1.6391304347826088, "grad_norm": 0.21811442077159882, "learning_rate": 1.9636888454502178e-05, "loss": 0.6093, "step": 377 }, { "epoch": 1.643478260869565, "grad_norm": 0.1716393679380417, "learning_rate": 1.9187600552231955e-05, "loss": 0.5951, "step": 378 }, { "epoch": 1.6478260869565218, "grad_norm": 0.22683413326740265, "learning_rate": 1.8742966099201697e-05, "loss": 0.6119, "step": 379 }, { "epoch": 1.6521739130434783, "grad_norm": 0.213649183511734, "learning_rate": 1.8303010698955804e-05, "loss": 0.6013, "step": 380 }, { "epoch": 1.6565217391304348, "grad_norm": 0.1564229130744934, "learning_rate": 1.7867759685603114e-05, "loss": 0.6083, "step": 381 }, { "epoch": 1.6608695652173913, "grad_norm": 0.2164810448884964, "learning_rate": 1.7437238122358057e-05, "loss": 0.6306, "step": 382 }, { "epoch": 1.6652173913043478, "grad_norm": 0.18622331321239471, "learning_rate": 1.7011470800097496e-05, "loss": 0.5776, "step": 383 }, { "epoch": 1.6695652173913045, "grad_norm": 0.1888633519411087, "learning_rate": 1.659048223593308e-05, "loss": 0.6242, "step": 384 }, { "epoch": 1.6739130434782608, "grad_norm": 0.16813671588897705, "learning_rate": 1.6174296671799572e-05, "loss": 0.5759, "step": 385 }, { "epoch": 1.6782608695652175, "grad_norm": 0.20108157396316528, "learning_rate": 1.5762938073058853e-05, "loss": 0.6151, "step": 386 }, { "epoch": 1.6826086956521737, "grad_norm": 0.17089848220348358, "learning_rate": 1.5356430127119913e-05, "loss": 0.5993, "step": 387 }, { "epoch": 1.6869565217391305, "grad_norm": 0.15591366589069366, "learning_rate": 1.4954796242074898e-05, "loss": 0.614, "step": 388 }, { "epoch": 1.691304347826087, "grad_norm": 0.16411159932613373, "learning_rate": 1.4558059545351143e-05, "loss": 0.6023, "step": 389 }, { "epoch": 1.6956521739130435, "grad_norm": 0.17830075323581696, "learning_rate": 1.4166242882379476e-05, "loss": 0.6031, "step": 390 }, { "epoch": 1.7, "grad_norm": 0.15914571285247803, "learning_rate": 1.3779368815278647e-05, "loss": 0.6173, "step": 391 }, { "epoch": 1.7043478260869565, "grad_norm": 0.16238906979560852, "learning_rate": 1.339745962155613e-05, "loss": 0.6085, "step": 392 }, { "epoch": 1.7086956521739132, "grad_norm": 0.1836780607700348, "learning_rate": 1.302053729282533e-05, "loss": 0.6147, "step": 393 }, { "epoch": 1.7130434782608694, "grad_norm": 0.17346879839897156, "learning_rate": 1.2648623533539261e-05, "loss": 0.5877, "step": 394 }, { "epoch": 1.7173913043478262, "grad_norm": 0.17038564383983612, "learning_rate": 1.2281739759740574e-05, "loss": 0.5981, "step": 395 }, { "epoch": 1.7217391304347827, "grad_norm": 0.23689156770706177, "learning_rate": 1.1919907097828653e-05, "loss": 0.6318, "step": 396 }, { "epoch": 1.7260869565217392, "grad_norm": 0.1710849106311798, "learning_rate": 1.1563146383342772e-05, "loss": 0.6007, "step": 397 }, { "epoch": 1.7304347826086957, "grad_norm": 0.150990828871727, "learning_rate": 1.1211478159762478e-05, "loss": 0.5942, "step": 398 }, { "epoch": 1.7347826086956522, "grad_norm": 0.17545339465141296, "learning_rate": 1.0864922677324618e-05, "loss": 0.6205, "step": 399 }, { "epoch": 1.7391304347826086, "grad_norm": 0.15553072094917297, "learning_rate": 1.0523499891857225e-05, "loss": 0.5996, "step": 400 }, { "epoch": 1.7434782608695651, "grad_norm": 0.16285602748394012, "learning_rate": 1.01872294636304e-05, "loss": 0.5937, "step": 401 }, { "epoch": 1.7478260869565219, "grad_norm": 0.16755890846252441, "learning_rate": 9.856130756224213e-06, "loss": 0.6023, "step": 402 }, { "epoch": 1.7521739130434781, "grad_norm": 0.141220822930336, "learning_rate": 9.530222835413738e-06, "loss": 0.6181, "step": 403 }, { "epoch": 1.7565217391304349, "grad_norm": 0.1646806001663208, "learning_rate": 9.209524468071096e-06, "loss": 0.615, "step": 404 }, { "epoch": 1.7608695652173914, "grad_norm": 0.1562896966934204, "learning_rate": 8.894054121084838e-06, "loss": 0.6002, "step": 405 }, { "epoch": 1.7652173913043478, "grad_norm": 0.15589256584644318, "learning_rate": 8.58382996029652e-06, "loss": 0.6098, "step": 406 }, { "epoch": 1.7695652173913043, "grad_norm": 0.14259637892246246, "learning_rate": 8.278869849454718e-06, "loss": 0.6032, "step": 407 }, { "epoch": 1.7739130434782608, "grad_norm": 0.1473761945962906, "learning_rate": 7.97919134918632e-06, "loss": 0.6265, "step": 408 }, { "epoch": 1.7782608695652173, "grad_norm": 0.1384039968252182, "learning_rate": 7.684811715985429e-06, "loss": 0.617, "step": 409 }, { "epoch": 1.7826086956521738, "grad_norm": 0.16400642693042755, "learning_rate": 7.395747901219474e-06, "loss": 0.632, "step": 410 }, { "epoch": 1.7869565217391306, "grad_norm": 0.154324010014534, "learning_rate": 7.1120165501533e-06, "loss": 0.6039, "step": 411 }, { "epoch": 1.7913043478260868, "grad_norm": 0.15824785828590393, "learning_rate": 6.833634000990541e-06, "loss": 0.5953, "step": 412 }, { "epoch": 1.7956521739130435, "grad_norm": 0.14589077234268188, "learning_rate": 6.560616283932897e-06, "loss": 0.6049, "step": 413 }, { "epoch": 1.8, "grad_norm": 0.1429314911365509, "learning_rate": 6.292979120256992e-06, "loss": 0.6068, "step": 414 }, { "epoch": 1.8043478260869565, "grad_norm": 0.1345595121383667, "learning_rate": 6.030737921409169e-06, "loss": 0.6233, "step": 415 }, { "epoch": 1.808695652173913, "grad_norm": 0.14544036984443665, "learning_rate": 5.77390778811796e-06, "loss": 0.6229, "step": 416 }, { "epoch": 1.8130434782608695, "grad_norm": 0.14767540991306305, "learning_rate": 5.52250350952459e-06, "loss": 0.6289, "step": 417 }, { "epoch": 1.8173913043478263, "grad_norm": 0.14071600139141083, "learning_rate": 5.276539562331384e-06, "loss": 0.6006, "step": 418 }, { "epoch": 1.8217391304347825, "grad_norm": 0.15143164992332458, "learning_rate": 5.036030109968082e-06, "loss": 0.6162, "step": 419 }, { "epoch": 1.8260869565217392, "grad_norm": 0.1341582089662552, "learning_rate": 4.800989001776324e-06, "loss": 0.6015, "step": 420 }, { "epoch": 1.8304347826086955, "grad_norm": 0.13631945848464966, "learning_rate": 4.5714297722121106e-06, "loss": 0.6197, "step": 421 }, { "epoch": 1.8347826086956522, "grad_norm": 0.15245981514453888, "learning_rate": 4.347365640066525e-06, "loss": 0.6225, "step": 422 }, { "epoch": 1.8391304347826087, "grad_norm": 0.14014090597629547, "learning_rate": 4.128809507704445e-06, "loss": 0.5993, "step": 423 }, { "epoch": 1.8434782608695652, "grad_norm": 0.13918210566043854, "learning_rate": 3.915773960321634e-06, "loss": 0.6243, "step": 424 }, { "epoch": 1.8478260869565217, "grad_norm": 0.13100025057792664, "learning_rate": 3.7082712652200867e-06, "loss": 0.6074, "step": 425 }, { "epoch": 1.8521739130434782, "grad_norm": 0.15842288732528687, "learning_rate": 3.5063133711014882e-06, "loss": 0.6097, "step": 426 }, { "epoch": 1.856521739130435, "grad_norm": 0.1331150382757187, "learning_rate": 3.3099119073793928e-06, "loss": 0.6021, "step": 427 }, { "epoch": 1.8608695652173912, "grad_norm": 0.13728439807891846, "learning_rate": 3.119078183509372e-06, "loss": 0.6015, "step": 428 }, { "epoch": 1.865217391304348, "grad_norm": 0.1390867531299591, "learning_rate": 2.9338231883378366e-06, "loss": 0.603, "step": 429 }, { "epoch": 1.8695652173913042, "grad_norm": 0.15344412624835968, "learning_rate": 2.7541575894693194e-06, "loss": 0.6045, "step": 430 }, { "epoch": 1.873913043478261, "grad_norm": 0.14015014469623566, "learning_rate": 2.580091732652101e-06, "loss": 0.6161, "step": 431 }, { "epoch": 1.8782608695652174, "grad_norm": 0.15435759723186493, "learning_rate": 2.4116356411825525e-06, "loss": 0.6281, "step": 432 }, { "epoch": 1.882608695652174, "grad_norm": 0.14000248908996582, "learning_rate": 2.248799015327907e-06, "loss": 0.6358, "step": 433 }, { "epoch": 1.8869565217391304, "grad_norm": 0.14310961961746216, "learning_rate": 2.091591231767709e-06, "loss": 0.63, "step": 434 }, { "epoch": 1.891304347826087, "grad_norm": 0.14115604758262634, "learning_rate": 1.9400213430538773e-06, "loss": 0.637, "step": 435 }, { "epoch": 1.8956521739130436, "grad_norm": 0.1358548104763031, "learning_rate": 1.7940980770894122e-06, "loss": 0.6241, "step": 436 }, { "epoch": 1.9, "grad_norm": 0.13970565795898438, "learning_rate": 1.6538298366257976e-06, "loss": 0.6118, "step": 437 }, { "epoch": 1.9043478260869566, "grad_norm": 0.1373710334300995, "learning_rate": 1.5192246987791981e-06, "loss": 0.6121, "step": 438 }, { "epoch": 1.908695652173913, "grad_norm": 0.13853000104427338, "learning_rate": 1.3902904145653096e-06, "loss": 0.6433, "step": 439 }, { "epoch": 1.9130434782608696, "grad_norm": 0.16299930214881897, "learning_rate": 1.2670344084530383e-06, "loss": 0.6114, "step": 440 }, { "epoch": 1.9173913043478261, "grad_norm": 0.15240226686000824, "learning_rate": 1.1494637779369766e-06, "loss": 0.6392, "step": 441 }, { "epoch": 1.9217391304347826, "grad_norm": 0.12335983663797379, "learning_rate": 1.0375852931286956e-06, "loss": 0.6224, "step": 442 }, { "epoch": 1.9260869565217391, "grad_norm": 0.1319676637649536, "learning_rate": 9.314053963669245e-07, "loss": 0.618, "step": 443 }, { "epoch": 1.9304347826086956, "grad_norm": 0.11884966492652893, "learning_rate": 8.309302018465581e-07, "loss": 0.5911, "step": 444 }, { "epoch": 1.9347826086956523, "grad_norm": 0.1367439180612564, "learning_rate": 7.361654952665609e-07, "loss": 0.6186, "step": 445 }, { "epoch": 1.9391304347826086, "grad_norm": 0.12649576365947723, "learning_rate": 6.471167334968886e-07, "loss": 0.6249, "step": 446 }, { "epoch": 1.9434782608695653, "grad_norm": 0.1303686797618866, "learning_rate": 5.637890442641402e-07, "loss": 0.5979, "step": 447 }, { "epoch": 1.9478260869565216, "grad_norm": 0.1282416582107544, "learning_rate": 4.861872258564049e-07, "loss": 0.6195, "step": 448 }, { "epoch": 1.9521739130434783, "grad_norm": 0.12191151827573776, "learning_rate": 4.143157468468717e-07, "loss": 0.6097, "step": 449 }, { "epoch": 1.9565217391304348, "grad_norm": 0.11873479187488556, "learning_rate": 3.481787458365915e-07, "loss": 0.6102, "step": 450 }, { "epoch": 1.9608695652173913, "grad_norm": 0.1280432939529419, "learning_rate": 2.877800312160783e-07, "loss": 0.5857, "step": 451 }, { "epoch": 1.9652173913043478, "grad_norm": 0.12122710049152374, "learning_rate": 2.3312308094607382e-07, "loss": 0.5989, "step": 452 }, { "epoch": 1.9695652173913043, "grad_norm": 0.12293203175067902, "learning_rate": 1.8421104235727405e-07, "loss": 0.6275, "step": 453 }, { "epoch": 1.973913043478261, "grad_norm": 0.1292450726032257, "learning_rate": 1.4104673196903005e-07, "loss": 0.6009, "step": 454 }, { "epoch": 1.9782608695652173, "grad_norm": 0.1343158781528473, "learning_rate": 1.0363263532724432e-07, "loss": 0.6207, "step": 455 }, { "epoch": 1.982608695652174, "grad_norm": 0.12888604402542114, "learning_rate": 7.197090686119623e-08, "loss": 0.6183, "step": 456 }, { "epoch": 1.9869565217391303, "grad_norm": 0.12688706815242767, "learning_rate": 4.606336975948589e-08, "loss": 0.6183, "step": 457 }, { "epoch": 1.991304347826087, "grad_norm": 0.1274312287569046, "learning_rate": 2.5911515865084667e-08, "loss": 0.6094, "step": 458 }, { "epoch": 1.9956521739130435, "grad_norm": 0.13021548092365265, "learning_rate": 1.1516505589381776e-08, "loss": 0.6078, "step": 459 }, { "epoch": 2.0, "grad_norm": 0.12913620471954346, "learning_rate": 2.8791678453821135e-09, "loss": 0.6241, "step": 460 } ], "logging_steps": 1, "max_steps": 460, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7941728521827123e+19, "train_batch_size": 24, "trial_name": null, "trial_params": null }