{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 378, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 4.596632480621338, "eval_runtime": 132.3152, "eval_samples_per_second": 147.043, "eval_steps_per_second": 1.149, "step": 0 }, { "epoch": 0.015873015873015872, "grad_norm": 0.26967182755470276, "learning_rate": 0.0001, "loss": 4.612, "step": 1 }, { "epoch": 0.031746031746031744, "grad_norm": 0.2898576855659485, "learning_rate": 0.0001, "loss": 4.4349, "step": 2 }, { "epoch": 0.047619047619047616, "grad_norm": 0.2901840806007385, "learning_rate": 0.0001, "loss": 4.5047, "step": 3 }, { "epoch": 0.06349206349206349, "grad_norm": 0.3495827615261078, "learning_rate": 0.0001, "loss": 4.4469, "step": 4 }, { "epoch": 0.07936507936507936, "grad_norm": 0.39650025963783264, "learning_rate": 0.0001, "loss": 4.526, "step": 5 }, { "epoch": 0.09523809523809523, "grad_norm": 0.4414258897304535, "learning_rate": 0.0001, "loss": 4.546, "step": 6 }, { "epoch": 0.1111111111111111, "grad_norm": 0.4898076057434082, "learning_rate": 0.0001, "loss": 4.4131, "step": 7 }, { "epoch": 0.12698412698412698, "grad_norm": 0.48589834570884705, "learning_rate": 0.0001, "loss": 4.3135, "step": 8 }, { "epoch": 0.14285714285714285, "grad_norm": 0.5929457545280457, "learning_rate": 0.0001, "loss": 4.3235, "step": 9 }, { "epoch": 0.15873015873015872, "grad_norm": 0.6119698286056519, "learning_rate": 0.0001, "loss": 4.3461, "step": 10 }, { "epoch": 0.1746031746031746, "grad_norm": 0.6312505006790161, "learning_rate": 0.0001, "loss": 4.214, "step": 11 }, { "epoch": 0.19047619047619047, "grad_norm": 0.6929273009300232, "learning_rate": 0.0001, "loss": 4.1427, "step": 12 }, { "epoch": 0.20634920634920634, "grad_norm": 0.7550596594810486, "learning_rate": 0.0001, "loss": 4.1668, "step": 13 }, { "epoch": 0.2222222222222222, "grad_norm": 0.6299394965171814, "learning_rate": 0.0001, "loss": 3.9331, "step": 14 }, { "epoch": 0.23809523809523808, "grad_norm": 0.692095160484314, "learning_rate": 0.0001, "loss": 3.9224, "step": 15 }, { "epoch": 0.25396825396825395, "grad_norm": 0.955380916595459, "learning_rate": 0.0001, "loss": 3.9012, "step": 16 }, { "epoch": 0.2698412698412698, "grad_norm": 0.8210205435752869, "learning_rate": 0.0001, "loss": 3.8245, "step": 17 }, { "epoch": 0.2857142857142857, "grad_norm": 0.6235060095787048, "learning_rate": 0.0001, "loss": 3.6529, "step": 18 }, { "epoch": 0.30158730158730157, "grad_norm": 0.5264109373092651, "learning_rate": 0.0001, "loss": 3.7113, "step": 19 }, { "epoch": 0.31746031746031744, "grad_norm": 0.7998248338699341, "learning_rate": 0.0001, "loss": 3.525, "step": 20 }, { "epoch": 0.3333333333333333, "grad_norm": 0.8079671859741211, "learning_rate": 0.0001, "loss": 3.5775, "step": 21 }, { "epoch": 0.3492063492063492, "grad_norm": 0.5628694295883179, "learning_rate": 0.0001, "loss": 3.5531, "step": 22 }, { "epoch": 0.36507936507936506, "grad_norm": 1.1438932418823242, "learning_rate": 0.0001, "loss": 3.5744, "step": 23 }, { "epoch": 0.38095238095238093, "grad_norm": 0.8720822930335999, "learning_rate": 0.0001, "loss": 3.5822, "step": 24 }, { "epoch": 0.3968253968253968, "grad_norm": 1.0421593189239502, "learning_rate": 0.0001, "loss": 3.5075, "step": 25 }, { "epoch": 0.4126984126984127, "grad_norm": 0.6795033812522888, "learning_rate": 0.0001, "loss": 3.4547, "step": 26 }, { "epoch": 0.42857142857142855, "grad_norm": 0.6736836433410645, "learning_rate": 0.0001, "loss": 3.4874, "step": 27 }, { "epoch": 0.4444444444444444, "grad_norm": 0.5057028532028198, "learning_rate": 0.0001, "loss": 3.4797, "step": 28 }, { "epoch": 0.4603174603174603, "grad_norm": 0.49980196356773376, "learning_rate": 0.0001, "loss": 3.3364, "step": 29 }, { "epoch": 0.47619047619047616, "grad_norm": 0.4449177384376526, "learning_rate": 0.0001, "loss": 3.2993, "step": 30 }, { "epoch": 0.49206349206349204, "grad_norm": 0.3823322057723999, "learning_rate": 0.0001, "loss": 3.3223, "step": 31 }, { "epoch": 0.5079365079365079, "grad_norm": 0.3255627453327179, "learning_rate": 0.0001, "loss": 3.3542, "step": 32 }, { "epoch": 0.5238095238095238, "grad_norm": 0.3236791491508484, "learning_rate": 0.0001, "loss": 3.2756, "step": 33 }, { "epoch": 0.5396825396825397, "grad_norm": 0.3119717240333557, "learning_rate": 0.0001, "loss": 3.1773, "step": 34 }, { "epoch": 0.5555555555555556, "grad_norm": 0.30881795287132263, "learning_rate": 0.0001, "loss": 3.2265, "step": 35 }, { "epoch": 0.5714285714285714, "grad_norm": 0.30378207564353943, "learning_rate": 0.0001, "loss": 3.0821, "step": 36 }, { "epoch": 0.5873015873015873, "grad_norm": 0.35712969303131104, "learning_rate": 0.0001, "loss": 3.2201, "step": 37 }, { "epoch": 0.6031746031746031, "grad_norm": 0.30827978253364563, "learning_rate": 0.0001, "loss": 3.135, "step": 38 }, { "epoch": 0.6190476190476191, "grad_norm": 0.3056108057498932, "learning_rate": 0.0001, "loss": 3.1543, "step": 39 }, { "epoch": 0.6349206349206349, "grad_norm": 0.3181809186935425, "learning_rate": 0.0001, "loss": 3.1409, "step": 40 }, { "epoch": 0.6507936507936508, "grad_norm": 0.32424548268318176, "learning_rate": 0.0001, "loss": 3.0388, "step": 41 }, { "epoch": 0.6666666666666666, "grad_norm": 0.3107524514198303, "learning_rate": 0.0001, "loss": 3.0717, "step": 42 }, { "epoch": 0.6825396825396826, "grad_norm": 0.30104756355285645, "learning_rate": 0.0001, "loss": 3.0217, "step": 43 }, { "epoch": 0.6984126984126984, "grad_norm": 0.2986432909965515, "learning_rate": 0.0001, "loss": 2.9845, "step": 44 }, { "epoch": 0.7142857142857143, "grad_norm": 0.3059774339199066, "learning_rate": 0.0001, "loss": 2.8926, "step": 45 }, { "epoch": 0.7301587301587301, "grad_norm": 0.3167428970336914, "learning_rate": 0.0001, "loss": 2.9162, "step": 46 }, { "epoch": 0.746031746031746, "grad_norm": 0.3148072361946106, "learning_rate": 0.0001, "loss": 3.0487, "step": 47 }, { "epoch": 0.7619047619047619, "grad_norm": 0.35918205976486206, "learning_rate": 0.0001, "loss": 3.037, "step": 48 }, { "epoch": 0.7777777777777778, "grad_norm": 0.3954469561576843, "learning_rate": 0.0001, "loss": 2.8684, "step": 49 }, { "epoch": 0.7936507936507936, "grad_norm": 0.33450567722320557, "learning_rate": 0.0001, "loss": 2.9808, "step": 50 }, { "epoch": 0.8095238095238095, "grad_norm": 0.35196173191070557, "learning_rate": 0.0001, "loss": 2.9072, "step": 51 }, { "epoch": 0.8253968253968254, "grad_norm": 0.3617725670337677, "learning_rate": 0.0001, "loss": 2.8938, "step": 52 }, { "epoch": 0.8412698412698413, "grad_norm": 0.377817302942276, "learning_rate": 0.0001, "loss": 2.8316, "step": 53 }, { "epoch": 0.8571428571428571, "grad_norm": 0.3597480058670044, "learning_rate": 0.0001, "loss": 2.8602, "step": 54 }, { "epoch": 0.873015873015873, "grad_norm": 0.37252452969551086, "learning_rate": 0.0001, "loss": 2.8926, "step": 55 }, { "epoch": 0.8888888888888888, "grad_norm": 0.36722978949546814, "learning_rate": 0.0001, "loss": 2.983, "step": 56 }, { "epoch": 0.9047619047619048, "grad_norm": 0.37402233481407166, "learning_rate": 0.0001, "loss": 2.8777, "step": 57 }, { "epoch": 0.9206349206349206, "grad_norm": 0.31507572531700134, "learning_rate": 0.0001, "loss": 2.9208, "step": 58 }, { "epoch": 0.9365079365079365, "grad_norm": 0.35631242394447327, "learning_rate": 0.0001, "loss": 2.8828, "step": 59 }, { "epoch": 0.9523809523809523, "grad_norm": 0.33693110942840576, "learning_rate": 0.0001, "loss": 2.8233, "step": 60 }, { "epoch": 0.9682539682539683, "grad_norm": 0.323428750038147, "learning_rate": 0.0001, "loss": 2.9311, "step": 61 }, { "epoch": 0.9841269841269841, "grad_norm": 0.3073025643825531, "learning_rate": 0.0001, "loss": 2.7934, "step": 62 }, { "epoch": 1.0, "grad_norm": 0.31816455721855164, "learning_rate": 0.0001, "loss": 2.8395, "step": 63 }, { "epoch": 1.0, "eval_loss": 2.8033106327056885, "eval_runtime": 131.4471, "eval_samples_per_second": 148.014, "eval_steps_per_second": 1.156, "step": 63 }, { "epoch": 1.0158730158730158, "grad_norm": 0.3218700587749481, "learning_rate": 0.0001, "loss": 2.6772, "step": 64 }, { "epoch": 1.0317460317460316, "grad_norm": 0.33239927887916565, "learning_rate": 0.0001, "loss": 2.8064, "step": 65 }, { "epoch": 1.0476190476190477, "grad_norm": 0.3222041726112366, "learning_rate": 0.0001, "loss": 2.7618, "step": 66 }, { "epoch": 1.0634920634920635, "grad_norm": 0.3218175172805786, "learning_rate": 0.0001, "loss": 2.823, "step": 67 }, { "epoch": 1.0793650793650793, "grad_norm": 0.3080641031265259, "learning_rate": 0.0001, "loss": 2.6865, "step": 68 }, { "epoch": 1.0952380952380953, "grad_norm": 0.31820255517959595, "learning_rate": 0.0001, "loss": 2.7326, "step": 69 }, { "epoch": 1.1111111111111112, "grad_norm": 0.3285914361476898, "learning_rate": 0.0001, "loss": 2.8175, "step": 70 }, { "epoch": 1.126984126984127, "grad_norm": 0.34610089659690857, "learning_rate": 0.0001, "loss": 2.642, "step": 71 }, { "epoch": 1.1428571428571428, "grad_norm": 0.3352147042751312, "learning_rate": 0.0001, "loss": 2.6517, "step": 72 }, { "epoch": 1.1587301587301586, "grad_norm": 0.3385539948940277, "learning_rate": 0.0001, "loss": 2.6797, "step": 73 }, { "epoch": 1.1746031746031746, "grad_norm": 0.3544485867023468, "learning_rate": 0.0001, "loss": 2.8208, "step": 74 }, { "epoch": 1.1904761904761905, "grad_norm": 0.35244956612586975, "learning_rate": 0.0001, "loss": 2.7275, "step": 75 }, { "epoch": 1.2063492063492063, "grad_norm": 0.3411391079425812, "learning_rate": 0.0001, "loss": 2.5638, "step": 76 }, { "epoch": 1.2222222222222223, "grad_norm": 0.358157753944397, "learning_rate": 0.0001, "loss": 2.6367, "step": 77 }, { "epoch": 1.2380952380952381, "grad_norm": 0.35242363810539246, "learning_rate": 0.0001, "loss": 2.6895, "step": 78 }, { "epoch": 1.253968253968254, "grad_norm": 0.3461464047431946, "learning_rate": 0.0001, "loss": 2.7364, "step": 79 }, { "epoch": 1.2698412698412698, "grad_norm": 0.33865585923194885, "learning_rate": 0.0001, "loss": 2.6523, "step": 80 }, { "epoch": 1.2857142857142856, "grad_norm": 0.3517827093601227, "learning_rate": 0.0001, "loss": 2.573, "step": 81 }, { "epoch": 1.3015873015873016, "grad_norm": 0.3442043960094452, "learning_rate": 0.0001, "loss": 2.6348, "step": 82 }, { "epoch": 1.3174603174603174, "grad_norm": 0.379457026720047, "learning_rate": 0.0001, "loss": 2.6806, "step": 83 }, { "epoch": 1.3333333333333333, "grad_norm": 0.3560670018196106, "learning_rate": 0.0001, "loss": 2.6734, "step": 84 }, { "epoch": 1.3492063492063493, "grad_norm": 0.3954298496246338, "learning_rate": 0.0001, "loss": 2.7085, "step": 85 }, { "epoch": 1.3650793650793651, "grad_norm": 0.3917439579963684, "learning_rate": 0.0001, "loss": 2.6951, "step": 86 }, { "epoch": 1.380952380952381, "grad_norm": 0.350482702255249, "learning_rate": 0.0001, "loss": 2.6786, "step": 87 }, { "epoch": 1.3968253968253967, "grad_norm": 0.3554580509662628, "learning_rate": 0.0001, "loss": 2.5309, "step": 88 }, { "epoch": 1.4126984126984126, "grad_norm": 0.3658367991447449, "learning_rate": 0.0001, "loss": 2.5796, "step": 89 }, { "epoch": 1.4285714285714286, "grad_norm": 0.354397177696228, "learning_rate": 0.0001, "loss": 2.6635, "step": 90 }, { "epoch": 1.4444444444444444, "grad_norm": 0.3648728132247925, "learning_rate": 0.0001, "loss": 2.6793, "step": 91 }, { "epoch": 1.4603174603174602, "grad_norm": 0.39242303371429443, "learning_rate": 0.0001, "loss": 2.6318, "step": 92 }, { "epoch": 1.4761904761904763, "grad_norm": 0.3823918104171753, "learning_rate": 0.0001, "loss": 2.509, "step": 93 }, { "epoch": 1.492063492063492, "grad_norm": 0.36895614862442017, "learning_rate": 0.0001, "loss": 2.5909, "step": 94 }, { "epoch": 1.507936507936508, "grad_norm": 0.3695090711116791, "learning_rate": 0.0001, "loss": 2.5639, "step": 95 }, { "epoch": 1.5238095238095237, "grad_norm": 0.37633731961250305, "learning_rate": 0.0001, "loss": 2.6351, "step": 96 }, { "epoch": 1.5396825396825395, "grad_norm": 0.3766629099845886, "learning_rate": 0.0001, "loss": 2.4803, "step": 97 }, { "epoch": 1.5555555555555556, "grad_norm": 0.3765883445739746, "learning_rate": 0.0001, "loss": 2.6615, "step": 98 }, { "epoch": 1.5714285714285714, "grad_norm": 0.3859737515449524, "learning_rate": 0.0001, "loss": 2.481, "step": 99 }, { "epoch": 1.5873015873015874, "grad_norm": 0.36387383937835693, "learning_rate": 0.0001, "loss": 2.5661, "step": 100 }, { "epoch": 1.6031746031746033, "grad_norm": 0.38114404678344727, "learning_rate": 0.0001, "loss": 2.6509, "step": 101 }, { "epoch": 1.619047619047619, "grad_norm": 0.3747798800468445, "learning_rate": 0.0001, "loss": 2.5248, "step": 102 }, { "epoch": 1.6349206349206349, "grad_norm": 0.39510586857795715, "learning_rate": 0.0001, "loss": 2.5784, "step": 103 }, { "epoch": 1.6507936507936507, "grad_norm": 0.4021095335483551, "learning_rate": 0.0001, "loss": 2.6515, "step": 104 }, { "epoch": 1.6666666666666665, "grad_norm": 0.3947932720184326, "learning_rate": 0.0001, "loss": 2.6109, "step": 105 }, { "epoch": 1.6825396825396826, "grad_norm": 0.39475512504577637, "learning_rate": 0.0001, "loss": 2.4172, "step": 106 }, { "epoch": 1.6984126984126984, "grad_norm": 0.38429784774780273, "learning_rate": 0.0001, "loss": 2.4828, "step": 107 }, { "epoch": 1.7142857142857144, "grad_norm": 0.39032769203186035, "learning_rate": 0.0001, "loss": 2.5088, "step": 108 }, { "epoch": 1.7301587301587302, "grad_norm": 0.3955644369125366, "learning_rate": 0.0001, "loss": 2.571, "step": 109 }, { "epoch": 1.746031746031746, "grad_norm": 0.3895743787288666, "learning_rate": 0.0001, "loss": 2.4817, "step": 110 }, { "epoch": 1.7619047619047619, "grad_norm": 0.41024699807167053, "learning_rate": 0.0001, "loss": 2.5888, "step": 111 }, { "epoch": 1.7777777777777777, "grad_norm": 0.42291179299354553, "learning_rate": 0.0001, "loss": 2.5777, "step": 112 }, { "epoch": 1.7936507936507935, "grad_norm": 0.40427684783935547, "learning_rate": 0.0001, "loss": 2.4627, "step": 113 }, { "epoch": 1.8095238095238095, "grad_norm": 0.4024043381214142, "learning_rate": 0.0001, "loss": 2.5497, "step": 114 }, { "epoch": 1.8253968253968254, "grad_norm": 0.3848831057548523, "learning_rate": 0.0001, "loss": 2.507, "step": 115 }, { "epoch": 1.8412698412698414, "grad_norm": 0.39639776945114136, "learning_rate": 0.0001, "loss": 2.4462, "step": 116 }, { "epoch": 1.8571428571428572, "grad_norm": 0.4169174134731293, "learning_rate": 0.0001, "loss": 2.5912, "step": 117 }, { "epoch": 1.873015873015873, "grad_norm": 0.39467406272888184, "learning_rate": 0.0001, "loss": 2.5258, "step": 118 }, { "epoch": 1.8888888888888888, "grad_norm": 0.39699482917785645, "learning_rate": 0.0001, "loss": 2.3845, "step": 119 }, { "epoch": 1.9047619047619047, "grad_norm": 0.40015560388565063, "learning_rate": 0.0001, "loss": 2.615, "step": 120 }, { "epoch": 1.9206349206349205, "grad_norm": 0.4159916937351227, "learning_rate": 0.0001, "loss": 2.4749, "step": 121 }, { "epoch": 1.9365079365079365, "grad_norm": 0.4075866937637329, "learning_rate": 0.0001, "loss": 2.3877, "step": 122 }, { "epoch": 1.9523809523809523, "grad_norm": 0.40443259477615356, "learning_rate": 0.0001, "loss": 2.4426, "step": 123 }, { "epoch": 1.9682539682539684, "grad_norm": 0.42848312854766846, "learning_rate": 0.0001, "loss": 2.4878, "step": 124 }, { "epoch": 1.9841269841269842, "grad_norm": 0.4103951156139374, "learning_rate": 0.0001, "loss": 2.4452, "step": 125 }, { "epoch": 2.0, "grad_norm": 0.42405155301094055, "learning_rate": 0.0001, "loss": 2.5123, "step": 126 }, { "epoch": 2.0, "eval_loss": 2.508723258972168, "eval_runtime": 131.3173, "eval_samples_per_second": 148.16, "eval_steps_per_second": 1.158, "step": 126 }, { "epoch": 2.015873015873016, "grad_norm": 0.40521103143692017, "learning_rate": 0.0001, "loss": 2.4685, "step": 127 }, { "epoch": 2.0317460317460316, "grad_norm": 0.3934997022151947, "learning_rate": 0.0001, "loss": 2.4345, "step": 128 }, { "epoch": 2.0476190476190474, "grad_norm": 0.4023909568786621, "learning_rate": 0.0001, "loss": 2.4582, "step": 129 }, { "epoch": 2.0634920634920633, "grad_norm": 0.43362903594970703, "learning_rate": 0.0001, "loss": 2.4846, "step": 130 }, { "epoch": 2.0793650793650795, "grad_norm": 0.42157357931137085, "learning_rate": 0.0001, "loss": 2.4447, "step": 131 }, { "epoch": 2.0952380952380953, "grad_norm": 0.4321233332157135, "learning_rate": 0.0001, "loss": 2.4022, "step": 132 }, { "epoch": 2.111111111111111, "grad_norm": 0.424949049949646, "learning_rate": 0.0001, "loss": 2.3601, "step": 133 }, { "epoch": 2.126984126984127, "grad_norm": 0.42683982849121094, "learning_rate": 0.0001, "loss": 2.3331, "step": 134 }, { "epoch": 2.142857142857143, "grad_norm": 0.4327385425567627, "learning_rate": 0.0001, "loss": 2.3789, "step": 135 }, { "epoch": 2.1587301587301586, "grad_norm": 0.4611188471317291, "learning_rate": 0.0001, "loss": 2.5813, "step": 136 }, { "epoch": 2.1746031746031744, "grad_norm": 0.45626574754714966, "learning_rate": 0.0001, "loss": 2.4627, "step": 137 }, { "epoch": 2.1904761904761907, "grad_norm": 0.43329715728759766, "learning_rate": 0.0001, "loss": 2.4692, "step": 138 }, { "epoch": 2.2063492063492065, "grad_norm": 0.4390101730823517, "learning_rate": 0.0001, "loss": 2.4429, "step": 139 }, { "epoch": 2.2222222222222223, "grad_norm": 0.46726638078689575, "learning_rate": 0.0001, "loss": 2.436, "step": 140 }, { "epoch": 2.238095238095238, "grad_norm": 0.45958849787712097, "learning_rate": 0.0001, "loss": 2.4011, "step": 141 }, { "epoch": 2.253968253968254, "grad_norm": 0.4724540710449219, "learning_rate": 0.0001, "loss": 2.3135, "step": 142 }, { "epoch": 2.2698412698412698, "grad_norm": 0.46117934584617615, "learning_rate": 0.0001, "loss": 2.3877, "step": 143 }, { "epoch": 2.2857142857142856, "grad_norm": 0.4439626634120941, "learning_rate": 0.0001, "loss": 2.4332, "step": 144 }, { "epoch": 2.3015873015873014, "grad_norm": 0.45710358023643494, "learning_rate": 0.0001, "loss": 2.4141, "step": 145 }, { "epoch": 2.317460317460317, "grad_norm": 0.45968300104141235, "learning_rate": 0.0001, "loss": 2.3651, "step": 146 }, { "epoch": 2.3333333333333335, "grad_norm": 0.4726170599460602, "learning_rate": 0.0001, "loss": 2.3512, "step": 147 }, { "epoch": 2.3492063492063493, "grad_norm": 0.46369409561157227, "learning_rate": 0.0001, "loss": 2.2377, "step": 148 }, { "epoch": 2.365079365079365, "grad_norm": 0.47940853238105774, "learning_rate": 0.0001, "loss": 2.5072, "step": 149 }, { "epoch": 2.380952380952381, "grad_norm": 0.4780332148075104, "learning_rate": 0.0001, "loss": 2.3619, "step": 150 }, { "epoch": 2.3968253968253967, "grad_norm": 0.4739968776702881, "learning_rate": 0.0001, "loss": 2.3421, "step": 151 }, { "epoch": 2.4126984126984126, "grad_norm": 0.5014553070068359, "learning_rate": 0.0001, "loss": 2.4197, "step": 152 }, { "epoch": 2.4285714285714284, "grad_norm": 0.5013525485992432, "learning_rate": 0.0001, "loss": 2.3054, "step": 153 }, { "epoch": 2.4444444444444446, "grad_norm": 0.496791809797287, "learning_rate": 0.0001, "loss": 2.444, "step": 154 }, { "epoch": 2.4603174603174605, "grad_norm": 0.4828886091709137, "learning_rate": 0.0001, "loss": 2.3354, "step": 155 }, { "epoch": 2.4761904761904763, "grad_norm": 0.5080811381340027, "learning_rate": 0.0001, "loss": 2.4921, "step": 156 }, { "epoch": 2.492063492063492, "grad_norm": 0.4668521583080292, "learning_rate": 0.0001, "loss": 2.4057, "step": 157 }, { "epoch": 2.507936507936508, "grad_norm": 0.5066611170768738, "learning_rate": 0.0001, "loss": 2.3574, "step": 158 }, { "epoch": 2.5238095238095237, "grad_norm": 0.48983776569366455, "learning_rate": 0.0001, "loss": 2.4238, "step": 159 }, { "epoch": 2.5396825396825395, "grad_norm": 0.4692371189594269, "learning_rate": 0.0001, "loss": 2.3498, "step": 160 }, { "epoch": 2.5555555555555554, "grad_norm": 0.4888613820075989, "learning_rate": 0.0001, "loss": 2.3825, "step": 161 }, { "epoch": 2.571428571428571, "grad_norm": 0.487679123878479, "learning_rate": 0.0001, "loss": 2.3583, "step": 162 }, { "epoch": 2.5873015873015874, "grad_norm": 0.4797740578651428, "learning_rate": 0.0001, "loss": 2.2742, "step": 163 }, { "epoch": 2.6031746031746033, "grad_norm": 0.5052629113197327, "learning_rate": 0.0001, "loss": 2.4731, "step": 164 }, { "epoch": 2.619047619047619, "grad_norm": 0.515653669834137, "learning_rate": 0.0001, "loss": 2.2931, "step": 165 }, { "epoch": 2.634920634920635, "grad_norm": 0.5115556120872498, "learning_rate": 0.0001, "loss": 2.3343, "step": 166 }, { "epoch": 2.6507936507936507, "grad_norm": 0.5058712363243103, "learning_rate": 0.0001, "loss": 2.3427, "step": 167 }, { "epoch": 2.6666666666666665, "grad_norm": 0.5015872716903687, "learning_rate": 0.0001, "loss": 2.2843, "step": 168 }, { "epoch": 2.682539682539683, "grad_norm": 0.5232997536659241, "learning_rate": 0.0001, "loss": 2.3664, "step": 169 }, { "epoch": 2.6984126984126986, "grad_norm": 0.4920145273208618, "learning_rate": 0.0001, "loss": 2.2829, "step": 170 }, { "epoch": 2.7142857142857144, "grad_norm": 0.513135552406311, "learning_rate": 0.0001, "loss": 2.3496, "step": 171 }, { "epoch": 2.7301587301587302, "grad_norm": 0.5038886070251465, "learning_rate": 0.0001, "loss": 2.2757, "step": 172 }, { "epoch": 2.746031746031746, "grad_norm": 0.515469491481781, "learning_rate": 0.0001, "loss": 2.3699, "step": 173 }, { "epoch": 2.761904761904762, "grad_norm": 0.4982060492038727, "learning_rate": 0.0001, "loss": 2.369, "step": 174 }, { "epoch": 2.7777777777777777, "grad_norm": 0.4803526997566223, "learning_rate": 0.0001, "loss": 2.323, "step": 175 }, { "epoch": 2.7936507936507935, "grad_norm": 0.4883512854576111, "learning_rate": 0.0001, "loss": 2.3837, "step": 176 }, { "epoch": 2.8095238095238093, "grad_norm": 0.48442399501800537, "learning_rate": 0.0001, "loss": 2.3452, "step": 177 }, { "epoch": 2.825396825396825, "grad_norm": 0.49058425426483154, "learning_rate": 0.0001, "loss": 2.3554, "step": 178 }, { "epoch": 2.8412698412698414, "grad_norm": 0.5121751427650452, "learning_rate": 0.0001, "loss": 2.3755, "step": 179 }, { "epoch": 2.857142857142857, "grad_norm": 0.48031556606292725, "learning_rate": 0.0001, "loss": 2.3302, "step": 180 }, { "epoch": 2.873015873015873, "grad_norm": 0.526944100856781, "learning_rate": 0.0001, "loss": 2.2234, "step": 181 }, { "epoch": 2.888888888888889, "grad_norm": 0.513681173324585, "learning_rate": 0.0001, "loss": 2.2488, "step": 182 }, { "epoch": 2.9047619047619047, "grad_norm": 0.5152313113212585, "learning_rate": 0.0001, "loss": 2.4144, "step": 183 }, { "epoch": 2.9206349206349205, "grad_norm": 0.504129946231842, "learning_rate": 0.0001, "loss": 2.3023, "step": 184 }, { "epoch": 2.9365079365079367, "grad_norm": 0.5012217164039612, "learning_rate": 0.0001, "loss": 2.3223, "step": 185 }, { "epoch": 2.9523809523809526, "grad_norm": 0.5341697335243225, "learning_rate": 0.0001, "loss": 2.3747, "step": 186 }, { "epoch": 2.9682539682539684, "grad_norm": 0.5258910655975342, "learning_rate": 0.0001, "loss": 2.3556, "step": 187 }, { "epoch": 2.984126984126984, "grad_norm": 0.5003915429115295, "learning_rate": 0.0001, "loss": 2.3331, "step": 188 }, { "epoch": 3.0, "grad_norm": 0.5328536629676819, "learning_rate": 0.0001, "loss": 2.2383, "step": 189 }, { "epoch": 3.0, "eval_loss": 2.3905880451202393, "eval_runtime": 131.2361, "eval_samples_per_second": 148.252, "eval_steps_per_second": 1.158, "step": 189 }, { "epoch": 3.015873015873016, "grad_norm": 0.5069262385368347, "learning_rate": 0.0001, "loss": 2.224, "step": 190 }, { "epoch": 3.0317460317460316, "grad_norm": 0.5155209898948669, "learning_rate": 0.0001, "loss": 2.2273, "step": 191 }, { "epoch": 3.0476190476190474, "grad_norm": 0.5158161520957947, "learning_rate": 0.0001, "loss": 2.2146, "step": 192 }, { "epoch": 3.0634920634920633, "grad_norm": 0.50887531042099, "learning_rate": 0.0001, "loss": 2.2648, "step": 193 }, { "epoch": 3.0793650793650795, "grad_norm": 0.4977055490016937, "learning_rate": 0.0001, "loss": 2.1434, "step": 194 }, { "epoch": 3.0952380952380953, "grad_norm": 0.5491054654121399, "learning_rate": 0.0001, "loss": 2.3017, "step": 195 }, { "epoch": 3.111111111111111, "grad_norm": 0.5289337635040283, "learning_rate": 0.0001, "loss": 2.1921, "step": 196 }, { "epoch": 3.126984126984127, "grad_norm": 0.5454579591751099, "learning_rate": 0.0001, "loss": 2.1324, "step": 197 }, { "epoch": 3.142857142857143, "grad_norm": 0.5596087574958801, "learning_rate": 0.0001, "loss": 2.2861, "step": 198 }, { "epoch": 3.1587301587301586, "grad_norm": 0.5491024851799011, "learning_rate": 0.0001, "loss": 2.2208, "step": 199 }, { "epoch": 3.1746031746031744, "grad_norm": 0.5643439888954163, "learning_rate": 0.0001, "loss": 2.2757, "step": 200 }, { "epoch": 3.1904761904761907, "grad_norm": 0.5374104380607605, "learning_rate": 0.0001, "loss": 2.2096, "step": 201 }, { "epoch": 3.2063492063492065, "grad_norm": 0.5371658802032471, "learning_rate": 0.0001, "loss": 2.2569, "step": 202 }, { "epoch": 3.2222222222222223, "grad_norm": 0.5406343936920166, "learning_rate": 0.0001, "loss": 2.3481, "step": 203 }, { "epoch": 3.238095238095238, "grad_norm": 0.546499490737915, "learning_rate": 0.0001, "loss": 2.2431, "step": 204 }, { "epoch": 3.253968253968254, "grad_norm": 0.5593274235725403, "learning_rate": 0.0001, "loss": 2.2204, "step": 205 }, { "epoch": 3.2698412698412698, "grad_norm": 0.5756828784942627, "learning_rate": 0.0001, "loss": 2.2358, "step": 206 }, { "epoch": 3.2857142857142856, "grad_norm": 0.5745422840118408, "learning_rate": 0.0001, "loss": 2.3466, "step": 207 }, { "epoch": 3.3015873015873014, "grad_norm": 0.5843771696090698, "learning_rate": 0.0001, "loss": 2.2836, "step": 208 }, { "epoch": 3.317460317460317, "grad_norm": 0.5851179361343384, "learning_rate": 0.0001, "loss": 2.1949, "step": 209 }, { "epoch": 3.3333333333333335, "grad_norm": 0.5551464557647705, "learning_rate": 0.0001, "loss": 2.2045, "step": 210 }, { "epoch": 3.3492063492063493, "grad_norm": 0.579687237739563, "learning_rate": 0.0001, "loss": 2.2161, "step": 211 }, { "epoch": 3.365079365079365, "grad_norm": 0.590063750743866, "learning_rate": 0.0001, "loss": 2.1731, "step": 212 }, { "epoch": 3.380952380952381, "grad_norm": 0.5903512835502625, "learning_rate": 0.0001, "loss": 2.3689, "step": 213 }, { "epoch": 3.3968253968253967, "grad_norm": 0.571978747844696, "learning_rate": 0.0001, "loss": 2.1554, "step": 214 }, { "epoch": 3.4126984126984126, "grad_norm": 0.5911630392074585, "learning_rate": 0.0001, "loss": 2.1664, "step": 215 }, { "epoch": 3.4285714285714284, "grad_norm": 0.5953576564788818, "learning_rate": 0.0001, "loss": 2.2494, "step": 216 }, { "epoch": 3.4444444444444446, "grad_norm": 0.6063962578773499, "learning_rate": 0.0001, "loss": 2.1696, "step": 217 }, { "epoch": 3.4603174603174605, "grad_norm": 0.5811805129051208, "learning_rate": 0.0001, "loss": 2.255, "step": 218 }, { "epoch": 3.4761904761904763, "grad_norm": 0.5684872269630432, "learning_rate": 0.0001, "loss": 2.2388, "step": 219 }, { "epoch": 3.492063492063492, "grad_norm": 0.5764886736869812, "learning_rate": 0.0001, "loss": 2.2843, "step": 220 }, { "epoch": 3.507936507936508, "grad_norm": 0.592780351638794, "learning_rate": 0.0001, "loss": 2.281, "step": 221 }, { "epoch": 3.5238095238095237, "grad_norm": 0.5397675037384033, "learning_rate": 0.0001, "loss": 2.2044, "step": 222 }, { "epoch": 3.5396825396825395, "grad_norm": 0.5696619749069214, "learning_rate": 0.0001, "loss": 2.2013, "step": 223 }, { "epoch": 3.5555555555555554, "grad_norm": 0.5832935571670532, "learning_rate": 0.0001, "loss": 2.1913, "step": 224 }, { "epoch": 3.571428571428571, "grad_norm": 0.58072829246521, "learning_rate": 0.0001, "loss": 2.1943, "step": 225 }, { "epoch": 3.5873015873015874, "grad_norm": 0.6177693009376526, "learning_rate": 0.0001, "loss": 2.2172, "step": 226 }, { "epoch": 3.6031746031746033, "grad_norm": 0.6173182725906372, "learning_rate": 0.0001, "loss": 2.3449, "step": 227 }, { "epoch": 3.619047619047619, "grad_norm": 0.5733217000961304, "learning_rate": 0.0001, "loss": 2.2023, "step": 228 }, { "epoch": 3.634920634920635, "grad_norm": 0.5936962366104126, "learning_rate": 0.0001, "loss": 2.3098, "step": 229 }, { "epoch": 3.6507936507936507, "grad_norm": 0.5863310694694519, "learning_rate": 0.0001, "loss": 2.3444, "step": 230 }, { "epoch": 3.6666666666666665, "grad_norm": 0.5781247019767761, "learning_rate": 0.0001, "loss": 2.2717, "step": 231 }, { "epoch": 3.682539682539683, "grad_norm": 0.5641736388206482, "learning_rate": 0.0001, "loss": 2.2551, "step": 232 }, { "epoch": 3.6984126984126986, "grad_norm": 0.5762507319450378, "learning_rate": 0.0001, "loss": 2.2005, "step": 233 }, { "epoch": 3.7142857142857144, "grad_norm": 0.5860148668289185, "learning_rate": 0.0001, "loss": 2.2755, "step": 234 }, { "epoch": 3.7301587301587302, "grad_norm": 0.5595604181289673, "learning_rate": 0.0001, "loss": 2.1698, "step": 235 }, { "epoch": 3.746031746031746, "grad_norm": 0.6000334620475769, "learning_rate": 0.0001, "loss": 2.1448, "step": 236 }, { "epoch": 3.761904761904762, "grad_norm": 0.5891295075416565, "learning_rate": 0.0001, "loss": 2.2041, "step": 237 }, { "epoch": 3.7777777777777777, "grad_norm": 0.5736986398696899, "learning_rate": 0.0001, "loss": 2.2187, "step": 238 }, { "epoch": 3.7936507936507935, "grad_norm": 0.605859100818634, "learning_rate": 0.0001, "loss": 2.1449, "step": 239 }, { "epoch": 3.8095238095238093, "grad_norm": 0.6083592772483826, "learning_rate": 0.0001, "loss": 2.2481, "step": 240 }, { "epoch": 3.825396825396825, "grad_norm": 0.6463331580162048, "learning_rate": 0.0001, "loss": 2.3296, "step": 241 }, { "epoch": 3.8412698412698414, "grad_norm": 0.6547737717628479, "learning_rate": 0.0001, "loss": 2.2724, "step": 242 }, { "epoch": 3.857142857142857, "grad_norm": 0.6021876931190491, "learning_rate": 0.0001, "loss": 2.2059, "step": 243 }, { "epoch": 3.873015873015873, "grad_norm": 0.5920640230178833, "learning_rate": 0.0001, "loss": 2.2656, "step": 244 }, { "epoch": 3.888888888888889, "grad_norm": 0.6126848459243774, "learning_rate": 0.0001, "loss": 2.2974, "step": 245 }, { "epoch": 3.9047619047619047, "grad_norm": 0.5852090716362, "learning_rate": 0.0001, "loss": 2.1778, "step": 246 }, { "epoch": 3.9206349206349205, "grad_norm": 0.5844463109970093, "learning_rate": 0.0001, "loss": 2.1347, "step": 247 }, { "epoch": 3.9365079365079367, "grad_norm": 0.5811941027641296, "learning_rate": 0.0001, "loss": 2.2374, "step": 248 }, { "epoch": 3.9523809523809526, "grad_norm": 0.593923807144165, "learning_rate": 0.0001, "loss": 2.1287, "step": 249 }, { "epoch": 3.9682539682539684, "grad_norm": 0.5784031748771667, "learning_rate": 0.0001, "loss": 2.2797, "step": 250 }, { "epoch": 3.984126984126984, "grad_norm": 0.5905077457427979, "learning_rate": 0.0001, "loss": 2.1873, "step": 251 }, { "epoch": 4.0, "grad_norm": 0.5942009687423706, "learning_rate": 0.0001, "loss": 2.2128, "step": 252 }, { "epoch": 4.0, "eval_loss": 2.3258297443389893, "eval_runtime": 131.2215, "eval_samples_per_second": 148.268, "eval_steps_per_second": 1.158, "step": 252 }, { "epoch": 4.015873015873016, "grad_norm": 0.5791855454444885, "learning_rate": 0.0001, "loss": 2.1227, "step": 253 }, { "epoch": 4.031746031746032, "grad_norm": 0.5935028195381165, "learning_rate": 0.0001, "loss": 2.0783, "step": 254 }, { "epoch": 4.0476190476190474, "grad_norm": 0.5985621809959412, "learning_rate": 0.0001, "loss": 2.2336, "step": 255 }, { "epoch": 4.063492063492063, "grad_norm": 0.595588207244873, "learning_rate": 0.0001, "loss": 2.1797, "step": 256 }, { "epoch": 4.079365079365079, "grad_norm": 0.6228951811790466, "learning_rate": 0.0001, "loss": 2.102, "step": 257 }, { "epoch": 4.095238095238095, "grad_norm": 0.6309946179389954, "learning_rate": 0.0001, "loss": 2.1029, "step": 258 }, { "epoch": 4.111111111111111, "grad_norm": 0.6356679797172546, "learning_rate": 0.0001, "loss": 2.2215, "step": 259 }, { "epoch": 4.1269841269841265, "grad_norm": 0.6199479103088379, "learning_rate": 0.0001, "loss": 2.209, "step": 260 }, { "epoch": 4.142857142857143, "grad_norm": 0.6396746635437012, "learning_rate": 0.0001, "loss": 2.1935, "step": 261 }, { "epoch": 4.158730158730159, "grad_norm": 0.624236524105072, "learning_rate": 0.0001, "loss": 2.1285, "step": 262 }, { "epoch": 4.174603174603175, "grad_norm": 0.6298365592956543, "learning_rate": 0.0001, "loss": 2.1425, "step": 263 }, { "epoch": 4.190476190476191, "grad_norm": 0.6376235485076904, "learning_rate": 0.0001, "loss": 2.1139, "step": 264 }, { "epoch": 4.2063492063492065, "grad_norm": 0.6685353517532349, "learning_rate": 0.0001, "loss": 2.3174, "step": 265 }, { "epoch": 4.222222222222222, "grad_norm": 0.646286129951477, "learning_rate": 0.0001, "loss": 2.1518, "step": 266 }, { "epoch": 4.238095238095238, "grad_norm": 0.6334670782089233, "learning_rate": 0.0001, "loss": 2.1696, "step": 267 }, { "epoch": 4.253968253968254, "grad_norm": 0.646755039691925, "learning_rate": 0.0001, "loss": 2.1926, "step": 268 }, { "epoch": 4.26984126984127, "grad_norm": 0.6688207387924194, "learning_rate": 0.0001, "loss": 2.0777, "step": 269 }, { "epoch": 4.285714285714286, "grad_norm": 0.6606519818305969, "learning_rate": 0.0001, "loss": 2.0851, "step": 270 }, { "epoch": 4.301587301587301, "grad_norm": 0.6396586894989014, "learning_rate": 0.0001, "loss": 2.0645, "step": 271 }, { "epoch": 4.317460317460317, "grad_norm": 0.6652937531471252, "learning_rate": 0.0001, "loss": 2.0649, "step": 272 }, { "epoch": 4.333333333333333, "grad_norm": 0.6375648379325867, "learning_rate": 0.0001, "loss": 2.1439, "step": 273 }, { "epoch": 4.349206349206349, "grad_norm": 0.6601315140724182, "learning_rate": 0.0001, "loss": 2.1115, "step": 274 }, { "epoch": 4.365079365079365, "grad_norm": 0.6605430841445923, "learning_rate": 0.0001, "loss": 2.0952, "step": 275 }, { "epoch": 4.380952380952381, "grad_norm": 0.682755708694458, "learning_rate": 0.0001, "loss": 2.0782, "step": 276 }, { "epoch": 4.396825396825397, "grad_norm": 0.7010186314582825, "learning_rate": 0.0001, "loss": 2.1585, "step": 277 }, { "epoch": 4.412698412698413, "grad_norm": 0.6743943691253662, "learning_rate": 0.0001, "loss": 2.0401, "step": 278 }, { "epoch": 4.428571428571429, "grad_norm": 0.659075140953064, "learning_rate": 0.0001, "loss": 2.0975, "step": 279 }, { "epoch": 4.444444444444445, "grad_norm": 0.6433005332946777, "learning_rate": 0.0001, "loss": 2.1697, "step": 280 }, { "epoch": 4.4603174603174605, "grad_norm": 0.6300135850906372, "learning_rate": 0.0001, "loss": 2.0184, "step": 281 }, { "epoch": 4.476190476190476, "grad_norm": 0.6259458065032959, "learning_rate": 0.0001, "loss": 2.0946, "step": 282 }, { "epoch": 4.492063492063492, "grad_norm": 0.6421889066696167, "learning_rate": 0.0001, "loss": 2.061, "step": 283 }, { "epoch": 4.507936507936508, "grad_norm": 0.6369497179985046, "learning_rate": 0.0001, "loss": 2.0741, "step": 284 }, { "epoch": 4.523809523809524, "grad_norm": 0.6569401621818542, "learning_rate": 0.0001, "loss": 2.1381, "step": 285 }, { "epoch": 4.5396825396825395, "grad_norm": 0.6559494733810425, "learning_rate": 0.0001, "loss": 2.0315, "step": 286 }, { "epoch": 4.555555555555555, "grad_norm": 0.6800838112831116, "learning_rate": 0.0001, "loss": 2.1973, "step": 287 }, { "epoch": 4.571428571428571, "grad_norm": 0.660052478313446, "learning_rate": 0.0001, "loss": 2.0466, "step": 288 }, { "epoch": 4.587301587301587, "grad_norm": 0.67457115650177, "learning_rate": 0.0001, "loss": 2.0389, "step": 289 }, { "epoch": 4.603174603174603, "grad_norm": 0.701083779335022, "learning_rate": 0.0001, "loss": 2.1097, "step": 290 }, { "epoch": 4.619047619047619, "grad_norm": 0.6736295223236084, "learning_rate": 0.0001, "loss": 2.0152, "step": 291 }, { "epoch": 4.634920634920634, "grad_norm": 0.6742798686027527, "learning_rate": 0.0001, "loss": 2.0856, "step": 292 }, { "epoch": 4.650793650793651, "grad_norm": 0.6709657907485962, "learning_rate": 0.0001, "loss": 2.0645, "step": 293 }, { "epoch": 4.666666666666667, "grad_norm": 0.7078341841697693, "learning_rate": 0.0001, "loss": 2.081, "step": 294 }, { "epoch": 4.682539682539683, "grad_norm": 0.6934991478919983, "learning_rate": 0.0001, "loss": 2.1799, "step": 295 }, { "epoch": 4.698412698412699, "grad_norm": 0.629152238368988, "learning_rate": 0.0001, "loss": 2.1099, "step": 296 }, { "epoch": 4.714285714285714, "grad_norm": 0.6546439528465271, "learning_rate": 0.0001, "loss": 2.0761, "step": 297 }, { "epoch": 4.73015873015873, "grad_norm": 0.6665687561035156, "learning_rate": 0.0001, "loss": 2.1299, "step": 298 }, { "epoch": 4.746031746031746, "grad_norm": 0.6587039828300476, "learning_rate": 0.0001, "loss": 2.0257, "step": 299 }, { "epoch": 4.761904761904762, "grad_norm": 0.6549575924873352, "learning_rate": 0.0001, "loss": 2.0057, "step": 300 }, { "epoch": 4.777777777777778, "grad_norm": 0.6643835306167603, "learning_rate": 0.0001, "loss": 2.14, "step": 301 }, { "epoch": 4.7936507936507935, "grad_norm": 0.6684074401855469, "learning_rate": 0.0001, "loss": 2.133, "step": 302 }, { "epoch": 4.809523809523809, "grad_norm": 0.6732567548751831, "learning_rate": 0.0001, "loss": 2.1709, "step": 303 }, { "epoch": 4.825396825396825, "grad_norm": 0.6627042293548584, "learning_rate": 0.0001, "loss": 2.1105, "step": 304 }, { "epoch": 4.841269841269841, "grad_norm": 0.7108631134033203, "learning_rate": 0.0001, "loss": 2.2417, "step": 305 }, { "epoch": 4.857142857142857, "grad_norm": 0.6348216533660889, "learning_rate": 0.0001, "loss": 2.1181, "step": 306 }, { "epoch": 4.8730158730158735, "grad_norm": 0.7204558849334717, "learning_rate": 0.0001, "loss": 2.2538, "step": 307 }, { "epoch": 4.888888888888889, "grad_norm": 0.6680718660354614, "learning_rate": 0.0001, "loss": 2.0988, "step": 308 }, { "epoch": 4.904761904761905, "grad_norm": 0.629531741142273, "learning_rate": 0.0001, "loss": 2.0871, "step": 309 }, { "epoch": 4.920634920634921, "grad_norm": 0.6763596534729004, "learning_rate": 0.0001, "loss": 2.1201, "step": 310 }, { "epoch": 4.936507936507937, "grad_norm": 0.6565173268318176, "learning_rate": 0.0001, "loss": 2.0979, "step": 311 }, { "epoch": 4.9523809523809526, "grad_norm": 0.6826967597007751, "learning_rate": 0.0001, "loss": 2.2006, "step": 312 }, { "epoch": 4.968253968253968, "grad_norm": 0.6894890666007996, "learning_rate": 0.0001, "loss": 2.1615, "step": 313 }, { "epoch": 4.984126984126984, "grad_norm": 0.661064624786377, "learning_rate": 0.0001, "loss": 2.1329, "step": 314 }, { "epoch": 5.0, "grad_norm": 0.6418980956077576, "learning_rate": 0.0001, "loss": 2.0654, "step": 315 }, { "epoch": 5.0, "eval_loss": 2.2862746715545654, "eval_runtime": 131.1439, "eval_samples_per_second": 148.356, "eval_steps_per_second": 1.159, "step": 315 }, { "epoch": 5.015873015873016, "grad_norm": 0.6436026096343994, "learning_rate": 0.0001, "loss": 2.094, "step": 316 }, { "epoch": 5.031746031746032, "grad_norm": 0.6524760127067566, "learning_rate": 0.0001, "loss": 2.0095, "step": 317 }, { "epoch": 5.0476190476190474, "grad_norm": 0.6650781035423279, "learning_rate": 0.0001, "loss": 1.9616, "step": 318 }, { "epoch": 5.063492063492063, "grad_norm": 0.7145387530326843, "learning_rate": 0.0001, "loss": 1.9858, "step": 319 }, { "epoch": 5.079365079365079, "grad_norm": 0.7050247192382812, "learning_rate": 0.0001, "loss": 2.0304, "step": 320 }, { "epoch": 5.095238095238095, "grad_norm": 0.722385585308075, "learning_rate": 0.0001, "loss": 2.0837, "step": 321 }, { "epoch": 5.111111111111111, "grad_norm": 0.7140416502952576, "learning_rate": 0.0001, "loss": 1.9818, "step": 322 }, { "epoch": 5.1269841269841265, "grad_norm": 0.7170698642730713, "learning_rate": 0.0001, "loss": 2.0055, "step": 323 }, { "epoch": 5.142857142857143, "grad_norm": 0.6809371709823608, "learning_rate": 0.0001, "loss": 2.0375, "step": 324 }, { "epoch": 5.158730158730159, "grad_norm": 0.7128539085388184, "learning_rate": 0.0001, "loss": 2.0837, "step": 325 }, { "epoch": 5.174603174603175, "grad_norm": 0.7276302576065063, "learning_rate": 0.0001, "loss": 2.0278, "step": 326 }, { "epoch": 5.190476190476191, "grad_norm": 0.7262607216835022, "learning_rate": 0.0001, "loss": 2.0855, "step": 327 }, { "epoch": 5.2063492063492065, "grad_norm": 0.7341217398643494, "learning_rate": 0.0001, "loss": 2.1168, "step": 328 }, { "epoch": 5.222222222222222, "grad_norm": 0.7559060454368591, "learning_rate": 0.0001, "loss": 2.0097, "step": 329 }, { "epoch": 5.238095238095238, "grad_norm": 0.7604755759239197, "learning_rate": 0.0001, "loss": 2.0065, "step": 330 }, { "epoch": 5.253968253968254, "grad_norm": 0.7481369972229004, "learning_rate": 0.0001, "loss": 2.0204, "step": 331 }, { "epoch": 5.26984126984127, "grad_norm": 0.7615129351615906, "learning_rate": 0.0001, "loss": 2.123, "step": 332 }, { "epoch": 5.285714285714286, "grad_norm": 0.737953245639801, "learning_rate": 0.0001, "loss": 2.0006, "step": 333 }, { "epoch": 5.301587301587301, "grad_norm": 0.7923922538757324, "learning_rate": 0.0001, "loss": 2.0557, "step": 334 }, { "epoch": 5.317460317460317, "grad_norm": 0.7669034004211426, "learning_rate": 0.0001, "loss": 2.0475, "step": 335 }, { "epoch": 5.333333333333333, "grad_norm": 0.7678413391113281, "learning_rate": 0.0001, "loss": 2.0465, "step": 336 }, { "epoch": 5.349206349206349, "grad_norm": 0.8182974457740784, "learning_rate": 0.0001, "loss": 2.1546, "step": 337 }, { "epoch": 5.365079365079365, "grad_norm": 0.7198697924613953, "learning_rate": 0.0001, "loss": 2.0281, "step": 338 }, { "epoch": 5.380952380952381, "grad_norm": 0.7147011756896973, "learning_rate": 0.0001, "loss": 2.0025, "step": 339 }, { "epoch": 5.396825396825397, "grad_norm": 0.7140205502510071, "learning_rate": 0.0001, "loss": 1.9455, "step": 340 }, { "epoch": 5.412698412698413, "grad_norm": 0.7569496631622314, "learning_rate": 0.0001, "loss": 1.9467, "step": 341 }, { "epoch": 5.428571428571429, "grad_norm": 0.7563154697418213, "learning_rate": 0.0001, "loss": 2.028, "step": 342 }, { "epoch": 5.444444444444445, "grad_norm": 0.7342957258224487, "learning_rate": 0.0001, "loss": 2.0094, "step": 343 }, { "epoch": 5.4603174603174605, "grad_norm": 0.6956045627593994, "learning_rate": 0.0001, "loss": 1.9638, "step": 344 }, { "epoch": 5.476190476190476, "grad_norm": 0.7402710914611816, "learning_rate": 0.0001, "loss": 1.952, "step": 345 }, { "epoch": 5.492063492063492, "grad_norm": 0.7401675581932068, "learning_rate": 0.0001, "loss": 2.0025, "step": 346 }, { "epoch": 5.507936507936508, "grad_norm": 0.7523941397666931, "learning_rate": 0.0001, "loss": 1.968, "step": 347 }, { "epoch": 5.523809523809524, "grad_norm": 0.7316027283668518, "learning_rate": 0.0001, "loss": 2.047, "step": 348 }, { "epoch": 5.5396825396825395, "grad_norm": 0.7798665761947632, "learning_rate": 0.0001, "loss": 2.0133, "step": 349 }, { "epoch": 5.555555555555555, "grad_norm": 0.7752505540847778, "learning_rate": 0.0001, "loss": 2.0177, "step": 350 }, { "epoch": 5.571428571428571, "grad_norm": 0.7742307186126709, "learning_rate": 0.0001, "loss": 1.9797, "step": 351 }, { "epoch": 5.587301587301587, "grad_norm": 0.7735832929611206, "learning_rate": 0.0001, "loss": 2.0347, "step": 352 }, { "epoch": 5.603174603174603, "grad_norm": 0.7533615827560425, "learning_rate": 0.0001, "loss": 1.9683, "step": 353 }, { "epoch": 5.619047619047619, "grad_norm": 0.7873823046684265, "learning_rate": 0.0001, "loss": 2.1035, "step": 354 }, { "epoch": 5.634920634920634, "grad_norm": 0.739718496799469, "learning_rate": 0.0001, "loss": 2.0512, "step": 355 }, { "epoch": 5.650793650793651, "grad_norm": 0.7133144736289978, "learning_rate": 0.0001, "loss": 1.9078, "step": 356 }, { "epoch": 5.666666666666667, "grad_norm": 0.7506771087646484, "learning_rate": 0.0001, "loss": 2.0843, "step": 357 }, { "epoch": 5.682539682539683, "grad_norm": 0.7370438575744629, "learning_rate": 0.0001, "loss": 1.9155, "step": 358 }, { "epoch": 5.698412698412699, "grad_norm": 0.7745400667190552, "learning_rate": 0.0001, "loss": 2.0788, "step": 359 }, { "epoch": 5.714285714285714, "grad_norm": 0.7573673725128174, "learning_rate": 0.0001, "loss": 1.9092, "step": 360 }, { "epoch": 5.73015873015873, "grad_norm": 0.7863042950630188, "learning_rate": 0.0001, "loss": 1.9919, "step": 361 }, { "epoch": 5.746031746031746, "grad_norm": 0.766444206237793, "learning_rate": 0.0001, "loss": 2.0213, "step": 362 }, { "epoch": 5.761904761904762, "grad_norm": 0.7395709156990051, "learning_rate": 0.0001, "loss": 1.926, "step": 363 }, { "epoch": 5.777777777777778, "grad_norm": 0.7217903733253479, "learning_rate": 0.0001, "loss": 1.9947, "step": 364 }, { "epoch": 5.7936507936507935, "grad_norm": 0.7542216181755066, "learning_rate": 0.0001, "loss": 1.9689, "step": 365 }, { "epoch": 5.809523809523809, "grad_norm": 0.7293450236320496, "learning_rate": 0.0001, "loss": 2.0281, "step": 366 }, { "epoch": 5.825396825396825, "grad_norm": 0.7562200427055359, "learning_rate": 0.0001, "loss": 2.0989, "step": 367 }, { "epoch": 5.841269841269841, "grad_norm": 0.7612117528915405, "learning_rate": 0.0001, "loss": 2.0551, "step": 368 }, { "epoch": 5.857142857142857, "grad_norm": 0.734811007976532, "learning_rate": 0.0001, "loss": 2.0023, "step": 369 }, { "epoch": 5.8730158730158735, "grad_norm": 0.7379953265190125, "learning_rate": 0.0001, "loss": 1.9598, "step": 370 }, { "epoch": 5.888888888888889, "grad_norm": 0.7303497791290283, "learning_rate": 0.0001, "loss": 1.972, "step": 371 }, { "epoch": 5.904761904761905, "grad_norm": 0.8013107180595398, "learning_rate": 0.0001, "loss": 2.0159, "step": 372 }, { "epoch": 5.920634920634921, "grad_norm": 0.7716558575630188, "learning_rate": 0.0001, "loss": 2.053, "step": 373 }, { "epoch": 5.936507936507937, "grad_norm": 0.7352483868598938, "learning_rate": 0.0001, "loss": 2.0788, "step": 374 }, { "epoch": 5.9523809523809526, "grad_norm": 0.735283374786377, "learning_rate": 0.0001, "loss": 1.9362, "step": 375 }, { "epoch": 5.968253968253968, "grad_norm": 0.7122552394866943, "learning_rate": 0.0001, "loss": 1.9169, "step": 376 }, { "epoch": 5.984126984126984, "grad_norm": 0.7541640400886536, "learning_rate": 0.0001, "loss": 2.008, "step": 377 }, { "epoch": 6.0, "grad_norm": 0.7707133889198303, "learning_rate": 0.0001, "loss": 2.0254, "step": 378 }, { "epoch": 6.0, "eval_loss": 2.260683298110962, "eval_runtime": 131.3176, "eval_samples_per_second": 148.16, "eval_steps_per_second": 1.157, "step": 378 } ], "logging_steps": 1, "max_steps": 504, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.1611656497447567e+18, "train_batch_size": 128, "trial_name": null, "trial_params": null }