{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 189, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 3.201144218444824, "eval_runtime": 137.3071, "eval_samples_per_second": 141.697, "eval_steps_per_second": 1.107, "step": 0 }, { "epoch": 0.015873015873015872, "grad_norm": 0.1918456107378006, "learning_rate": 0.0001, "loss": 3.2274, "step": 1 }, { "epoch": 0.031746031746031744, "grad_norm": 0.21201364696025848, "learning_rate": 0.0001, "loss": 2.9728, "step": 2 }, { "epoch": 0.047619047619047616, "grad_norm": 0.1865563839673996, "learning_rate": 0.0001, "loss": 3.1107, "step": 3 }, { "epoch": 0.06349206349206349, "grad_norm": 0.256558358669281, "learning_rate": 0.0001, "loss": 3.1054, "step": 4 }, { "epoch": 0.07936507936507936, "grad_norm": 0.30679622292518616, "learning_rate": 0.0001, "loss": 3.1735, "step": 5 }, { "epoch": 0.09523809523809523, "grad_norm": 0.27670395374298096, "learning_rate": 0.0001, "loss": 3.175, "step": 6 }, { "epoch": 0.1111111111111111, "grad_norm": 0.29917973279953003, "learning_rate": 0.0001, "loss": 3.0947, "step": 7 }, { "epoch": 0.12698412698412698, "grad_norm": 0.28829720616340637, "learning_rate": 0.0001, "loss": 2.9762, "step": 8 }, { "epoch": 0.14285714285714285, "grad_norm": 0.363932728767395, "learning_rate": 0.0001, "loss": 3.0589, "step": 9 }, { "epoch": 0.15873015873015872, "grad_norm": 0.3397577702999115, "learning_rate": 0.0001, "loss": 3.0165, "step": 10 }, { "epoch": 0.1746031746031746, "grad_norm": 0.3906390368938446, "learning_rate": 0.0001, "loss": 2.9656, "step": 11 }, { "epoch": 0.19047619047619047, "grad_norm": 0.39602699875831604, "learning_rate": 0.0001, "loss": 2.9934, "step": 12 }, { "epoch": 0.20634920634920634, "grad_norm": 0.5025639533996582, "learning_rate": 0.0001, "loss": 2.9371, "step": 13 }, { "epoch": 0.2222222222222222, "grad_norm": 0.41868922114372253, "learning_rate": 0.0001, "loss": 2.8788, "step": 14 }, { "epoch": 0.23809523809523808, "grad_norm": 0.4148877263069153, "learning_rate": 0.0001, "loss": 2.8633, "step": 15 }, { "epoch": 0.25396825396825395, "grad_norm": 0.36584576964378357, "learning_rate": 0.0001, "loss": 2.7781, "step": 16 }, { "epoch": 0.2698412698412698, "grad_norm": 0.5037583112716675, "learning_rate": 0.0001, "loss": 2.7633, "step": 17 }, { "epoch": 0.2857142857142857, "grad_norm": 0.37624871730804443, "learning_rate": 0.0001, "loss": 2.7114, "step": 18 }, { "epoch": 0.30158730158730157, "grad_norm": 0.3943127989768982, "learning_rate": 0.0001, "loss": 2.7029, "step": 19 }, { "epoch": 0.31746031746031744, "grad_norm": 0.300891250371933, "learning_rate": 0.0001, "loss": 2.5994, "step": 20 }, { "epoch": 0.3333333333333333, "grad_norm": 0.2970874309539795, "learning_rate": 0.0001, "loss": 2.6166, "step": 21 }, { "epoch": 0.3492063492063492, "grad_norm": 0.29132720828056335, "learning_rate": 0.0001, "loss": 2.627, "step": 22 }, { "epoch": 0.36507936507936506, "grad_norm": 0.259672075510025, "learning_rate": 0.0001, "loss": 2.6551, "step": 23 }, { "epoch": 0.38095238095238093, "grad_norm": 0.3438767194747925, "learning_rate": 0.0001, "loss": 2.62, "step": 24 }, { "epoch": 0.3968253968253968, "grad_norm": 0.36698848009109497, "learning_rate": 0.0001, "loss": 2.5411, "step": 25 }, { "epoch": 0.4126984126984127, "grad_norm": 0.500142514705658, "learning_rate": 0.0001, "loss": 2.5366, "step": 26 }, { "epoch": 0.42857142857142855, "grad_norm": 0.3668989837169647, "learning_rate": 0.0001, "loss": 2.5864, "step": 27 }, { "epoch": 0.4444444444444444, "grad_norm": 0.5424588322639465, "learning_rate": 0.0001, "loss": 2.5303, "step": 28 }, { "epoch": 0.4603174603174603, "grad_norm": 0.4026002585887909, "learning_rate": 0.0001, "loss": 2.4917, "step": 29 }, { "epoch": 0.47619047619047616, "grad_norm": 0.5646772384643555, "learning_rate": 0.0001, "loss": 2.4438, "step": 30 }, { "epoch": 0.49206349206349204, "grad_norm": 0.43536150455474854, "learning_rate": 0.0001, "loss": 2.4743, "step": 31 }, { "epoch": 0.5079365079365079, "grad_norm": 0.34031543135643005, "learning_rate": 0.0001, "loss": 2.4207, "step": 32 }, { "epoch": 0.5238095238095238, "grad_norm": 0.30232375860214233, "learning_rate": 0.0001, "loss": 2.4499, "step": 33 }, { "epoch": 0.5396825396825397, "grad_norm": 0.31755775213241577, "learning_rate": 0.0001, "loss": 2.2954, "step": 34 }, { "epoch": 0.5555555555555556, "grad_norm": 0.2854190170764923, "learning_rate": 0.0001, "loss": 2.4148, "step": 35 }, { "epoch": 0.5714285714285714, "grad_norm": 0.2745571434497833, "learning_rate": 0.0001, "loss": 2.2658, "step": 36 }, { "epoch": 0.5873015873015873, "grad_norm": 0.26832568645477295, "learning_rate": 0.0001, "loss": 2.4336, "step": 37 }, { "epoch": 0.6031746031746031, "grad_norm": 0.2571115493774414, "learning_rate": 0.0001, "loss": 2.3325, "step": 38 }, { "epoch": 0.6190476190476191, "grad_norm": 0.27978894114494324, "learning_rate": 0.0001, "loss": 2.3623, "step": 39 }, { "epoch": 0.6349206349206349, "grad_norm": 0.27634117007255554, "learning_rate": 0.0001, "loss": 2.3947, "step": 40 }, { "epoch": 0.6507936507936508, "grad_norm": 0.2856747806072235, "learning_rate": 0.0001, "loss": 2.3065, "step": 41 }, { "epoch": 0.6666666666666666, "grad_norm": 0.273630827665329, "learning_rate": 0.0001, "loss": 2.2969, "step": 42 }, { "epoch": 0.6825396825396826, "grad_norm": 0.294461190700531, "learning_rate": 0.0001, "loss": 2.3315, "step": 43 }, { "epoch": 0.6984126984126984, "grad_norm": 0.27730676531791687, "learning_rate": 0.0001, "loss": 2.2656, "step": 44 }, { "epoch": 0.7142857142857143, "grad_norm": 0.28087306022644043, "learning_rate": 0.0001, "loss": 2.1831, "step": 45 }, { "epoch": 0.7301587301587301, "grad_norm": 0.2754216194152832, "learning_rate": 0.0001, "loss": 2.2577, "step": 46 }, { "epoch": 0.746031746031746, "grad_norm": 0.29807186126708984, "learning_rate": 0.0001, "loss": 2.3723, "step": 47 }, { "epoch": 0.7619047619047619, "grad_norm": 0.3020806908607483, "learning_rate": 0.0001, "loss": 2.309, "step": 48 }, { "epoch": 0.7777777777777778, "grad_norm": 0.2983008027076721, "learning_rate": 0.0001, "loss": 2.1748, "step": 49 }, { "epoch": 0.7936507936507936, "grad_norm": 0.30773478746414185, "learning_rate": 0.0001, "loss": 2.2758, "step": 50 }, { "epoch": 0.8095238095238095, "grad_norm": 0.30065926909446716, "learning_rate": 0.0001, "loss": 2.2587, "step": 51 }, { "epoch": 0.8253968253968254, "grad_norm": 0.3026188313961029, "learning_rate": 0.0001, "loss": 2.1818, "step": 52 }, { "epoch": 0.8412698412698413, "grad_norm": 0.30180782079696655, "learning_rate": 0.0001, "loss": 2.1304, "step": 53 }, { "epoch": 0.8571428571428571, "grad_norm": 0.29865095019340515, "learning_rate": 0.0001, "loss": 2.2356, "step": 54 }, { "epoch": 0.873015873015873, "grad_norm": 0.30642348527908325, "learning_rate": 0.0001, "loss": 2.2411, "step": 55 }, { "epoch": 0.8888888888888888, "grad_norm": 0.30322977900505066, "learning_rate": 0.0001, "loss": 2.2252, "step": 56 }, { "epoch": 0.9047619047619048, "grad_norm": 0.2848075032234192, "learning_rate": 0.0001, "loss": 2.1687, "step": 57 }, { "epoch": 0.9206349206349206, "grad_norm": 0.2916617691516876, "learning_rate": 0.0001, "loss": 2.1998, "step": 58 }, { "epoch": 0.9365079365079365, "grad_norm": 0.2930757999420166, "learning_rate": 0.0001, "loss": 2.2222, "step": 59 }, { "epoch": 0.9523809523809523, "grad_norm": 0.28044334053993225, "learning_rate": 0.0001, "loss": 2.1614, "step": 60 }, { "epoch": 0.9682539682539683, "grad_norm": 0.29108643531799316, "learning_rate": 0.0001, "loss": 2.2684, "step": 61 }, { "epoch": 0.9841269841269841, "grad_norm": 0.2764103412628174, "learning_rate": 0.0001, "loss": 2.152, "step": 62 }, { "epoch": 1.0, "grad_norm": 0.290540486574173, "learning_rate": 0.0001, "loss": 2.2189, "step": 63 }, { "epoch": 1.0, "eval_loss": 2.1553854942321777, "eval_runtime": 134.4627, "eval_samples_per_second": 144.694, "eval_steps_per_second": 1.13, "step": 63 }, { "epoch": 1.0158730158730158, "grad_norm": 0.28722119331359863, "learning_rate": 0.0001, "loss": 2.119, "step": 64 }, { "epoch": 1.0317460317460316, "grad_norm": 0.2941075265407562, "learning_rate": 0.0001, "loss": 2.1355, "step": 65 }, { "epoch": 1.0476190476190477, "grad_norm": 0.27956002950668335, "learning_rate": 0.0001, "loss": 2.0956, "step": 66 }, { "epoch": 1.0634920634920635, "grad_norm": 0.2902751863002777, "learning_rate": 0.0001, "loss": 2.1677, "step": 67 }, { "epoch": 1.0793650793650793, "grad_norm": 0.28545236587524414, "learning_rate": 0.0001, "loss": 2.1038, "step": 68 }, { "epoch": 1.0952380952380953, "grad_norm": 0.28611037135124207, "learning_rate": 0.0001, "loss": 2.0701, "step": 69 }, { "epoch": 1.1111111111111112, "grad_norm": 0.2909831404685974, "learning_rate": 0.0001, "loss": 2.1483, "step": 70 }, { "epoch": 1.126984126984127, "grad_norm": 0.29801449179649353, "learning_rate": 0.0001, "loss": 2.0127, "step": 71 }, { "epoch": 1.1428571428571428, "grad_norm": 0.3016774356365204, "learning_rate": 0.0001, "loss": 2.072, "step": 72 }, { "epoch": 1.1587301587301586, "grad_norm": 0.302207887172699, "learning_rate": 0.0001, "loss": 2.0818, "step": 73 }, { "epoch": 1.1746031746031746, "grad_norm": 0.3024630844593048, "learning_rate": 0.0001, "loss": 2.0775, "step": 74 }, { "epoch": 1.1904761904761905, "grad_norm": 0.3113912045955658, "learning_rate": 0.0001, "loss": 2.0484, "step": 75 }, { "epoch": 1.2063492063492063, "grad_norm": 0.3100714385509491, "learning_rate": 0.0001, "loss": 2.0322, "step": 76 }, { "epoch": 1.2222222222222223, "grad_norm": 0.31157463788986206, "learning_rate": 0.0001, "loss": 2.0123, "step": 77 }, { "epoch": 1.2380952380952381, "grad_norm": 0.30897048115730286, "learning_rate": 0.0001, "loss": 2.0196, "step": 78 }, { "epoch": 1.253968253968254, "grad_norm": 0.32494208216667175, "learning_rate": 0.0001, "loss": 2.1164, "step": 79 }, { "epoch": 1.2698412698412698, "grad_norm": 0.3228481113910675, "learning_rate": 0.0001, "loss": 2.0119, "step": 80 }, { "epoch": 1.2857142857142856, "grad_norm": 0.31433093547821045, "learning_rate": 0.0001, "loss": 1.9773, "step": 81 }, { "epoch": 1.3015873015873016, "grad_norm": 0.33841848373413086, "learning_rate": 0.0001, "loss": 2.0676, "step": 82 }, { "epoch": 1.3174603174603174, "grad_norm": 0.33233773708343506, "learning_rate": 0.0001, "loss": 2.0717, "step": 83 }, { "epoch": 1.3333333333333333, "grad_norm": 0.34447962045669556, "learning_rate": 0.0001, "loss": 2.1348, "step": 84 }, { "epoch": 1.3492063492063493, "grad_norm": 0.33957433700561523, "learning_rate": 0.0001, "loss": 2.0339, "step": 85 }, { "epoch": 1.3650793650793651, "grad_norm": 0.3519391715526581, "learning_rate": 0.0001, "loss": 2.0317, "step": 86 }, { "epoch": 1.380952380952381, "grad_norm": 0.3266676068305969, "learning_rate": 0.0001, "loss": 1.9932, "step": 87 }, { "epoch": 1.3968253968253967, "grad_norm": 0.3306258022785187, "learning_rate": 0.0001, "loss": 1.9695, "step": 88 }, { "epoch": 1.4126984126984126, "grad_norm": 0.3315640389919281, "learning_rate": 0.0001, "loss": 1.9975, "step": 89 }, { "epoch": 1.4285714285714286, "grad_norm": 0.33888551592826843, "learning_rate": 0.0001, "loss": 2.0804, "step": 90 }, { "epoch": 1.4444444444444444, "grad_norm": 0.3356088399887085, "learning_rate": 0.0001, "loss": 2.0882, "step": 91 }, { "epoch": 1.4603174603174602, "grad_norm": 0.3448535203933716, "learning_rate": 0.0001, "loss": 2.0641, "step": 92 }, { "epoch": 1.4761904761904763, "grad_norm": 0.33544135093688965, "learning_rate": 0.0001, "loss": 2.0657, "step": 93 }, { "epoch": 1.492063492063492, "grad_norm": 0.32962003350257874, "learning_rate": 0.0001, "loss": 1.9702, "step": 94 }, { "epoch": 1.507936507936508, "grad_norm": 0.3348037898540497, "learning_rate": 0.0001, "loss": 2.0495, "step": 95 }, { "epoch": 1.5238095238095237, "grad_norm": 0.35935646295547485, "learning_rate": 0.0001, "loss": 2.0343, "step": 96 }, { "epoch": 1.5396825396825395, "grad_norm": 0.35089877247810364, "learning_rate": 0.0001, "loss": 1.9913, "step": 97 }, { "epoch": 1.5555555555555556, "grad_norm": 0.35951006412506104, "learning_rate": 0.0001, "loss": 2.036, "step": 98 }, { "epoch": 1.5714285714285714, "grad_norm": 0.3543176054954529, "learning_rate": 0.0001, "loss": 2.0005, "step": 99 }, { "epoch": 1.5873015873015874, "grad_norm": 0.34387317299842834, "learning_rate": 0.0001, "loss": 2.0283, "step": 100 }, { "epoch": 1.6031746031746033, "grad_norm": 0.3506092429161072, "learning_rate": 0.0001, "loss": 2.0301, "step": 101 }, { "epoch": 1.619047619047619, "grad_norm": 0.3446710705757141, "learning_rate": 0.0001, "loss": 1.9749, "step": 102 }, { "epoch": 1.6349206349206349, "grad_norm": 0.3415977656841278, "learning_rate": 0.0001, "loss": 1.9853, "step": 103 }, { "epoch": 1.6507936507936507, "grad_norm": 0.3609831631183624, "learning_rate": 0.0001, "loss": 2.0477, "step": 104 }, { "epoch": 1.6666666666666665, "grad_norm": 0.35015106201171875, "learning_rate": 0.0001, "loss": 2.0356, "step": 105 }, { "epoch": 1.6825396825396826, "grad_norm": 0.3593216836452484, "learning_rate": 0.0001, "loss": 1.8311, "step": 106 }, { "epoch": 1.6984126984126984, "grad_norm": 0.35230639576911926, "learning_rate": 0.0001, "loss": 1.9562, "step": 107 }, { "epoch": 1.7142857142857144, "grad_norm": 0.3519164025783539, "learning_rate": 0.0001, "loss": 2.0257, "step": 108 }, { "epoch": 1.7301587301587302, "grad_norm": 0.3627997934818268, "learning_rate": 0.0001, "loss": 2.0012, "step": 109 }, { "epoch": 1.746031746031746, "grad_norm": 0.37876084446907043, "learning_rate": 0.0001, "loss": 1.9746, "step": 110 }, { "epoch": 1.7619047619047619, "grad_norm": 0.37692707777023315, "learning_rate": 0.0001, "loss": 2.0422, "step": 111 }, { "epoch": 1.7777777777777777, "grad_norm": 0.3628651201725006, "learning_rate": 0.0001, "loss": 1.9981, "step": 112 }, { "epoch": 1.7936507936507935, "grad_norm": 0.35194242000579834, "learning_rate": 0.0001, "loss": 1.9343, "step": 113 }, { "epoch": 1.8095238095238095, "grad_norm": 0.36310839653015137, "learning_rate": 0.0001, "loss": 2.0421, "step": 114 }, { "epoch": 1.8253968253968254, "grad_norm": 0.3339640200138092, "learning_rate": 0.0001, "loss": 1.9606, "step": 115 }, { "epoch": 1.8412698412698414, "grad_norm": 0.3632456958293915, "learning_rate": 0.0001, "loss": 1.8558, "step": 116 }, { "epoch": 1.8571428571428572, "grad_norm": 0.36308878660202026, "learning_rate": 0.0001, "loss": 2.0127, "step": 117 }, { "epoch": 1.873015873015873, "grad_norm": 0.3537601828575134, "learning_rate": 0.0001, "loss": 1.9704, "step": 118 }, { "epoch": 1.8888888888888888, "grad_norm": 0.3581252098083496, "learning_rate": 0.0001, "loss": 1.9086, "step": 119 }, { "epoch": 1.9047619047619047, "grad_norm": 0.37275415658950806, "learning_rate": 0.0001, "loss": 2.0914, "step": 120 }, { "epoch": 1.9206349206349205, "grad_norm": 0.36517035961151123, "learning_rate": 0.0001, "loss": 1.9529, "step": 121 }, { "epoch": 1.9365079365079365, "grad_norm": 0.3566952049732208, "learning_rate": 0.0001, "loss": 1.8498, "step": 122 }, { "epoch": 1.9523809523809523, "grad_norm": 0.36236587166786194, "learning_rate": 0.0001, "loss": 1.9652, "step": 123 }, { "epoch": 1.9682539682539684, "grad_norm": 0.3652929663658142, "learning_rate": 0.0001, "loss": 2.0085, "step": 124 }, { "epoch": 1.9841269841269842, "grad_norm": 0.3546443581581116, "learning_rate": 0.0001, "loss": 1.9192, "step": 125 }, { "epoch": 2.0, "grad_norm": 0.3752649128437042, "learning_rate": 0.0001, "loss": 1.9932, "step": 126 }, { "epoch": 2.0, "eval_loss": 1.9884382486343384, "eval_runtime": 134.41, "eval_samples_per_second": 144.751, "eval_steps_per_second": 1.131, "step": 126 }, { "epoch": 2.015873015873016, "grad_norm": 0.35680314898490906, "learning_rate": 0.0001, "loss": 2.0274, "step": 127 }, { "epoch": 2.0317460317460316, "grad_norm": 0.34379494190216064, "learning_rate": 0.0001, "loss": 1.944, "step": 128 }, { "epoch": 2.0476190476190474, "grad_norm": 0.351840615272522, "learning_rate": 0.0001, "loss": 1.9053, "step": 129 }, { "epoch": 2.0634920634920633, "grad_norm": 0.38462555408477783, "learning_rate": 0.0001, "loss": 1.9082, "step": 130 }, { "epoch": 2.0793650793650795, "grad_norm": 0.37032756209373474, "learning_rate": 0.0001, "loss": 1.9048, "step": 131 }, { "epoch": 2.0952380952380953, "grad_norm": 0.36423760652542114, "learning_rate": 0.0001, "loss": 1.9041, "step": 132 }, { "epoch": 2.111111111111111, "grad_norm": 0.36103999614715576, "learning_rate": 0.0001, "loss": 1.7832, "step": 133 }, { "epoch": 2.126984126984127, "grad_norm": 0.36970341205596924, "learning_rate": 0.0001, "loss": 1.8562, "step": 134 }, { "epoch": 2.142857142857143, "grad_norm": 0.38281646370887756, "learning_rate": 0.0001, "loss": 1.8671, "step": 135 }, { "epoch": 2.1587301587301586, "grad_norm": 0.4053700864315033, "learning_rate": 0.0001, "loss": 2.0125, "step": 136 }, { "epoch": 2.1746031746031744, "grad_norm": 0.40937232971191406, "learning_rate": 0.0001, "loss": 1.9053, "step": 137 }, { "epoch": 2.1904761904761907, "grad_norm": 0.39173007011413574, "learning_rate": 0.0001, "loss": 1.9645, "step": 138 }, { "epoch": 2.2063492063492065, "grad_norm": 0.39559757709503174, "learning_rate": 0.0001, "loss": 1.9285, "step": 139 }, { "epoch": 2.2222222222222223, "grad_norm": 0.4081740975379944, "learning_rate": 0.0001, "loss": 1.8501, "step": 140 }, { "epoch": 2.238095238095238, "grad_norm": 0.4095621705055237, "learning_rate": 0.0001, "loss": 1.9005, "step": 141 }, { "epoch": 2.253968253968254, "grad_norm": 0.4242890179157257, "learning_rate": 0.0001, "loss": 1.8266, "step": 142 }, { "epoch": 2.2698412698412698, "grad_norm": 0.4077770709991455, "learning_rate": 0.0001, "loss": 1.8656, "step": 143 }, { "epoch": 2.2857142857142856, "grad_norm": 0.4218343496322632, "learning_rate": 0.0001, "loss": 1.9931, "step": 144 }, { "epoch": 2.3015873015873014, "grad_norm": 0.3991089463233948, "learning_rate": 0.0001, "loss": 1.8629, "step": 145 }, { "epoch": 2.317460317460317, "grad_norm": 0.42081835865974426, "learning_rate": 0.0001, "loss": 1.8855, "step": 146 }, { "epoch": 2.3333333333333335, "grad_norm": 0.4178907573223114, "learning_rate": 0.0001, "loss": 1.8834, "step": 147 }, { "epoch": 2.3492063492063493, "grad_norm": 0.4098721146583557, "learning_rate": 0.0001, "loss": 1.7691, "step": 148 }, { "epoch": 2.365079365079365, "grad_norm": 0.4463600814342499, "learning_rate": 0.0001, "loss": 2.0333, "step": 149 }, { "epoch": 2.380952380952381, "grad_norm": 0.4598867893218994, "learning_rate": 0.0001, "loss": 1.8711, "step": 150 }, { "epoch": 2.3968253968253967, "grad_norm": 0.42520642280578613, "learning_rate": 0.0001, "loss": 1.8435, "step": 151 }, { "epoch": 2.4126984126984126, "grad_norm": 0.43913593888282776, "learning_rate": 0.0001, "loss": 1.9094, "step": 152 }, { "epoch": 2.4285714285714284, "grad_norm": 0.4239555299282074, "learning_rate": 0.0001, "loss": 1.795, "step": 153 }, { "epoch": 2.4444444444444446, "grad_norm": 0.43195414543151855, "learning_rate": 0.0001, "loss": 1.9467, "step": 154 }, { "epoch": 2.4603174603174605, "grad_norm": 0.42905962467193604, "learning_rate": 0.0001, "loss": 1.8613, "step": 155 }, { "epoch": 2.4761904761904763, "grad_norm": 0.44274038076400757, "learning_rate": 0.0001, "loss": 1.9131, "step": 156 }, { "epoch": 2.492063492063492, "grad_norm": 0.42327380180358887, "learning_rate": 0.0001, "loss": 1.8945, "step": 157 }, { "epoch": 2.507936507936508, "grad_norm": 0.43506136536598206, "learning_rate": 0.0001, "loss": 1.8626, "step": 158 }, { "epoch": 2.5238095238095237, "grad_norm": 0.4368724524974823, "learning_rate": 0.0001, "loss": 1.8973, "step": 159 }, { "epoch": 2.5396825396825395, "grad_norm": 0.43993279337882996, "learning_rate": 0.0001, "loss": 1.8404, "step": 160 }, { "epoch": 2.5555555555555554, "grad_norm": 0.43752744793891907, "learning_rate": 0.0001, "loss": 1.9374, "step": 161 }, { "epoch": 2.571428571428571, "grad_norm": 0.4344784915447235, "learning_rate": 0.0001, "loss": 1.8943, "step": 162 }, { "epoch": 2.5873015873015874, "grad_norm": 0.41896679997444153, "learning_rate": 0.0001, "loss": 1.8438, "step": 163 }, { "epoch": 2.6031746031746033, "grad_norm": 0.43420085310935974, "learning_rate": 0.0001, "loss": 1.9431, "step": 164 }, { "epoch": 2.619047619047619, "grad_norm": 0.444305419921875, "learning_rate": 0.0001, "loss": 1.8149, "step": 165 }, { "epoch": 2.634920634920635, "grad_norm": 0.44758594036102295, "learning_rate": 0.0001, "loss": 1.8166, "step": 166 }, { "epoch": 2.6507936507936507, "grad_norm": 0.44555145502090454, "learning_rate": 0.0001, "loss": 1.9017, "step": 167 }, { "epoch": 2.6666666666666665, "grad_norm": 0.4599977135658264, "learning_rate": 0.0001, "loss": 1.8222, "step": 168 }, { "epoch": 2.682539682539683, "grad_norm": 0.4442943036556244, "learning_rate": 0.0001, "loss": 1.8178, "step": 169 }, { "epoch": 2.6984126984126986, "grad_norm": 0.4303114116191864, "learning_rate": 0.0001, "loss": 1.8585, "step": 170 }, { "epoch": 2.7142857142857144, "grad_norm": 0.45859652757644653, "learning_rate": 0.0001, "loss": 1.8548, "step": 171 }, { "epoch": 2.7301587301587302, "grad_norm": 0.449211448431015, "learning_rate": 0.0001, "loss": 1.7996, "step": 172 }, { "epoch": 2.746031746031746, "grad_norm": 0.458112508058548, "learning_rate": 0.0001, "loss": 1.8924, "step": 173 }, { "epoch": 2.761904761904762, "grad_norm": 0.4440052807331085, "learning_rate": 0.0001, "loss": 1.8414, "step": 174 }, { "epoch": 2.7777777777777777, "grad_norm": 0.4355706572532654, "learning_rate": 0.0001, "loss": 1.8404, "step": 175 }, { "epoch": 2.7936507936507935, "grad_norm": 0.4662310779094696, "learning_rate": 0.0001, "loss": 1.9346, "step": 176 }, { "epoch": 2.8095238095238093, "grad_norm": 0.44040319323539734, "learning_rate": 0.0001, "loss": 1.8736, "step": 177 }, { "epoch": 2.825396825396825, "grad_norm": 0.4506472945213318, "learning_rate": 0.0001, "loss": 1.9329, "step": 178 }, { "epoch": 2.8412698412698414, "grad_norm": 0.44710540771484375, "learning_rate": 0.0001, "loss": 1.8787, "step": 179 }, { "epoch": 2.857142857142857, "grad_norm": 0.43612954020500183, "learning_rate": 0.0001, "loss": 1.8405, "step": 180 }, { "epoch": 2.873015873015873, "grad_norm": 0.4578387439250946, "learning_rate": 0.0001, "loss": 1.746, "step": 181 }, { "epoch": 2.888888888888889, "grad_norm": 0.4509958326816559, "learning_rate": 0.0001, "loss": 1.8539, "step": 182 }, { "epoch": 2.9047619047619047, "grad_norm": 0.44833552837371826, "learning_rate": 0.0001, "loss": 1.8407, "step": 183 }, { "epoch": 2.9206349206349205, "grad_norm": 0.4552953541278839, "learning_rate": 0.0001, "loss": 1.8312, "step": 184 }, { "epoch": 2.9365079365079367, "grad_norm": 0.4556235671043396, "learning_rate": 0.0001, "loss": 1.8324, "step": 185 }, { "epoch": 2.9523809523809526, "grad_norm": 0.44649648666381836, "learning_rate": 0.0001, "loss": 1.9377, "step": 186 }, { "epoch": 2.9682539682539684, "grad_norm": 0.47388994693756104, "learning_rate": 0.0001, "loss": 1.9287, "step": 187 }, { "epoch": 2.984126984126984, "grad_norm": 0.43568143248558044, "learning_rate": 0.0001, "loss": 1.854, "step": 188 }, { "epoch": 3.0, "grad_norm": 0.44688650965690613, "learning_rate": 0.0001, "loss": 1.8136, "step": 189 }, { "epoch": 3.0, "eval_loss": 1.9275089502334595, "eval_runtime": 134.3105, "eval_samples_per_second": 144.858, "eval_steps_per_second": 1.132, "step": 189 } ], "logging_steps": 1, "max_steps": 504, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0791279677637919e+18, "train_batch_size": 128, "trial_name": null, "trial_params": null }