{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 8.0,
  "eval_steps": 500,
  "global_step": 504,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0,
      "eval_loss": 4.596632480621338,
      "eval_runtime": 132.3152,
      "eval_samples_per_second": 147.043,
      "eval_steps_per_second": 1.149,
      "step": 0
    },
    {
      "epoch": 0.015873015873015872,
      "grad_norm": 0.26967182755470276,
      "learning_rate": 0.0001,
      "loss": 4.612,
      "step": 1
    },
    {
      "epoch": 0.031746031746031744,
      "grad_norm": 0.2898576855659485,
      "learning_rate": 0.0001,
      "loss": 4.4349,
      "step": 2
    },
    {
      "epoch": 0.047619047619047616,
      "grad_norm": 0.2901840806007385,
      "learning_rate": 0.0001,
      "loss": 4.5047,
      "step": 3
    },
    {
      "epoch": 0.06349206349206349,
      "grad_norm": 0.3495827615261078,
      "learning_rate": 0.0001,
      "loss": 4.4469,
      "step": 4
    },
    {
      "epoch": 0.07936507936507936,
      "grad_norm": 0.39650025963783264,
      "learning_rate": 0.0001,
      "loss": 4.526,
      "step": 5
    },
    {
      "epoch": 0.09523809523809523,
      "grad_norm": 0.4414258897304535,
      "learning_rate": 0.0001,
      "loss": 4.546,
      "step": 6
    },
    {
      "epoch": 0.1111111111111111,
      "grad_norm": 0.4898076057434082,
      "learning_rate": 0.0001,
      "loss": 4.4131,
      "step": 7
    },
    {
      "epoch": 0.12698412698412698,
      "grad_norm": 0.48589834570884705,
      "learning_rate": 0.0001,
      "loss": 4.3135,
      "step": 8
    },
    {
      "epoch": 0.14285714285714285,
      "grad_norm": 0.5929457545280457,
      "learning_rate": 0.0001,
      "loss": 4.3235,
      "step": 9
    },
    {
      "epoch": 0.15873015873015872,
      "grad_norm": 0.6119698286056519,
      "learning_rate": 0.0001,
      "loss": 4.3461,
      "step": 10
    },
    {
      "epoch": 0.1746031746031746,
      "grad_norm": 0.6312505006790161,
      "learning_rate": 0.0001,
      "loss": 4.214,
      "step": 11
    },
    {
      "epoch": 0.19047619047619047,
      "grad_norm": 0.6929273009300232,
      "learning_rate": 0.0001,
      "loss": 4.1427,
      "step": 12
    },
    {
      "epoch": 0.20634920634920634,
      "grad_norm": 0.7550596594810486,
      "learning_rate": 0.0001,
      "loss": 4.1668,
      "step": 13
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 0.6299394965171814,
      "learning_rate": 0.0001,
      "loss": 3.9331,
      "step": 14
    },
    {
      "epoch": 0.23809523809523808,
      "grad_norm": 0.692095160484314,
      "learning_rate": 0.0001,
      "loss": 3.9224,
      "step": 15
    },
    {
      "epoch": 0.25396825396825395,
      "grad_norm": 0.955380916595459,
      "learning_rate": 0.0001,
      "loss": 3.9012,
      "step": 16
    },
    {
      "epoch": 0.2698412698412698,
      "grad_norm": 0.8210205435752869,
      "learning_rate": 0.0001,
      "loss": 3.8245,
      "step": 17
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 0.6235060095787048,
      "learning_rate": 0.0001,
      "loss": 3.6529,
      "step": 18
    },
    {
      "epoch": 0.30158730158730157,
      "grad_norm": 0.5264109373092651,
      "learning_rate": 0.0001,
      "loss": 3.7113,
      "step": 19
    },
    {
      "epoch": 0.31746031746031744,
      "grad_norm": 0.7998248338699341,
      "learning_rate": 0.0001,
      "loss": 3.525,
      "step": 20
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 0.8079671859741211,
      "learning_rate": 0.0001,
      "loss": 3.5775,
      "step": 21
    },
    {
      "epoch": 0.3492063492063492,
      "grad_norm": 0.5628694295883179,
      "learning_rate": 0.0001,
      "loss": 3.5531,
      "step": 22
    },
    {
      "epoch": 0.36507936507936506,
      "grad_norm": 1.1438932418823242,
      "learning_rate": 0.0001,
      "loss": 3.5744,
      "step": 23
    },
    {
      "epoch": 0.38095238095238093,
      "grad_norm": 0.8720822930335999,
      "learning_rate": 0.0001,
      "loss": 3.5822,
      "step": 24
    },
    {
      "epoch": 0.3968253968253968,
      "grad_norm": 1.0421593189239502,
      "learning_rate": 0.0001,
      "loss": 3.5075,
      "step": 25
    },
    {
      "epoch": 0.4126984126984127,
      "grad_norm": 0.6795033812522888,
      "learning_rate": 0.0001,
      "loss": 3.4547,
      "step": 26
    },
    {
      "epoch": 0.42857142857142855,
      "grad_norm": 0.6736836433410645,
      "learning_rate": 0.0001,
      "loss": 3.4874,
      "step": 27
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 0.5057028532028198,
      "learning_rate": 0.0001,
      "loss": 3.4797,
      "step": 28
    },
    {
      "epoch": 0.4603174603174603,
      "grad_norm": 0.49980196356773376,
      "learning_rate": 0.0001,
      "loss": 3.3364,
      "step": 29
    },
    {
      "epoch": 0.47619047619047616,
      "grad_norm": 0.4449177384376526,
      "learning_rate": 0.0001,
      "loss": 3.2993,
      "step": 30
    },
    {
      "epoch": 0.49206349206349204,
      "grad_norm": 0.3823322057723999,
      "learning_rate": 0.0001,
      "loss": 3.3223,
      "step": 31
    },
    {
      "epoch": 0.5079365079365079,
      "grad_norm": 0.3255627453327179,
      "learning_rate": 0.0001,
      "loss": 3.3542,
      "step": 32
    },
    {
      "epoch": 0.5238095238095238,
      "grad_norm": 0.3236791491508484,
      "learning_rate": 0.0001,
      "loss": 3.2756,
      "step": 33
    },
    {
      "epoch": 0.5396825396825397,
      "grad_norm": 0.3119717240333557,
      "learning_rate": 0.0001,
      "loss": 3.1773,
      "step": 34
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 0.30881795287132263,
      "learning_rate": 0.0001,
      "loss": 3.2265,
      "step": 35
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 0.30378207564353943,
      "learning_rate": 0.0001,
      "loss": 3.0821,
      "step": 36
    },
    {
      "epoch": 0.5873015873015873,
      "grad_norm": 0.35712969303131104,
      "learning_rate": 0.0001,
      "loss": 3.2201,
      "step": 37
    },
    {
      "epoch": 0.6031746031746031,
      "grad_norm": 0.30827978253364563,
      "learning_rate": 0.0001,
      "loss": 3.135,
      "step": 38
    },
    {
      "epoch": 0.6190476190476191,
      "grad_norm": 0.3056108057498932,
      "learning_rate": 0.0001,
      "loss": 3.1543,
      "step": 39
    },
    {
      "epoch": 0.6349206349206349,
      "grad_norm": 0.3181809186935425,
      "learning_rate": 0.0001,
      "loss": 3.1409,
      "step": 40
    },
    {
      "epoch": 0.6507936507936508,
      "grad_norm": 0.32424548268318176,
      "learning_rate": 0.0001,
      "loss": 3.0388,
      "step": 41
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.3107524514198303,
      "learning_rate": 0.0001,
      "loss": 3.0717,
      "step": 42
    },
    {
      "epoch": 0.6825396825396826,
      "grad_norm": 0.30104756355285645,
      "learning_rate": 0.0001,
      "loss": 3.0217,
      "step": 43
    },
    {
      "epoch": 0.6984126984126984,
      "grad_norm": 0.2986432909965515,
      "learning_rate": 0.0001,
      "loss": 2.9845,
      "step": 44
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.3059774339199066,
      "learning_rate": 0.0001,
      "loss": 2.8926,
      "step": 45
    },
    {
      "epoch": 0.7301587301587301,
      "grad_norm": 0.3167428970336914,
      "learning_rate": 0.0001,
      "loss": 2.9162,
      "step": 46
    },
    {
      "epoch": 0.746031746031746,
      "grad_norm": 0.3148072361946106,
      "learning_rate": 0.0001,
      "loss": 3.0487,
      "step": 47
    },
    {
      "epoch": 0.7619047619047619,
      "grad_norm": 0.35918205976486206,
      "learning_rate": 0.0001,
      "loss": 3.037,
      "step": 48
    },
    {
      "epoch": 0.7777777777777778,
      "grad_norm": 0.3954469561576843,
      "learning_rate": 0.0001,
      "loss": 2.8684,
      "step": 49
    },
    {
      "epoch": 0.7936507936507936,
      "grad_norm": 0.33450567722320557,
      "learning_rate": 0.0001,
      "loss": 2.9808,
      "step": 50
    },
    {
      "epoch": 0.8095238095238095,
      "grad_norm": 0.35196173191070557,
      "learning_rate": 0.0001,
      "loss": 2.9072,
      "step": 51
    },
    {
      "epoch": 0.8253968253968254,
      "grad_norm": 0.3617725670337677,
      "learning_rate": 0.0001,
      "loss": 2.8938,
      "step": 52
    },
    {
      "epoch": 0.8412698412698413,
      "grad_norm": 0.377817302942276,
      "learning_rate": 0.0001,
      "loss": 2.8316,
      "step": 53
    },
    {
      "epoch": 0.8571428571428571,
      "grad_norm": 0.3597480058670044,
      "learning_rate": 0.0001,
      "loss": 2.8602,
      "step": 54
    },
    {
      "epoch": 0.873015873015873,
      "grad_norm": 0.37252452969551086,
      "learning_rate": 0.0001,
      "loss": 2.8926,
      "step": 55
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.36722978949546814,
      "learning_rate": 0.0001,
      "loss": 2.983,
      "step": 56
    },
    {
      "epoch": 0.9047619047619048,
      "grad_norm": 0.37402233481407166,
      "learning_rate": 0.0001,
      "loss": 2.8777,
      "step": 57
    },
    {
      "epoch": 0.9206349206349206,
      "grad_norm": 0.31507572531700134,
      "learning_rate": 0.0001,
      "loss": 2.9208,
      "step": 58
    },
    {
      "epoch": 0.9365079365079365,
      "grad_norm": 0.35631242394447327,
      "learning_rate": 0.0001,
      "loss": 2.8828,
      "step": 59
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 0.33693110942840576,
      "learning_rate": 0.0001,
      "loss": 2.8233,
      "step": 60
    },
    {
      "epoch": 0.9682539682539683,
      "grad_norm": 0.323428750038147,
      "learning_rate": 0.0001,
      "loss": 2.9311,
      "step": 61
    },
    {
      "epoch": 0.9841269841269841,
      "grad_norm": 0.3073025643825531,
      "learning_rate": 0.0001,
      "loss": 2.7934,
      "step": 62
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.31816455721855164,
      "learning_rate": 0.0001,
      "loss": 2.8395,
      "step": 63
    },
    {
      "epoch": 1.0,
      "eval_loss": 2.8033106327056885,
      "eval_runtime": 131.4471,
      "eval_samples_per_second": 148.014,
      "eval_steps_per_second": 1.156,
      "step": 63
    },
    {
      "epoch": 1.0158730158730158,
      "grad_norm": 0.3218700587749481,
      "learning_rate": 0.0001,
      "loss": 2.6772,
      "step": 64
    },
    {
      "epoch": 1.0317460317460316,
      "grad_norm": 0.33239927887916565,
      "learning_rate": 0.0001,
      "loss": 2.8064,
      "step": 65
    },
    {
      "epoch": 1.0476190476190477,
      "grad_norm": 0.3222041726112366,
      "learning_rate": 0.0001,
      "loss": 2.7618,
      "step": 66
    },
    {
      "epoch": 1.0634920634920635,
      "grad_norm": 0.3218175172805786,
      "learning_rate": 0.0001,
      "loss": 2.823,
      "step": 67
    },
    {
      "epoch": 1.0793650793650793,
      "grad_norm": 0.3080641031265259,
      "learning_rate": 0.0001,
      "loss": 2.6865,
      "step": 68
    },
    {
      "epoch": 1.0952380952380953,
      "grad_norm": 0.31820255517959595,
      "learning_rate": 0.0001,
      "loss": 2.7326,
      "step": 69
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 0.3285914361476898,
      "learning_rate": 0.0001,
      "loss": 2.8175,
      "step": 70
    },
    {
      "epoch": 1.126984126984127,
      "grad_norm": 0.34610089659690857,
      "learning_rate": 0.0001,
      "loss": 2.642,
      "step": 71
    },
    {
      "epoch": 1.1428571428571428,
      "grad_norm": 0.3352147042751312,
      "learning_rate": 0.0001,
      "loss": 2.6517,
      "step": 72
    },
    {
      "epoch": 1.1587301587301586,
      "grad_norm": 0.3385539948940277,
      "learning_rate": 0.0001,
      "loss": 2.6797,
      "step": 73
    },
    {
      "epoch": 1.1746031746031746,
      "grad_norm": 0.3544485867023468,
      "learning_rate": 0.0001,
      "loss": 2.8208,
      "step": 74
    },
    {
      "epoch": 1.1904761904761905,
      "grad_norm": 0.35244956612586975,
      "learning_rate": 0.0001,
      "loss": 2.7275,
      "step": 75
    },
    {
      "epoch": 1.2063492063492063,
      "grad_norm": 0.3411391079425812,
      "learning_rate": 0.0001,
      "loss": 2.5638,
      "step": 76
    },
    {
      "epoch": 1.2222222222222223,
      "grad_norm": 0.358157753944397,
      "learning_rate": 0.0001,
      "loss": 2.6367,
      "step": 77
    },
    {
      "epoch": 1.2380952380952381,
      "grad_norm": 0.35242363810539246,
      "learning_rate": 0.0001,
      "loss": 2.6895,
      "step": 78
    },
    {
      "epoch": 1.253968253968254,
      "grad_norm": 0.3461464047431946,
      "learning_rate": 0.0001,
      "loss": 2.7364,
      "step": 79
    },
    {
      "epoch": 1.2698412698412698,
      "grad_norm": 0.33865585923194885,
      "learning_rate": 0.0001,
      "loss": 2.6523,
      "step": 80
    },
    {
      "epoch": 1.2857142857142856,
      "grad_norm": 0.3517827093601227,
      "learning_rate": 0.0001,
      "loss": 2.573,
      "step": 81
    },
    {
      "epoch": 1.3015873015873016,
      "grad_norm": 0.3442043960094452,
      "learning_rate": 0.0001,
      "loss": 2.6348,
      "step": 82
    },
    {
      "epoch": 1.3174603174603174,
      "grad_norm": 0.379457026720047,
      "learning_rate": 0.0001,
      "loss": 2.6806,
      "step": 83
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.3560670018196106,
      "learning_rate": 0.0001,
      "loss": 2.6734,
      "step": 84
    },
    {
      "epoch": 1.3492063492063493,
      "grad_norm": 0.3954298496246338,
      "learning_rate": 0.0001,
      "loss": 2.7085,
      "step": 85
    },
    {
      "epoch": 1.3650793650793651,
      "grad_norm": 0.3917439579963684,
      "learning_rate": 0.0001,
      "loss": 2.6951,
      "step": 86
    },
    {
      "epoch": 1.380952380952381,
      "grad_norm": 0.350482702255249,
      "learning_rate": 0.0001,
      "loss": 2.6786,
      "step": 87
    },
    {
      "epoch": 1.3968253968253967,
      "grad_norm": 0.3554580509662628,
      "learning_rate": 0.0001,
      "loss": 2.5309,
      "step": 88
    },
    {
      "epoch": 1.4126984126984126,
      "grad_norm": 0.3658367991447449,
      "learning_rate": 0.0001,
      "loss": 2.5796,
      "step": 89
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.354397177696228,
      "learning_rate": 0.0001,
      "loss": 2.6635,
      "step": 90
    },
    {
      "epoch": 1.4444444444444444,
      "grad_norm": 0.3648728132247925,
      "learning_rate": 0.0001,
      "loss": 2.6793,
      "step": 91
    },
    {
      "epoch": 1.4603174603174602,
      "grad_norm": 0.39242303371429443,
      "learning_rate": 0.0001,
      "loss": 2.6318,
      "step": 92
    },
    {
      "epoch": 1.4761904761904763,
      "grad_norm": 0.3823918104171753,
      "learning_rate": 0.0001,
      "loss": 2.509,
      "step": 93
    },
    {
      "epoch": 1.492063492063492,
      "grad_norm": 0.36895614862442017,
      "learning_rate": 0.0001,
      "loss": 2.5909,
      "step": 94
    },
    {
      "epoch": 1.507936507936508,
      "grad_norm": 0.3695090711116791,
      "learning_rate": 0.0001,
      "loss": 2.5639,
      "step": 95
    },
    {
      "epoch": 1.5238095238095237,
      "grad_norm": 0.37633731961250305,
      "learning_rate": 0.0001,
      "loss": 2.6351,
      "step": 96
    },
    {
      "epoch": 1.5396825396825395,
      "grad_norm": 0.3766629099845886,
      "learning_rate": 0.0001,
      "loss": 2.4803,
      "step": 97
    },
    {
      "epoch": 1.5555555555555556,
      "grad_norm": 0.3765883445739746,
      "learning_rate": 0.0001,
      "loss": 2.6615,
      "step": 98
    },
    {
      "epoch": 1.5714285714285714,
      "grad_norm": 0.3859737515449524,
      "learning_rate": 0.0001,
      "loss": 2.481,
      "step": 99
    },
    {
      "epoch": 1.5873015873015874,
      "grad_norm": 0.36387383937835693,
      "learning_rate": 0.0001,
      "loss": 2.5661,
      "step": 100
    },
    {
      "epoch": 1.6031746031746033,
      "grad_norm": 0.38114404678344727,
      "learning_rate": 0.0001,
      "loss": 2.6509,
      "step": 101
    },
    {
      "epoch": 1.619047619047619,
      "grad_norm": 0.3747798800468445,
      "learning_rate": 0.0001,
      "loss": 2.5248,
      "step": 102
    },
    {
      "epoch": 1.6349206349206349,
      "grad_norm": 0.39510586857795715,
      "learning_rate": 0.0001,
      "loss": 2.5784,
      "step": 103
    },
    {
      "epoch": 1.6507936507936507,
      "grad_norm": 0.4021095335483551,
      "learning_rate": 0.0001,
      "loss": 2.6515,
      "step": 104
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.3947932720184326,
      "learning_rate": 0.0001,
      "loss": 2.6109,
      "step": 105
    },
    {
      "epoch": 1.6825396825396826,
      "grad_norm": 0.39475512504577637,
      "learning_rate": 0.0001,
      "loss": 2.4172,
      "step": 106
    },
    {
      "epoch": 1.6984126984126984,
      "grad_norm": 0.38429784774780273,
      "learning_rate": 0.0001,
      "loss": 2.4828,
      "step": 107
    },
    {
      "epoch": 1.7142857142857144,
      "grad_norm": 0.39032769203186035,
      "learning_rate": 0.0001,
      "loss": 2.5088,
      "step": 108
    },
    {
      "epoch": 1.7301587301587302,
      "grad_norm": 0.3955644369125366,
      "learning_rate": 0.0001,
      "loss": 2.571,
      "step": 109
    },
    {
      "epoch": 1.746031746031746,
      "grad_norm": 0.3895743787288666,
      "learning_rate": 0.0001,
      "loss": 2.4817,
      "step": 110
    },
    {
      "epoch": 1.7619047619047619,
      "grad_norm": 0.41024699807167053,
      "learning_rate": 0.0001,
      "loss": 2.5888,
      "step": 111
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 0.42291179299354553,
      "learning_rate": 0.0001,
      "loss": 2.5777,
      "step": 112
    },
    {
      "epoch": 1.7936507936507935,
      "grad_norm": 0.40427684783935547,
      "learning_rate": 0.0001,
      "loss": 2.4627,
      "step": 113
    },
    {
      "epoch": 1.8095238095238095,
      "grad_norm": 0.4024043381214142,
      "learning_rate": 0.0001,
      "loss": 2.5497,
      "step": 114
    },
    {
      "epoch": 1.8253968253968254,
      "grad_norm": 0.3848831057548523,
      "learning_rate": 0.0001,
      "loss": 2.507,
      "step": 115
    },
    {
      "epoch": 1.8412698412698414,
      "grad_norm": 0.39639776945114136,
      "learning_rate": 0.0001,
      "loss": 2.4462,
      "step": 116
    },
    {
      "epoch": 1.8571428571428572,
      "grad_norm": 0.4169174134731293,
      "learning_rate": 0.0001,
      "loss": 2.5912,
      "step": 117
    },
    {
      "epoch": 1.873015873015873,
      "grad_norm": 0.39467406272888184,
      "learning_rate": 0.0001,
      "loss": 2.5258,
      "step": 118
    },
    {
      "epoch": 1.8888888888888888,
      "grad_norm": 0.39699482917785645,
      "learning_rate": 0.0001,
      "loss": 2.3845,
      "step": 119
    },
    {
      "epoch": 1.9047619047619047,
      "grad_norm": 0.40015560388565063,
      "learning_rate": 0.0001,
      "loss": 2.615,
      "step": 120
    },
    {
      "epoch": 1.9206349206349205,
      "grad_norm": 0.4159916937351227,
      "learning_rate": 0.0001,
      "loss": 2.4749,
      "step": 121
    },
    {
      "epoch": 1.9365079365079365,
      "grad_norm": 0.4075866937637329,
      "learning_rate": 0.0001,
      "loss": 2.3877,
      "step": 122
    },
    {
      "epoch": 1.9523809523809523,
      "grad_norm": 0.40443259477615356,
      "learning_rate": 0.0001,
      "loss": 2.4426,
      "step": 123
    },
    {
      "epoch": 1.9682539682539684,
      "grad_norm": 0.42848312854766846,
      "learning_rate": 0.0001,
      "loss": 2.4878,
      "step": 124
    },
    {
      "epoch": 1.9841269841269842,
      "grad_norm": 0.4103951156139374,
      "learning_rate": 0.0001,
      "loss": 2.4452,
      "step": 125
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.42405155301094055,
      "learning_rate": 0.0001,
      "loss": 2.5123,
      "step": 126
    },
    {
      "epoch": 2.0,
      "eval_loss": 2.508723258972168,
      "eval_runtime": 131.3173,
      "eval_samples_per_second": 148.16,
      "eval_steps_per_second": 1.158,
      "step": 126
    },
    {
      "epoch": 2.015873015873016,
      "grad_norm": 0.40521103143692017,
      "learning_rate": 0.0001,
      "loss": 2.4685,
      "step": 127
    },
    {
      "epoch": 2.0317460317460316,
      "grad_norm": 0.3934997022151947,
      "learning_rate": 0.0001,
      "loss": 2.4345,
      "step": 128
    },
    {
      "epoch": 2.0476190476190474,
      "grad_norm": 0.4023909568786621,
      "learning_rate": 0.0001,
      "loss": 2.4582,
      "step": 129
    },
    {
      "epoch": 2.0634920634920633,
      "grad_norm": 0.43362903594970703,
      "learning_rate": 0.0001,
      "loss": 2.4846,
      "step": 130
    },
    {
      "epoch": 2.0793650793650795,
      "grad_norm": 0.42157357931137085,
      "learning_rate": 0.0001,
      "loss": 2.4447,
      "step": 131
    },
    {
      "epoch": 2.0952380952380953,
      "grad_norm": 0.4321233332157135,
      "learning_rate": 0.0001,
      "loss": 2.4022,
      "step": 132
    },
    {
      "epoch": 2.111111111111111,
      "grad_norm": 0.424949049949646,
      "learning_rate": 0.0001,
      "loss": 2.3601,
      "step": 133
    },
    {
      "epoch": 2.126984126984127,
      "grad_norm": 0.42683982849121094,
      "learning_rate": 0.0001,
      "loss": 2.3331,
      "step": 134
    },
    {
      "epoch": 2.142857142857143,
      "grad_norm": 0.4327385425567627,
      "learning_rate": 0.0001,
      "loss": 2.3789,
      "step": 135
    },
    {
      "epoch": 2.1587301587301586,
      "grad_norm": 0.4611188471317291,
      "learning_rate": 0.0001,
      "loss": 2.5813,
      "step": 136
    },
    {
      "epoch": 2.1746031746031744,
      "grad_norm": 0.45626574754714966,
      "learning_rate": 0.0001,
      "loss": 2.4627,
      "step": 137
    },
    {
      "epoch": 2.1904761904761907,
      "grad_norm": 0.43329715728759766,
      "learning_rate": 0.0001,
      "loss": 2.4692,
      "step": 138
    },
    {
      "epoch": 2.2063492063492065,
      "grad_norm": 0.4390101730823517,
      "learning_rate": 0.0001,
      "loss": 2.4429,
      "step": 139
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 0.46726638078689575,
      "learning_rate": 0.0001,
      "loss": 2.436,
      "step": 140
    },
    {
      "epoch": 2.238095238095238,
      "grad_norm": 0.45958849787712097,
      "learning_rate": 0.0001,
      "loss": 2.4011,
      "step": 141
    },
    {
      "epoch": 2.253968253968254,
      "grad_norm": 0.4724540710449219,
      "learning_rate": 0.0001,
      "loss": 2.3135,
      "step": 142
    },
    {
      "epoch": 2.2698412698412698,
      "grad_norm": 0.46117934584617615,
      "learning_rate": 0.0001,
      "loss": 2.3877,
      "step": 143
    },
    {
      "epoch": 2.2857142857142856,
      "grad_norm": 0.4439626634120941,
      "learning_rate": 0.0001,
      "loss": 2.4332,
      "step": 144
    },
    {
      "epoch": 2.3015873015873014,
      "grad_norm": 0.45710358023643494,
      "learning_rate": 0.0001,
      "loss": 2.4141,
      "step": 145
    },
    {
      "epoch": 2.317460317460317,
      "grad_norm": 0.45968300104141235,
      "learning_rate": 0.0001,
      "loss": 2.3651,
      "step": 146
    },
    {
      "epoch": 2.3333333333333335,
      "grad_norm": 0.4726170599460602,
      "learning_rate": 0.0001,
      "loss": 2.3512,
      "step": 147
    },
    {
      "epoch": 2.3492063492063493,
      "grad_norm": 0.46369409561157227,
      "learning_rate": 0.0001,
      "loss": 2.2377,
      "step": 148
    },
    {
      "epoch": 2.365079365079365,
      "grad_norm": 0.47940853238105774,
      "learning_rate": 0.0001,
      "loss": 2.5072,
      "step": 149
    },
    {
      "epoch": 2.380952380952381,
      "grad_norm": 0.4780332148075104,
      "learning_rate": 0.0001,
      "loss": 2.3619,
      "step": 150
    },
    {
      "epoch": 2.3968253968253967,
      "grad_norm": 0.4739968776702881,
      "learning_rate": 0.0001,
      "loss": 2.3421,
      "step": 151
    },
    {
      "epoch": 2.4126984126984126,
      "grad_norm": 0.5014553070068359,
      "learning_rate": 0.0001,
      "loss": 2.4197,
      "step": 152
    },
    {
      "epoch": 2.4285714285714284,
      "grad_norm": 0.5013525485992432,
      "learning_rate": 0.0001,
      "loss": 2.3054,
      "step": 153
    },
    {
      "epoch": 2.4444444444444446,
      "grad_norm": 0.496791809797287,
      "learning_rate": 0.0001,
      "loss": 2.444,
      "step": 154
    },
    {
      "epoch": 2.4603174603174605,
      "grad_norm": 0.4828886091709137,
      "learning_rate": 0.0001,
      "loss": 2.3354,
      "step": 155
    },
    {
      "epoch": 2.4761904761904763,
      "grad_norm": 0.5080811381340027,
      "learning_rate": 0.0001,
      "loss": 2.4921,
      "step": 156
    },
    {
      "epoch": 2.492063492063492,
      "grad_norm": 0.4668521583080292,
      "learning_rate": 0.0001,
      "loss": 2.4057,
      "step": 157
    },
    {
      "epoch": 2.507936507936508,
      "grad_norm": 0.5066611170768738,
      "learning_rate": 0.0001,
      "loss": 2.3574,
      "step": 158
    },
    {
      "epoch": 2.5238095238095237,
      "grad_norm": 0.48983776569366455,
      "learning_rate": 0.0001,
      "loss": 2.4238,
      "step": 159
    },
    {
      "epoch": 2.5396825396825395,
      "grad_norm": 0.4692371189594269,
      "learning_rate": 0.0001,
      "loss": 2.3498,
      "step": 160
    },
    {
      "epoch": 2.5555555555555554,
      "grad_norm": 0.4888613820075989,
      "learning_rate": 0.0001,
      "loss": 2.3825,
      "step": 161
    },
    {
      "epoch": 2.571428571428571,
      "grad_norm": 0.487679123878479,
      "learning_rate": 0.0001,
      "loss": 2.3583,
      "step": 162
    },
    {
      "epoch": 2.5873015873015874,
      "grad_norm": 0.4797740578651428,
      "learning_rate": 0.0001,
      "loss": 2.2742,
      "step": 163
    },
    {
      "epoch": 2.6031746031746033,
      "grad_norm": 0.5052629113197327,
      "learning_rate": 0.0001,
      "loss": 2.4731,
      "step": 164
    },
    {
      "epoch": 2.619047619047619,
      "grad_norm": 0.515653669834137,
      "learning_rate": 0.0001,
      "loss": 2.2931,
      "step": 165
    },
    {
      "epoch": 2.634920634920635,
      "grad_norm": 0.5115556120872498,
      "learning_rate": 0.0001,
      "loss": 2.3343,
      "step": 166
    },
    {
      "epoch": 2.6507936507936507,
      "grad_norm": 0.5058712363243103,
      "learning_rate": 0.0001,
      "loss": 2.3427,
      "step": 167
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.5015872716903687,
      "learning_rate": 0.0001,
      "loss": 2.2843,
      "step": 168
    },
    {
      "epoch": 2.682539682539683,
      "grad_norm": 0.5232997536659241,
      "learning_rate": 0.0001,
      "loss": 2.3664,
      "step": 169
    },
    {
      "epoch": 2.6984126984126986,
      "grad_norm": 0.4920145273208618,
      "learning_rate": 0.0001,
      "loss": 2.2829,
      "step": 170
    },
    {
      "epoch": 2.7142857142857144,
      "grad_norm": 0.513135552406311,
      "learning_rate": 0.0001,
      "loss": 2.3496,
      "step": 171
    },
    {
      "epoch": 2.7301587301587302,
      "grad_norm": 0.5038886070251465,
      "learning_rate": 0.0001,
      "loss": 2.2757,
      "step": 172
    },
    {
      "epoch": 2.746031746031746,
      "grad_norm": 0.515469491481781,
      "learning_rate": 0.0001,
      "loss": 2.3699,
      "step": 173
    },
    {
      "epoch": 2.761904761904762,
      "grad_norm": 0.4982060492038727,
      "learning_rate": 0.0001,
      "loss": 2.369,
      "step": 174
    },
    {
      "epoch": 2.7777777777777777,
      "grad_norm": 0.4803526997566223,
      "learning_rate": 0.0001,
      "loss": 2.323,
      "step": 175
    },
    {
      "epoch": 2.7936507936507935,
      "grad_norm": 0.4883512854576111,
      "learning_rate": 0.0001,
      "loss": 2.3837,
      "step": 176
    },
    {
      "epoch": 2.8095238095238093,
      "grad_norm": 0.48442399501800537,
      "learning_rate": 0.0001,
      "loss": 2.3452,
      "step": 177
    },
    {
      "epoch": 2.825396825396825,
      "grad_norm": 0.49058425426483154,
      "learning_rate": 0.0001,
      "loss": 2.3554,
      "step": 178
    },
    {
      "epoch": 2.8412698412698414,
      "grad_norm": 0.5121751427650452,
      "learning_rate": 0.0001,
      "loss": 2.3755,
      "step": 179
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.48031556606292725,
      "learning_rate": 0.0001,
      "loss": 2.3302,
      "step": 180
    },
    {
      "epoch": 2.873015873015873,
      "grad_norm": 0.526944100856781,
      "learning_rate": 0.0001,
      "loss": 2.2234,
      "step": 181
    },
    {
      "epoch": 2.888888888888889,
      "grad_norm": 0.513681173324585,
      "learning_rate": 0.0001,
      "loss": 2.2488,
      "step": 182
    },
    {
      "epoch": 2.9047619047619047,
      "grad_norm": 0.5152313113212585,
      "learning_rate": 0.0001,
      "loss": 2.4144,
      "step": 183
    },
    {
      "epoch": 2.9206349206349205,
      "grad_norm": 0.504129946231842,
      "learning_rate": 0.0001,
      "loss": 2.3023,
      "step": 184
    },
    {
      "epoch": 2.9365079365079367,
      "grad_norm": 0.5012217164039612,
      "learning_rate": 0.0001,
      "loss": 2.3223,
      "step": 185
    },
    {
      "epoch": 2.9523809523809526,
      "grad_norm": 0.5341697335243225,
      "learning_rate": 0.0001,
      "loss": 2.3747,
      "step": 186
    },
    {
      "epoch": 2.9682539682539684,
      "grad_norm": 0.5258910655975342,
      "learning_rate": 0.0001,
      "loss": 2.3556,
      "step": 187
    },
    {
      "epoch": 2.984126984126984,
      "grad_norm": 0.5003915429115295,
      "learning_rate": 0.0001,
      "loss": 2.3331,
      "step": 188
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.5328536629676819,
      "learning_rate": 0.0001,
      "loss": 2.2383,
      "step": 189
    },
    {
      "epoch": 3.0,
      "eval_loss": 2.3905880451202393,
      "eval_runtime": 131.2361,
      "eval_samples_per_second": 148.252,
      "eval_steps_per_second": 1.158,
      "step": 189
    },
    {
      "epoch": 3.015873015873016,
      "grad_norm": 0.5069262385368347,
      "learning_rate": 0.0001,
      "loss": 2.224,
      "step": 190
    },
    {
      "epoch": 3.0317460317460316,
      "grad_norm": 0.5155209898948669,
      "learning_rate": 0.0001,
      "loss": 2.2273,
      "step": 191
    },
    {
      "epoch": 3.0476190476190474,
      "grad_norm": 0.5158161520957947,
      "learning_rate": 0.0001,
      "loss": 2.2146,
      "step": 192
    },
    {
      "epoch": 3.0634920634920633,
      "grad_norm": 0.50887531042099,
      "learning_rate": 0.0001,
      "loss": 2.2648,
      "step": 193
    },
    {
      "epoch": 3.0793650793650795,
      "grad_norm": 0.4977055490016937,
      "learning_rate": 0.0001,
      "loss": 2.1434,
      "step": 194
    },
    {
      "epoch": 3.0952380952380953,
      "grad_norm": 0.5491054654121399,
      "learning_rate": 0.0001,
      "loss": 2.3017,
      "step": 195
    },
    {
      "epoch": 3.111111111111111,
      "grad_norm": 0.5289337635040283,
      "learning_rate": 0.0001,
      "loss": 2.1921,
      "step": 196
    },
    {
      "epoch": 3.126984126984127,
      "grad_norm": 0.5454579591751099,
      "learning_rate": 0.0001,
      "loss": 2.1324,
      "step": 197
    },
    {
      "epoch": 3.142857142857143,
      "grad_norm": 0.5596087574958801,
      "learning_rate": 0.0001,
      "loss": 2.2861,
      "step": 198
    },
    {
      "epoch": 3.1587301587301586,
      "grad_norm": 0.5491024851799011,
      "learning_rate": 0.0001,
      "loss": 2.2208,
      "step": 199
    },
    {
      "epoch": 3.1746031746031744,
      "grad_norm": 0.5643439888954163,
      "learning_rate": 0.0001,
      "loss": 2.2757,
      "step": 200
    },
    {
      "epoch": 3.1904761904761907,
      "grad_norm": 0.5374104380607605,
      "learning_rate": 0.0001,
      "loss": 2.2096,
      "step": 201
    },
    {
      "epoch": 3.2063492063492065,
      "grad_norm": 0.5371658802032471,
      "learning_rate": 0.0001,
      "loss": 2.2569,
      "step": 202
    },
    {
      "epoch": 3.2222222222222223,
      "grad_norm": 0.5406343936920166,
      "learning_rate": 0.0001,
      "loss": 2.3481,
      "step": 203
    },
    {
      "epoch": 3.238095238095238,
      "grad_norm": 0.546499490737915,
      "learning_rate": 0.0001,
      "loss": 2.2431,
      "step": 204
    },
    {
      "epoch": 3.253968253968254,
      "grad_norm": 0.5593274235725403,
      "learning_rate": 0.0001,
      "loss": 2.2204,
      "step": 205
    },
    {
      "epoch": 3.2698412698412698,
      "grad_norm": 0.5756828784942627,
      "learning_rate": 0.0001,
      "loss": 2.2358,
      "step": 206
    },
    {
      "epoch": 3.2857142857142856,
      "grad_norm": 0.5745422840118408,
      "learning_rate": 0.0001,
      "loss": 2.3466,
      "step": 207
    },
    {
      "epoch": 3.3015873015873014,
      "grad_norm": 0.5843771696090698,
      "learning_rate": 0.0001,
      "loss": 2.2836,
      "step": 208
    },
    {
      "epoch": 3.317460317460317,
      "grad_norm": 0.5851179361343384,
      "learning_rate": 0.0001,
      "loss": 2.1949,
      "step": 209
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 0.5551464557647705,
      "learning_rate": 0.0001,
      "loss": 2.2045,
      "step": 210
    },
    {
      "epoch": 3.3492063492063493,
      "grad_norm": 0.579687237739563,
      "learning_rate": 0.0001,
      "loss": 2.2161,
      "step": 211
    },
    {
      "epoch": 3.365079365079365,
      "grad_norm": 0.590063750743866,
      "learning_rate": 0.0001,
      "loss": 2.1731,
      "step": 212
    },
    {
      "epoch": 3.380952380952381,
      "grad_norm": 0.5903512835502625,
      "learning_rate": 0.0001,
      "loss": 2.3689,
      "step": 213
    },
    {
      "epoch": 3.3968253968253967,
      "grad_norm": 0.571978747844696,
      "learning_rate": 0.0001,
      "loss": 2.1554,
      "step": 214
    },
    {
      "epoch": 3.4126984126984126,
      "grad_norm": 0.5911630392074585,
      "learning_rate": 0.0001,
      "loss": 2.1664,
      "step": 215
    },
    {
      "epoch": 3.4285714285714284,
      "grad_norm": 0.5953576564788818,
      "learning_rate": 0.0001,
      "loss": 2.2494,
      "step": 216
    },
    {
      "epoch": 3.4444444444444446,
      "grad_norm": 0.6063962578773499,
      "learning_rate": 0.0001,
      "loss": 2.1696,
      "step": 217
    },
    {
      "epoch": 3.4603174603174605,
      "grad_norm": 0.5811805129051208,
      "learning_rate": 0.0001,
      "loss": 2.255,
      "step": 218
    },
    {
      "epoch": 3.4761904761904763,
      "grad_norm": 0.5684872269630432,
      "learning_rate": 0.0001,
      "loss": 2.2388,
      "step": 219
    },
    {
      "epoch": 3.492063492063492,
      "grad_norm": 0.5764886736869812,
      "learning_rate": 0.0001,
      "loss": 2.2843,
      "step": 220
    },
    {
      "epoch": 3.507936507936508,
      "grad_norm": 0.592780351638794,
      "learning_rate": 0.0001,
      "loss": 2.281,
      "step": 221
    },
    {
      "epoch": 3.5238095238095237,
      "grad_norm": 0.5397675037384033,
      "learning_rate": 0.0001,
      "loss": 2.2044,
      "step": 222
    },
    {
      "epoch": 3.5396825396825395,
      "grad_norm": 0.5696619749069214,
      "learning_rate": 0.0001,
      "loss": 2.2013,
      "step": 223
    },
    {
      "epoch": 3.5555555555555554,
      "grad_norm": 0.5832935571670532,
      "learning_rate": 0.0001,
      "loss": 2.1913,
      "step": 224
    },
    {
      "epoch": 3.571428571428571,
      "grad_norm": 0.58072829246521,
      "learning_rate": 0.0001,
      "loss": 2.1943,
      "step": 225
    },
    {
      "epoch": 3.5873015873015874,
      "grad_norm": 0.6177693009376526,
      "learning_rate": 0.0001,
      "loss": 2.2172,
      "step": 226
    },
    {
      "epoch": 3.6031746031746033,
      "grad_norm": 0.6173182725906372,
      "learning_rate": 0.0001,
      "loss": 2.3449,
      "step": 227
    },
    {
      "epoch": 3.619047619047619,
      "grad_norm": 0.5733217000961304,
      "learning_rate": 0.0001,
      "loss": 2.2023,
      "step": 228
    },
    {
      "epoch": 3.634920634920635,
      "grad_norm": 0.5936962366104126,
      "learning_rate": 0.0001,
      "loss": 2.3098,
      "step": 229
    },
    {
      "epoch": 3.6507936507936507,
      "grad_norm": 0.5863310694694519,
      "learning_rate": 0.0001,
      "loss": 2.3444,
      "step": 230
    },
    {
      "epoch": 3.6666666666666665,
      "grad_norm": 0.5781247019767761,
      "learning_rate": 0.0001,
      "loss": 2.2717,
      "step": 231
    },
    {
      "epoch": 3.682539682539683,
      "grad_norm": 0.5641736388206482,
      "learning_rate": 0.0001,
      "loss": 2.2551,
      "step": 232
    },
    {
      "epoch": 3.6984126984126986,
      "grad_norm": 0.5762507319450378,
      "learning_rate": 0.0001,
      "loss": 2.2005,
      "step": 233
    },
    {
      "epoch": 3.7142857142857144,
      "grad_norm": 0.5860148668289185,
      "learning_rate": 0.0001,
      "loss": 2.2755,
      "step": 234
    },
    {
      "epoch": 3.7301587301587302,
      "grad_norm": 0.5595604181289673,
      "learning_rate": 0.0001,
      "loss": 2.1698,
      "step": 235
    },
    {
      "epoch": 3.746031746031746,
      "grad_norm": 0.6000334620475769,
      "learning_rate": 0.0001,
      "loss": 2.1448,
      "step": 236
    },
    {
      "epoch": 3.761904761904762,
      "grad_norm": 0.5891295075416565,
      "learning_rate": 0.0001,
      "loss": 2.2041,
      "step": 237
    },
    {
      "epoch": 3.7777777777777777,
      "grad_norm": 0.5736986398696899,
      "learning_rate": 0.0001,
      "loss": 2.2187,
      "step": 238
    },
    {
      "epoch": 3.7936507936507935,
      "grad_norm": 0.605859100818634,
      "learning_rate": 0.0001,
      "loss": 2.1449,
      "step": 239
    },
    {
      "epoch": 3.8095238095238093,
      "grad_norm": 0.6083592772483826,
      "learning_rate": 0.0001,
      "loss": 2.2481,
      "step": 240
    },
    {
      "epoch": 3.825396825396825,
      "grad_norm": 0.6463331580162048,
      "learning_rate": 0.0001,
      "loss": 2.3296,
      "step": 241
    },
    {
      "epoch": 3.8412698412698414,
      "grad_norm": 0.6547737717628479,
      "learning_rate": 0.0001,
      "loss": 2.2724,
      "step": 242
    },
    {
      "epoch": 3.857142857142857,
      "grad_norm": 0.6021876931190491,
      "learning_rate": 0.0001,
      "loss": 2.2059,
      "step": 243
    },
    {
      "epoch": 3.873015873015873,
      "grad_norm": 0.5920640230178833,
      "learning_rate": 0.0001,
      "loss": 2.2656,
      "step": 244
    },
    {
      "epoch": 3.888888888888889,
      "grad_norm": 0.6126848459243774,
      "learning_rate": 0.0001,
      "loss": 2.2974,
      "step": 245
    },
    {
      "epoch": 3.9047619047619047,
      "grad_norm": 0.5852090716362,
      "learning_rate": 0.0001,
      "loss": 2.1778,
      "step": 246
    },
    {
      "epoch": 3.9206349206349205,
      "grad_norm": 0.5844463109970093,
      "learning_rate": 0.0001,
      "loss": 2.1347,
      "step": 247
    },
    {
      "epoch": 3.9365079365079367,
      "grad_norm": 0.5811941027641296,
      "learning_rate": 0.0001,
      "loss": 2.2374,
      "step": 248
    },
    {
      "epoch": 3.9523809523809526,
      "grad_norm": 0.593923807144165,
      "learning_rate": 0.0001,
      "loss": 2.1287,
      "step": 249
    },
    {
      "epoch": 3.9682539682539684,
      "grad_norm": 0.5784031748771667,
      "learning_rate": 0.0001,
      "loss": 2.2797,
      "step": 250
    },
    {
      "epoch": 3.984126984126984,
      "grad_norm": 0.5905077457427979,
      "learning_rate": 0.0001,
      "loss": 2.1873,
      "step": 251
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.5942009687423706,
      "learning_rate": 0.0001,
      "loss": 2.2128,
      "step": 252
    },
    {
      "epoch": 4.0,
      "eval_loss": 2.3258297443389893,
      "eval_runtime": 131.2215,
      "eval_samples_per_second": 148.268,
      "eval_steps_per_second": 1.158,
      "step": 252
    },
    {
      "epoch": 4.015873015873016,
      "grad_norm": 0.5791855454444885,
      "learning_rate": 0.0001,
      "loss": 2.1227,
      "step": 253
    },
    {
      "epoch": 4.031746031746032,
      "grad_norm": 0.5935028195381165,
      "learning_rate": 0.0001,
      "loss": 2.0783,
      "step": 254
    },
    {
      "epoch": 4.0476190476190474,
      "grad_norm": 0.5985621809959412,
      "learning_rate": 0.0001,
      "loss": 2.2336,
      "step": 255
    },
    {
      "epoch": 4.063492063492063,
      "grad_norm": 0.595588207244873,
      "learning_rate": 0.0001,
      "loss": 2.1797,
      "step": 256
    },
    {
      "epoch": 4.079365079365079,
      "grad_norm": 0.6228951811790466,
      "learning_rate": 0.0001,
      "loss": 2.102,
      "step": 257
    },
    {
      "epoch": 4.095238095238095,
      "grad_norm": 0.6309946179389954,
      "learning_rate": 0.0001,
      "loss": 2.1029,
      "step": 258
    },
    {
      "epoch": 4.111111111111111,
      "grad_norm": 0.6356679797172546,
      "learning_rate": 0.0001,
      "loss": 2.2215,
      "step": 259
    },
    {
      "epoch": 4.1269841269841265,
      "grad_norm": 0.6199479103088379,
      "learning_rate": 0.0001,
      "loss": 2.209,
      "step": 260
    },
    {
      "epoch": 4.142857142857143,
      "grad_norm": 0.6396746635437012,
      "learning_rate": 0.0001,
      "loss": 2.1935,
      "step": 261
    },
    {
      "epoch": 4.158730158730159,
      "grad_norm": 0.624236524105072,
      "learning_rate": 0.0001,
      "loss": 2.1285,
      "step": 262
    },
    {
      "epoch": 4.174603174603175,
      "grad_norm": 0.6298365592956543,
      "learning_rate": 0.0001,
      "loss": 2.1425,
      "step": 263
    },
    {
      "epoch": 4.190476190476191,
      "grad_norm": 0.6376235485076904,
      "learning_rate": 0.0001,
      "loss": 2.1139,
      "step": 264
    },
    {
      "epoch": 4.2063492063492065,
      "grad_norm": 0.6685353517532349,
      "learning_rate": 0.0001,
      "loss": 2.3174,
      "step": 265
    },
    {
      "epoch": 4.222222222222222,
      "grad_norm": 0.646286129951477,
      "learning_rate": 0.0001,
      "loss": 2.1518,
      "step": 266
    },
    {
      "epoch": 4.238095238095238,
      "grad_norm": 0.6334670782089233,
      "learning_rate": 0.0001,
      "loss": 2.1696,
      "step": 267
    },
    {
      "epoch": 4.253968253968254,
      "grad_norm": 0.646755039691925,
      "learning_rate": 0.0001,
      "loss": 2.1926,
      "step": 268
    },
    {
      "epoch": 4.26984126984127,
      "grad_norm": 0.6688207387924194,
      "learning_rate": 0.0001,
      "loss": 2.0777,
      "step": 269
    },
    {
      "epoch": 4.285714285714286,
      "grad_norm": 0.6606519818305969,
      "learning_rate": 0.0001,
      "loss": 2.0851,
      "step": 270
    },
    {
      "epoch": 4.301587301587301,
      "grad_norm": 0.6396586894989014,
      "learning_rate": 0.0001,
      "loss": 2.0645,
      "step": 271
    },
    {
      "epoch": 4.317460317460317,
      "grad_norm": 0.6652937531471252,
      "learning_rate": 0.0001,
      "loss": 2.0649,
      "step": 272
    },
    {
      "epoch": 4.333333333333333,
      "grad_norm": 0.6375648379325867,
      "learning_rate": 0.0001,
      "loss": 2.1439,
      "step": 273
    },
    {
      "epoch": 4.349206349206349,
      "grad_norm": 0.6601315140724182,
      "learning_rate": 0.0001,
      "loss": 2.1115,
      "step": 274
    },
    {
      "epoch": 4.365079365079365,
      "grad_norm": 0.6605430841445923,
      "learning_rate": 0.0001,
      "loss": 2.0952,
      "step": 275
    },
    {
      "epoch": 4.380952380952381,
      "grad_norm": 0.682755708694458,
      "learning_rate": 0.0001,
      "loss": 2.0782,
      "step": 276
    },
    {
      "epoch": 4.396825396825397,
      "grad_norm": 0.7010186314582825,
      "learning_rate": 0.0001,
      "loss": 2.1585,
      "step": 277
    },
    {
      "epoch": 4.412698412698413,
      "grad_norm": 0.6743943691253662,
      "learning_rate": 0.0001,
      "loss": 2.0401,
      "step": 278
    },
    {
      "epoch": 4.428571428571429,
      "grad_norm": 0.659075140953064,
      "learning_rate": 0.0001,
      "loss": 2.0975,
      "step": 279
    },
    {
      "epoch": 4.444444444444445,
      "grad_norm": 0.6433005332946777,
      "learning_rate": 0.0001,
      "loss": 2.1697,
      "step": 280
    },
    {
      "epoch": 4.4603174603174605,
      "grad_norm": 0.6300135850906372,
      "learning_rate": 0.0001,
      "loss": 2.0184,
      "step": 281
    },
    {
      "epoch": 4.476190476190476,
      "grad_norm": 0.6259458065032959,
      "learning_rate": 0.0001,
      "loss": 2.0946,
      "step": 282
    },
    {
      "epoch": 4.492063492063492,
      "grad_norm": 0.6421889066696167,
      "learning_rate": 0.0001,
      "loss": 2.061,
      "step": 283
    },
    {
      "epoch": 4.507936507936508,
      "grad_norm": 0.6369497179985046,
      "learning_rate": 0.0001,
      "loss": 2.0741,
      "step": 284
    },
    {
      "epoch": 4.523809523809524,
      "grad_norm": 0.6569401621818542,
      "learning_rate": 0.0001,
      "loss": 2.1381,
      "step": 285
    },
    {
      "epoch": 4.5396825396825395,
      "grad_norm": 0.6559494733810425,
      "learning_rate": 0.0001,
      "loss": 2.0315,
      "step": 286
    },
    {
      "epoch": 4.555555555555555,
      "grad_norm": 0.6800838112831116,
      "learning_rate": 0.0001,
      "loss": 2.1973,
      "step": 287
    },
    {
      "epoch": 4.571428571428571,
      "grad_norm": 0.660052478313446,
      "learning_rate": 0.0001,
      "loss": 2.0466,
      "step": 288
    },
    {
      "epoch": 4.587301587301587,
      "grad_norm": 0.67457115650177,
      "learning_rate": 0.0001,
      "loss": 2.0389,
      "step": 289
    },
    {
      "epoch": 4.603174603174603,
      "grad_norm": 0.701083779335022,
      "learning_rate": 0.0001,
      "loss": 2.1097,
      "step": 290
    },
    {
      "epoch": 4.619047619047619,
      "grad_norm": 0.6736295223236084,
      "learning_rate": 0.0001,
      "loss": 2.0152,
      "step": 291
    },
    {
      "epoch": 4.634920634920634,
      "grad_norm": 0.6742798686027527,
      "learning_rate": 0.0001,
      "loss": 2.0856,
      "step": 292
    },
    {
      "epoch": 4.650793650793651,
      "grad_norm": 0.6709657907485962,
      "learning_rate": 0.0001,
      "loss": 2.0645,
      "step": 293
    },
    {
      "epoch": 4.666666666666667,
      "grad_norm": 0.7078341841697693,
      "learning_rate": 0.0001,
      "loss": 2.081,
      "step": 294
    },
    {
      "epoch": 4.682539682539683,
      "grad_norm": 0.6934991478919983,
      "learning_rate": 0.0001,
      "loss": 2.1799,
      "step": 295
    },
    {
      "epoch": 4.698412698412699,
      "grad_norm": 0.629152238368988,
      "learning_rate": 0.0001,
      "loss": 2.1099,
      "step": 296
    },
    {
      "epoch": 4.714285714285714,
      "grad_norm": 0.6546439528465271,
      "learning_rate": 0.0001,
      "loss": 2.0761,
      "step": 297
    },
    {
      "epoch": 4.73015873015873,
      "grad_norm": 0.6665687561035156,
      "learning_rate": 0.0001,
      "loss": 2.1299,
      "step": 298
    },
    {
      "epoch": 4.746031746031746,
      "grad_norm": 0.6587039828300476,
      "learning_rate": 0.0001,
      "loss": 2.0257,
      "step": 299
    },
    {
      "epoch": 4.761904761904762,
      "grad_norm": 0.6549575924873352,
      "learning_rate": 0.0001,
      "loss": 2.0057,
      "step": 300
    },
    {
      "epoch": 4.777777777777778,
      "grad_norm": 0.6643835306167603,
      "learning_rate": 0.0001,
      "loss": 2.14,
      "step": 301
    },
    {
      "epoch": 4.7936507936507935,
      "grad_norm": 0.6684074401855469,
      "learning_rate": 0.0001,
      "loss": 2.133,
      "step": 302
    },
    {
      "epoch": 4.809523809523809,
      "grad_norm": 0.6732567548751831,
      "learning_rate": 0.0001,
      "loss": 2.1709,
      "step": 303
    },
    {
      "epoch": 4.825396825396825,
      "grad_norm": 0.6627042293548584,
      "learning_rate": 0.0001,
      "loss": 2.1105,
      "step": 304
    },
    {
      "epoch": 4.841269841269841,
      "grad_norm": 0.7108631134033203,
      "learning_rate": 0.0001,
      "loss": 2.2417,
      "step": 305
    },
    {
      "epoch": 4.857142857142857,
      "grad_norm": 0.6348216533660889,
      "learning_rate": 0.0001,
      "loss": 2.1181,
      "step": 306
    },
    {
      "epoch": 4.8730158730158735,
      "grad_norm": 0.7204558849334717,
      "learning_rate": 0.0001,
      "loss": 2.2538,
      "step": 307
    },
    {
      "epoch": 4.888888888888889,
      "grad_norm": 0.6680718660354614,
      "learning_rate": 0.0001,
      "loss": 2.0988,
      "step": 308
    },
    {
      "epoch": 4.904761904761905,
      "grad_norm": 0.629531741142273,
      "learning_rate": 0.0001,
      "loss": 2.0871,
      "step": 309
    },
    {
      "epoch": 4.920634920634921,
      "grad_norm": 0.6763596534729004,
      "learning_rate": 0.0001,
      "loss": 2.1201,
      "step": 310
    },
    {
      "epoch": 4.936507936507937,
      "grad_norm": 0.6565173268318176,
      "learning_rate": 0.0001,
      "loss": 2.0979,
      "step": 311
    },
    {
      "epoch": 4.9523809523809526,
      "grad_norm": 0.6826967597007751,
      "learning_rate": 0.0001,
      "loss": 2.2006,
      "step": 312
    },
    {
      "epoch": 4.968253968253968,
      "grad_norm": 0.6894890666007996,
      "learning_rate": 0.0001,
      "loss": 2.1615,
      "step": 313
    },
    {
      "epoch": 4.984126984126984,
      "grad_norm": 0.661064624786377,
      "learning_rate": 0.0001,
      "loss": 2.1329,
      "step": 314
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.6418980956077576,
      "learning_rate": 0.0001,
      "loss": 2.0654,
      "step": 315
    },
    {
      "epoch": 5.0,
      "eval_loss": 2.2862746715545654,
      "eval_runtime": 131.1439,
      "eval_samples_per_second": 148.356,
      "eval_steps_per_second": 1.159,
      "step": 315
    },
    {
      "epoch": 5.015873015873016,
      "grad_norm": 0.6436026096343994,
      "learning_rate": 0.0001,
      "loss": 2.094,
      "step": 316
    },
    {
      "epoch": 5.031746031746032,
      "grad_norm": 0.6524760127067566,
      "learning_rate": 0.0001,
      "loss": 2.0095,
      "step": 317
    },
    {
      "epoch": 5.0476190476190474,
      "grad_norm": 0.6650781035423279,
      "learning_rate": 0.0001,
      "loss": 1.9616,
      "step": 318
    },
    {
      "epoch": 5.063492063492063,
      "grad_norm": 0.7145387530326843,
      "learning_rate": 0.0001,
      "loss": 1.9858,
      "step": 319
    },
    {
      "epoch": 5.079365079365079,
      "grad_norm": 0.7050247192382812,
      "learning_rate": 0.0001,
      "loss": 2.0304,
      "step": 320
    },
    {
      "epoch": 5.095238095238095,
      "grad_norm": 0.722385585308075,
      "learning_rate": 0.0001,
      "loss": 2.0837,
      "step": 321
    },
    {
      "epoch": 5.111111111111111,
      "grad_norm": 0.7140416502952576,
      "learning_rate": 0.0001,
      "loss": 1.9818,
      "step": 322
    },
    {
      "epoch": 5.1269841269841265,
      "grad_norm": 0.7170698642730713,
      "learning_rate": 0.0001,
      "loss": 2.0055,
      "step": 323
    },
    {
      "epoch": 5.142857142857143,
      "grad_norm": 0.6809371709823608,
      "learning_rate": 0.0001,
      "loss": 2.0375,
      "step": 324
    },
    {
      "epoch": 5.158730158730159,
      "grad_norm": 0.7128539085388184,
      "learning_rate": 0.0001,
      "loss": 2.0837,
      "step": 325
    },
    {
      "epoch": 5.174603174603175,
      "grad_norm": 0.7276302576065063,
      "learning_rate": 0.0001,
      "loss": 2.0278,
      "step": 326
    },
    {
      "epoch": 5.190476190476191,
      "grad_norm": 0.7262607216835022,
      "learning_rate": 0.0001,
      "loss": 2.0855,
      "step": 327
    },
    {
      "epoch": 5.2063492063492065,
      "grad_norm": 0.7341217398643494,
      "learning_rate": 0.0001,
      "loss": 2.1168,
      "step": 328
    },
    {
      "epoch": 5.222222222222222,
      "grad_norm": 0.7559060454368591,
      "learning_rate": 0.0001,
      "loss": 2.0097,
      "step": 329
    },
    {
      "epoch": 5.238095238095238,
      "grad_norm": 0.7604755759239197,
      "learning_rate": 0.0001,
      "loss": 2.0065,
      "step": 330
    },
    {
      "epoch": 5.253968253968254,
      "grad_norm": 0.7481369972229004,
      "learning_rate": 0.0001,
      "loss": 2.0204,
      "step": 331
    },
    {
      "epoch": 5.26984126984127,
      "grad_norm": 0.7615129351615906,
      "learning_rate": 0.0001,
      "loss": 2.123,
      "step": 332
    },
    {
      "epoch": 5.285714285714286,
      "grad_norm": 0.737953245639801,
      "learning_rate": 0.0001,
      "loss": 2.0006,
      "step": 333
    },
    {
      "epoch": 5.301587301587301,
      "grad_norm": 0.7923922538757324,
      "learning_rate": 0.0001,
      "loss": 2.0557,
      "step": 334
    },
    {
      "epoch": 5.317460317460317,
      "grad_norm": 0.7669034004211426,
      "learning_rate": 0.0001,
      "loss": 2.0475,
      "step": 335
    },
    {
      "epoch": 5.333333333333333,
      "grad_norm": 0.7678413391113281,
      "learning_rate": 0.0001,
      "loss": 2.0465,
      "step": 336
    },
    {
      "epoch": 5.349206349206349,
      "grad_norm": 0.8182974457740784,
      "learning_rate": 0.0001,
      "loss": 2.1546,
      "step": 337
    },
    {
      "epoch": 5.365079365079365,
      "grad_norm": 0.7198697924613953,
      "learning_rate": 0.0001,
      "loss": 2.0281,
      "step": 338
    },
    {
      "epoch": 5.380952380952381,
      "grad_norm": 0.7147011756896973,
      "learning_rate": 0.0001,
      "loss": 2.0025,
      "step": 339
    },
    {
      "epoch": 5.396825396825397,
      "grad_norm": 0.7140205502510071,
      "learning_rate": 0.0001,
      "loss": 1.9455,
      "step": 340
    },
    {
      "epoch": 5.412698412698413,
      "grad_norm": 0.7569496631622314,
      "learning_rate": 0.0001,
      "loss": 1.9467,
      "step": 341
    },
    {
      "epoch": 5.428571428571429,
      "grad_norm": 0.7563154697418213,
      "learning_rate": 0.0001,
      "loss": 2.028,
      "step": 342
    },
    {
      "epoch": 5.444444444444445,
      "grad_norm": 0.7342957258224487,
      "learning_rate": 0.0001,
      "loss": 2.0094,
      "step": 343
    },
    {
      "epoch": 5.4603174603174605,
      "grad_norm": 0.6956045627593994,
      "learning_rate": 0.0001,
      "loss": 1.9638,
      "step": 344
    },
    {
      "epoch": 5.476190476190476,
      "grad_norm": 0.7402710914611816,
      "learning_rate": 0.0001,
      "loss": 1.952,
      "step": 345
    },
    {
      "epoch": 5.492063492063492,
      "grad_norm": 0.7401675581932068,
      "learning_rate": 0.0001,
      "loss": 2.0025,
      "step": 346
    },
    {
      "epoch": 5.507936507936508,
      "grad_norm": 0.7523941397666931,
      "learning_rate": 0.0001,
      "loss": 1.968,
      "step": 347
    },
    {
      "epoch": 5.523809523809524,
      "grad_norm": 0.7316027283668518,
      "learning_rate": 0.0001,
      "loss": 2.047,
      "step": 348
    },
    {
      "epoch": 5.5396825396825395,
      "grad_norm": 0.7798665761947632,
      "learning_rate": 0.0001,
      "loss": 2.0133,
      "step": 349
    },
    {
      "epoch": 5.555555555555555,
      "grad_norm": 0.7752505540847778,
      "learning_rate": 0.0001,
      "loss": 2.0177,
      "step": 350
    },
    {
      "epoch": 5.571428571428571,
      "grad_norm": 0.7742307186126709,
      "learning_rate": 0.0001,
      "loss": 1.9797,
      "step": 351
    },
    {
      "epoch": 5.587301587301587,
      "grad_norm": 0.7735832929611206,
      "learning_rate": 0.0001,
      "loss": 2.0347,
      "step": 352
    },
    {
      "epoch": 5.603174603174603,
      "grad_norm": 0.7533615827560425,
      "learning_rate": 0.0001,
      "loss": 1.9683,
      "step": 353
    },
    {
      "epoch": 5.619047619047619,
      "grad_norm": 0.7873823046684265,
      "learning_rate": 0.0001,
      "loss": 2.1035,
      "step": 354
    },
    {
      "epoch": 5.634920634920634,
      "grad_norm": 0.739718496799469,
      "learning_rate": 0.0001,
      "loss": 2.0512,
      "step": 355
    },
    {
      "epoch": 5.650793650793651,
      "grad_norm": 0.7133144736289978,
      "learning_rate": 0.0001,
      "loss": 1.9078,
      "step": 356
    },
    {
      "epoch": 5.666666666666667,
      "grad_norm": 0.7506771087646484,
      "learning_rate": 0.0001,
      "loss": 2.0843,
      "step": 357
    },
    {
      "epoch": 5.682539682539683,
      "grad_norm": 0.7370438575744629,
      "learning_rate": 0.0001,
      "loss": 1.9155,
      "step": 358
    },
    {
      "epoch": 5.698412698412699,
      "grad_norm": 0.7745400667190552,
      "learning_rate": 0.0001,
      "loss": 2.0788,
      "step": 359
    },
    {
      "epoch": 5.714285714285714,
      "grad_norm": 0.7573673725128174,
      "learning_rate": 0.0001,
      "loss": 1.9092,
      "step": 360
    },
    {
      "epoch": 5.73015873015873,
      "grad_norm": 0.7863042950630188,
      "learning_rate": 0.0001,
      "loss": 1.9919,
      "step": 361
    },
    {
      "epoch": 5.746031746031746,
      "grad_norm": 0.766444206237793,
      "learning_rate": 0.0001,
      "loss": 2.0213,
      "step": 362
    },
    {
      "epoch": 5.761904761904762,
      "grad_norm": 0.7395709156990051,
      "learning_rate": 0.0001,
      "loss": 1.926,
      "step": 363
    },
    {
      "epoch": 5.777777777777778,
      "grad_norm": 0.7217903733253479,
      "learning_rate": 0.0001,
      "loss": 1.9947,
      "step": 364
    },
    {
      "epoch": 5.7936507936507935,
      "grad_norm": 0.7542216181755066,
      "learning_rate": 0.0001,
      "loss": 1.9689,
      "step": 365
    },
    {
      "epoch": 5.809523809523809,
      "grad_norm": 0.7293450236320496,
      "learning_rate": 0.0001,
      "loss": 2.0281,
      "step": 366
| }, | |
| { | |
| "epoch": 5.825396825396825, | |
| "grad_norm": 0.7562200427055359, | |
| "learning_rate": 0.0001, | |
| "loss": 2.0989, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 5.841269841269841, | |
| "grad_norm": 0.7612117528915405, | |
| "learning_rate": 0.0001, | |
| "loss": 2.0551, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 5.857142857142857, | |
| "grad_norm": 0.734811007976532, | |
| "learning_rate": 0.0001, | |
| "loss": 2.0023, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 5.8730158730158735, | |
| "grad_norm": 0.7379953265190125, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9598, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 5.888888888888889, | |
| "grad_norm": 0.7303497791290283, | |
| "learning_rate": 0.0001, | |
| "loss": 1.972, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 5.904761904761905, | |
| "grad_norm": 0.8013107180595398, | |
| "learning_rate": 0.0001, | |
| "loss": 2.0159, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 5.920634920634921, | |
| "grad_norm": 0.7716558575630188, | |
| "learning_rate": 0.0001, | |
| "loss": 2.053, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 5.936507936507937, | |
| "grad_norm": 0.7352483868598938, | |
| "learning_rate": 0.0001, | |
| "loss": 2.0788, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 5.9523809523809526, | |
| "grad_norm": 0.735283374786377, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9362, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 5.968253968253968, | |
| "grad_norm": 0.7122552394866943, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9169, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 5.984126984126984, | |
| "grad_norm": 0.7541640400886536, | |
| "learning_rate": 0.0001, | |
| "loss": 2.008, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 0.7707133889198303, | |
| "learning_rate": 0.0001, | |
| "loss": 2.0254, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 2.260683298110962, | |
| "eval_runtime": 131.3176, | |
| "eval_samples_per_second": 148.16, | |
| "eval_steps_per_second": 1.157, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 6.015873015873016, | |
| "grad_norm": 0.7102126479148865, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8746, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 6.031746031746032, | |
| "grad_norm": 0.7207889556884766, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8726, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 6.0476190476190474, | |
| "grad_norm": 0.7585148811340332, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9476, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 6.063492063492063, | |
| "grad_norm": 0.7799012064933777, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9059, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 6.079365079365079, | |
| "grad_norm": 0.7834872603416443, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8973, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 6.095238095238095, | |
| "grad_norm": 0.8367555737495422, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8635, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 6.111111111111111, | |
| "grad_norm": 0.8433529138565063, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9322, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 6.1269841269841265, | |
| "grad_norm": 0.7994055151939392, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9026, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 6.142857142857143, | |
| "grad_norm": 0.8268090486526489, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8396, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 6.158730158730159, | |
| "grad_norm": 0.8418826460838318, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9913, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 6.174603174603175, | |
| "grad_norm": 0.7894602417945862, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8797, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 6.190476190476191, | |
| "grad_norm": 0.8009418845176697, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8766, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 6.2063492063492065, | |
| "grad_norm": 0.8091604709625244, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8589, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 6.222222222222222, | |
| "grad_norm": 0.8371259570121765, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9282, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 6.238095238095238, | |
| "grad_norm": 0.8433154821395874, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9929, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 6.253968253968254, | |
| "grad_norm": 0.8620651960372925, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9399, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 6.26984126984127, | |
| "grad_norm": 0.8491103649139404, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8712, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 6.285714285714286, | |
| "grad_norm": 0.8032292127609253, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8639, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 6.301587301587301, | |
| "grad_norm": 0.8091895580291748, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9479, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 6.317460317460317, | |
| "grad_norm": 0.822962760925293, | |
| "learning_rate": 0.0001, | |
| "loss": 2.0056, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 6.333333333333333, | |
| "grad_norm": 0.8068661689758301, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9636, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 6.349206349206349, | |
| "grad_norm": 0.8433991074562073, | |
| "learning_rate": 0.0001, | |
| "loss": 1.896, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 6.365079365079365, | |
| "grad_norm": 0.870265007019043, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8539, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 6.380952380952381, | |
| "grad_norm": 0.8308740854263306, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8613, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 6.396825396825397, | |
| "grad_norm": 0.8270155191421509, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8558, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 6.412698412698413, | |
| "grad_norm": 0.8572468161582947, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9396, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 6.428571428571429, | |
| "grad_norm": 0.8838953375816345, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9849, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 6.444444444444445, | |
| "grad_norm": 0.9025276899337769, | |
| "learning_rate": 0.0001, | |
| "loss": 2.0395, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 6.4603174603174605, | |
| "grad_norm": 0.7760733962059021, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8545, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 6.476190476190476, | |
| "grad_norm": 0.8531435132026672, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8194, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 6.492063492063492, | |
| "grad_norm": 0.8817340731620789, | |
| "learning_rate": 0.0001, | |
| "loss": 2.035, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 6.507936507936508, | |
| "grad_norm": 0.8305168747901917, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9981, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 6.523809523809524, | |
| "grad_norm": 0.8058464527130127, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9231, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 6.5396825396825395, | |
| "grad_norm": 0.8471766114234924, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9397, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 6.555555555555555, | |
| "grad_norm": 0.8192426562309265, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8897, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 6.571428571428571, | |
| "grad_norm": 0.8103774785995483, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9078, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 6.587301587301587, | |
| "grad_norm": 0.8194547891616821, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9603, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 6.603174603174603, | |
| "grad_norm": 0.8253903388977051, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8948, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 6.619047619047619, | |
| "grad_norm": 0.8385449647903442, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9432, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 6.634920634920634, | |
| "grad_norm": 0.8203578591346741, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9588, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 6.650793650793651, | |
| "grad_norm": 0.9151753187179565, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9159, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 6.666666666666667, | |
| "grad_norm": 0.8757310509681702, | |
| "learning_rate": 0.0001, | |
| "loss": 2.0025, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 6.682539682539683, | |
| "grad_norm": 0.833610475063324, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9071, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 6.698412698412699, | |
| "grad_norm": 0.8258681297302246, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8553, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 6.714285714285714, | |
| "grad_norm": 0.8394814133644104, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9562, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 6.73015873015873, | |
| "grad_norm": 0.78861403465271, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9229, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 6.746031746031746, | |
| "grad_norm": 0.8214682936668396, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7627, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 6.761904761904762, | |
| "grad_norm": 0.8098556399345398, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8274, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 6.777777777777778, | |
| "grad_norm": 0.8319344520568848, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8814, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 6.7936507936507935, | |
| "grad_norm": 0.8325346112251282, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9049, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 6.809523809523809, | |
| "grad_norm": 0.7863579392433167, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9267, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 6.825396825396825, | |
| "grad_norm": 0.7735232710838318, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8991, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 6.841269841269841, | |
| "grad_norm": 0.8038283586502075, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9259, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 6.857142857142857, | |
| "grad_norm": 0.7874023914337158, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9138, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 6.8730158730158735, | |
| "grad_norm": 0.8375279307365417, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9044, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 6.888888888888889, | |
| "grad_norm": 0.8343684673309326, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9485, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 6.904761904761905, | |
| "grad_norm": 0.812360405921936, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8098, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 6.920634920634921, | |
| "grad_norm": 0.8316230773925781, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9298, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 6.936507936507937, | |
| "grad_norm": 0.8456923365592957, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8636, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 6.9523809523809526, | |
| "grad_norm": 0.8814741969108582, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9683, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 6.968253968253968, | |
| "grad_norm": 0.8839107751846313, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8997, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 6.984126984126984, | |
| "grad_norm": 0.8631510138511658, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9282, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "grad_norm": 0.801532506942749, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8911, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 2.25614333152771, | |
| "eval_runtime": 131.235, | |
| "eval_samples_per_second": 148.253, | |
| "eval_steps_per_second": 1.158, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 7.015873015873016, | |
| "grad_norm": 0.8276068568229675, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8037, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 7.031746031746032, | |
| "grad_norm": 0.8222858309745789, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9118, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 7.0476190476190474, | |
| "grad_norm": 0.7912269830703735, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7942, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 7.063492063492063, | |
| "grad_norm": 0.817715585231781, | |
| "learning_rate": 0.0001, | |
| "loss": 1.794, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 7.079365079365079, | |
| "grad_norm": 0.839603841304779, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8142, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 7.095238095238095, | |
| "grad_norm": 0.8901446461677551, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8385, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 7.111111111111111, | |
| "grad_norm": 0.959743857383728, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9296, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 7.1269841269841265, | |
| "grad_norm": 1.0038635730743408, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8572, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 7.142857142857143, | |
| "grad_norm": 0.9257464408874512, | |
| "learning_rate": 0.0001, | |
| "loss": 1.6719, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 7.158730158730159, | |
| "grad_norm": 0.9111561179161072, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7518, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 7.174603174603175, | |
| "grad_norm": 0.9094122052192688, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8354, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 7.190476190476191, | |
| "grad_norm": 0.8958294987678528, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8313, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 7.2063492063492065, | |
| "grad_norm": 0.8593880534172058, | |
| "learning_rate": 0.0001, | |
| "loss": 1.6535, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 7.222222222222222, | |
| "grad_norm": 0.8774977922439575, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8515, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 7.238095238095238, | |
| "grad_norm": 0.8923255801200867, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8578, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 7.253968253968254, | |
| "grad_norm": 0.901292622089386, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7843, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 7.26984126984127, | |
| "grad_norm": 0.912469744682312, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8014, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 7.285714285714286, | |
| "grad_norm": 0.9004772305488586, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7254, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 7.301587301587301, | |
| "grad_norm": 0.9091677665710449, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8108, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 7.317460317460317, | |
| "grad_norm": 0.8714731335639954, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7518, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 7.333333333333333, | |
| "grad_norm": 0.9471895694732666, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7915, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 7.349206349206349, | |
| "grad_norm": 0.9310287237167358, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8579, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 7.365079365079365, | |
| "grad_norm": 0.9024715423583984, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7138, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 7.380952380952381, | |
| "grad_norm": 0.8799977898597717, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7581, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 7.396825396825397, | |
| "grad_norm": 0.9417739510536194, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8122, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 7.412698412698413, | |
| "grad_norm": 0.9084734916687012, | |
| "learning_rate": 0.0001, | |
| "loss": 1.853, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 7.428571428571429, | |
| "grad_norm": 0.9279357194900513, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7924, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 7.444444444444445, | |
| "grad_norm": 0.8860162496566772, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7798, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 7.4603174603174605, | |
| "grad_norm": 0.897928774356842, | |
| "learning_rate": 0.0001, | |
| "loss": 1.817, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 7.476190476190476, | |
| "grad_norm": 0.9444167017936707, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8187, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 7.492063492063492, | |
| "grad_norm": 0.9067376852035522, | |
| "learning_rate": 0.0001, | |
| "loss": 1.852, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 7.507936507936508, | |
| "grad_norm": 0.9207352995872498, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7946, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 7.523809523809524, | |
| "grad_norm": 0.931064784526825, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8096, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 7.5396825396825395, | |
| "grad_norm": 0.8873443603515625, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8386, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 7.555555555555555, | |
| "grad_norm": 0.8808175921440125, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7676, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 7.571428571428571, | |
| "grad_norm": 0.9169921278953552, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8152, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 7.587301587301587, | |
| "grad_norm": 0.8960747718811035, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8075, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 7.603174603174603, | |
| "grad_norm": 0.8552300333976746, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8091, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 7.619047619047619, | |
| "grad_norm": 0.9400557279586792, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8079, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 7.634920634920634, | |
| "grad_norm": 0.9670395851135254, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8646, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 7.650793650793651, | |
| "grad_norm": 0.8738551735877991, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7896, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 7.666666666666667, | |
| "grad_norm": 0.9151812791824341, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9319, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 7.682539682539683, | |
| "grad_norm": 0.9273959994316101, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7266, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 7.698412698412699, | |
| "grad_norm": 0.9260756969451904, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8105, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 7.714285714285714, | |
| "grad_norm": 0.9049228429794312, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7124, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 7.73015873015873, | |
| "grad_norm": 0.9356061220169067, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8866, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 7.746031746031746, | |
| "grad_norm": 0.9253839254379272, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7999, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 7.761904761904762, | |
| "grad_norm": 0.9333457946777344, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8482, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 7.777777777777778, | |
| "grad_norm": 0.952116072177887, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7953, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 7.7936507936507935, | |
| "grad_norm": 0.925373911857605, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7874, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 7.809523809523809, | |
| "grad_norm": 0.9960001111030579, | |
| "learning_rate": 0.0001, | |
| "loss": 1.9653, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 7.825396825396825, | |
| "grad_norm": 0.9007038474082947, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7981, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 7.841269841269841, | |
| "grad_norm": 0.9345034956932068, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8689, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 7.857142857142857, | |
| "grad_norm": 0.8991730213165283, | |
| "learning_rate": 0.0001, | |
| "loss": 1.881, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 7.8730158730158735, | |
| "grad_norm": 0.8914086818695068, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7693, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 7.888888888888889, | |
| "grad_norm": 0.8485236763954163, | |
| "learning_rate": 0.0001, | |
| "loss": 1.6928, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 7.904761904761905, | |
| "grad_norm": 0.8499413728713989, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7552, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 7.920634920634921, | |
| "grad_norm": 0.8961548805236816, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7611, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 7.936507936507937, | |
| "grad_norm": 0.8979543447494507, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7783, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 7.9523809523809526, | |
| "grad_norm": 0.9030099511146545, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8917, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 7.968253968253968, | |
| "grad_norm": 0.8999877572059631, | |
| "learning_rate": 0.0001, | |
| "loss": 1.6667, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 7.984126984126984, | |
| "grad_norm": 0.9466712474822998, | |
| "learning_rate": 0.0001, | |
| "loss": 1.8247, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 0.9156408905982971, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7829, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 2.2886300086975098, | |
| "eval_runtime": 131.1918, | |
| "eval_samples_per_second": 148.302, | |
| "eval_steps_per_second": 1.159, | |
| "step": 504 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 504, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 8, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.882994594967978e+18, | |
| "train_batch_size": 128, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |