| { | |
| "best_global_step": 192, | |
| "best_metric": 0.70005822, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 16, | |
| "global_step": 380, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.005263157894736842, | |
| "grad_norm": 9.152583101647197, | |
| "learning_rate": 1.9999658256641746e-05, | |
| "loss": 1.3120107650756836, | |
| "memory(GiB)": 57.01, | |
| "step": 1, | |
| "train_speed(iter/s)": 0.036088 | |
| }, | |
| { | |
| "epoch": 0.02631578947368421, | |
| "grad_norm": 2.1260795018452656, | |
| "learning_rate": 1.999145758387301e-05, | |
| "loss": 0.9991766810417175, | |
| "memory(GiB)": 57.01, | |
| "step": 5, | |
| "train_speed(iter/s)": 0.048355 | |
| }, | |
| { | |
| "epoch": 0.05263157894736842, | |
| "grad_norm": 0.7836553551546025, | |
| "learning_rate": 1.99658449300667e-05, | |
| "loss": 0.9186409950256348, | |
| "memory(GiB)": 57.01, | |
| "step": 10, | |
| "train_speed(iter/s)": 0.049883 | |
| }, | |
| { | |
| "epoch": 0.07894736842105263, | |
| "grad_norm": 0.6412562781618624, | |
| "learning_rate": 1.992320579737045e-05, | |
| "loss": 0.8510271072387695, | |
| "memory(GiB)": 57.01, | |
| "step": 15, | |
| "train_speed(iter/s)": 0.051154 | |
| }, | |
| { | |
| "epoch": 0.08421052631578947, | |
| "eval_loss": 0.7916765213012695, | |
| "eval_runtime": 9.8416, | |
| "eval_samples_per_second": 6.401, | |
| "eval_steps_per_second": 0.203, | |
| "eval_token_acc": 0.7726357334653692, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.10526315789473684, | |
| "grad_norm": 0.49696683676371833, | |
| "learning_rate": 1.9863613034027224e-05, | |
| "loss": 0.8826911926269532, | |
| "memory(GiB)": 69.3, | |
| "step": 20, | |
| "train_speed(iter/s)": 0.050284 | |
| }, | |
| { | |
| "epoch": 0.13157894736842105, | |
| "grad_norm": 0.5207448076725332, | |
| "learning_rate": 1.9787168453273546e-05, | |
| "loss": 0.831389045715332, | |
| "memory(GiB)": 69.3, | |
| "step": 25, | |
| "train_speed(iter/s)": 0.050966 | |
| }, | |
| { | |
| "epoch": 0.15789473684210525, | |
| "grad_norm": 0.5062200984947397, | |
| "learning_rate": 1.9694002659393306e-05, | |
| "loss": 0.8272812843322754, | |
| "memory(GiB)": 69.3, | |
| "step": 30, | |
| "train_speed(iter/s)": 0.051443 | |
| }, | |
| { | |
| "epoch": 0.16842105263157894, | |
| "eval_loss": 0.7559030652046204, | |
| "eval_runtime": 9.8411, | |
| "eval_samples_per_second": 6.402, | |
| "eval_steps_per_second": 0.203, | |
| "eval_token_acc": 0.7801532059949344, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.18421052631578946, | |
| "grad_norm": 0.4939312053721415, | |
| "learning_rate": 1.958427482458253e-05, | |
| "loss": 0.8164405822753906, | |
| "memory(GiB)": 69.37, | |
| "step": 35, | |
| "train_speed(iter/s)": 0.051059 | |
| }, | |
| { | |
| "epoch": 0.21052631578947367, | |
| "grad_norm": 0.7658040629902148, | |
| "learning_rate": 1.9458172417006347e-05, | |
| "loss": 0.8088149070739746, | |
| "memory(GiB)": 69.37, | |
| "step": 40, | |
| "train_speed(iter/s)": 0.051441 | |
| }, | |
| { | |
| "epoch": 0.23684210526315788, | |
| "grad_norm": 0.4138057058676066, | |
| "learning_rate": 1.9315910880512792e-05, | |
| "loss": 0.7972204208374023, | |
| "memory(GiB)": 69.37, | |
| "step": 45, | |
| "train_speed(iter/s)": 0.051684 | |
| }, | |
| { | |
| "epoch": 0.25263157894736843, | |
| "eval_loss": 0.7412319183349609, | |
| "eval_runtime": 9.8493, | |
| "eval_samples_per_second": 6.396, | |
| "eval_steps_per_second": 0.203, | |
| "eval_token_acc": 0.7837714954961817, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.2631578947368421, | |
| "grad_norm": 0.40921663738924585, | |
| "learning_rate": 1.9157733266550577e-05, | |
| "loss": 0.8212543487548828, | |
| "memory(GiB)": 69.37, | |
| "step": 50, | |
| "train_speed(iter/s)": 0.051332 | |
| }, | |
| { | |
| "epoch": 0.2894736842105263, | |
| "grad_norm": 0.42411505277750844, | |
| "learning_rate": 1.898390981891979e-05, | |
| "loss": 0.820932674407959, | |
| "memory(GiB)": 69.37, | |
| "step": 55, | |
| "train_speed(iter/s)": 0.051591 | |
| }, | |
| { | |
| "epoch": 0.3157894736842105, | |
| "grad_norm": 0.4372810679248111, | |
| "learning_rate": 1.879473751206489e-05, | |
| "loss": 0.7952181816101074, | |
| "memory(GiB)": 69.37, | |
| "step": 60, | |
| "train_speed(iter/s)": 0.051797 | |
| }, | |
| { | |
| "epoch": 0.3368421052631579, | |
| "eval_loss": 0.7309797406196594, | |
| "eval_runtime": 9.8337, | |
| "eval_samples_per_second": 6.407, | |
| "eval_steps_per_second": 0.203, | |
| "eval_token_acc": 0.7857353697320562, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.34210526315789475, | |
| "grad_norm": 0.4228185459938873, | |
| "learning_rate": 1.8590539543698852e-05, | |
| "loss": 0.8058796882629394, | |
| "memory(GiB)": 69.37, | |
| "step": 65, | |
| "train_speed(iter/s)": 0.051508 | |
| }, | |
| { | |
| "epoch": 0.3684210526315789, | |
| "grad_norm": 0.42601403175611896, | |
| "learning_rate": 1.8371664782625287e-05, | |
| "loss": 0.794715690612793, | |
| "memory(GiB)": 69.37, | |
| "step": 70, | |
| "train_speed(iter/s)": 0.051591 | |
| }, | |
| { | |
| "epoch": 0.39473684210526316, | |
| "grad_norm": 0.4262322625536522, | |
| "learning_rate": 1.813848717270195e-05, | |
| "loss": 0.7831051349639893, | |
| "memory(GiB)": 69.37, | |
| "step": 75, | |
| "train_speed(iter/s)": 0.051639 | |
| }, | |
| { | |
| "epoch": 0.42105263157894735, | |
| "grad_norm": 0.48084059766704834, | |
| "learning_rate": 1.789140509396394e-05, | |
| "loss": 0.77615327835083, | |
| "memory(GiB)": 69.37, | |
| "step": 80, | |
| "train_speed(iter/s)": 0.051759 | |
| }, | |
| { | |
| "epoch": 0.42105263157894735, | |
| "eval_loss": 0.7247982621192932, | |
| "eval_runtime": 9.8915, | |
| "eval_samples_per_second": 6.369, | |
| "eval_steps_per_second": 0.202, | |
| "eval_token_acc": 0.7875040467711527, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.4473684210526316, | |
| "grad_norm": 0.4217600796086705, | |
| "learning_rate": 1.7630840681998068e-05, | |
| "loss": 0.7695322036743164, | |
| "memory(GiB)": 69.37, | |
| "step": 85, | |
| "train_speed(iter/s)": 0.051515 | |
| }, | |
| { | |
| "epoch": 0.47368421052631576, | |
| "grad_norm": 0.4445281215969849, | |
| "learning_rate": 1.735723910673132e-05, | |
| "loss": 0.7695138454437256, | |
| "memory(GiB)": 69.37, | |
| "step": 90, | |
| "train_speed(iter/s)": 0.051651 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.49202631008863995, | |
| "learning_rate": 1.7071067811865477e-05, | |
| "loss": 0.7878283023834228, | |
| "memory(GiB)": 69.37, | |
| "step": 95, | |
| "train_speed(iter/s)": 0.051798 | |
| }, | |
| { | |
| "epoch": 0.5052631578947369, | |
| "eval_loss": 0.7192742824554443, | |
| "eval_runtime": 9.8387, | |
| "eval_samples_per_second": 6.403, | |
| "eval_steps_per_second": 0.203, | |
| "eval_token_acc": 0.7890584829845175, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.5263157894736842, | |
| "grad_norm": 0.5136302863742109, | |
| "learning_rate": 1.6772815716257414e-05, | |
| "loss": 0.8238545417785644, | |
| "memory(GiB)": 69.37, | |
| "step": 100, | |
| "train_speed(iter/s)": 0.051543 | |
| }, | |
| { | |
| "epoch": 0.5526315789473685, | |
| "grad_norm": 0.40357521887804787, | |
| "learning_rate": 1.646299237860941e-05, | |
| "loss": 0.7880066871643067, | |
| "memory(GiB)": 69.37, | |
| "step": 105, | |
| "train_speed(iter/s)": 0.050576 | |
| }, | |
| { | |
| "epoch": 0.5789473684210527, | |
| "grad_norm": 0.42779690136435417, | |
| "learning_rate": 1.6142127126896682e-05, | |
| "loss": 0.7965791702270508, | |
| "memory(GiB)": 69.37, | |
| "step": 110, | |
| "train_speed(iter/s)": 0.050695 | |
| }, | |
| { | |
| "epoch": 0.5894736842105263, | |
| "eval_loss": 0.7136698961257935, | |
| "eval_runtime": 9.8339, | |
| "eval_samples_per_second": 6.406, | |
| "eval_steps_per_second": 0.203, | |
| "eval_token_acc": 0.7901154043914609, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.6052631578947368, | |
| "grad_norm": 0.405474461817072, | |
| "learning_rate": 1.5810768154019386e-05, | |
| "loss": 0.7739765167236328, | |
| "memory(GiB)": 69.68, | |
| "step": 115, | |
| "train_speed(iter/s)": 0.050597 | |
| }, | |
| { | |
| "epoch": 0.631578947368421, | |
| "grad_norm": 0.44011902270506637, | |
| "learning_rate": 1.5469481581224274e-05, | |
| "loss": 0.7681276321411132, | |
| "memory(GiB)": 69.68, | |
| "step": 120, | |
| "train_speed(iter/s)": 0.050764 | |
| }, | |
| { | |
| "epoch": 0.6578947368421053, | |
| "grad_norm": 0.40246980138168537, | |
| "learning_rate": 1.5118850490896012e-05, | |
| "loss": 0.7708123683929443, | |
| "memory(GiB)": 69.68, | |
| "step": 125, | |
| "train_speed(iter/s)": 0.050868 | |
| }, | |
| { | |
| "epoch": 0.6736842105263158, | |
| "eval_loss": 0.7104138135910034, | |
| "eval_runtime": 9.7939, | |
| "eval_samples_per_second": 6.433, | |
| "eval_steps_per_second": 0.204, | |
| "eval_token_acc": 0.790924758622003, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.6842105263157895, | |
| "grad_norm": 0.3791182451908356, | |
| "learning_rate": 1.4759473930370738e-05, | |
| "loss": 0.7606860160827636, | |
| "memory(GiB)": 69.68, | |
| "step": 130, | |
| "train_speed(iter/s)": 0.05075 | |
| }, | |
| { | |
| "epoch": 0.7105263157894737, | |
| "grad_norm": 0.3794671895337737, | |
| "learning_rate": 1.4391965888473705e-05, | |
| "loss": 0.7771711349487305, | |
| "memory(GiB)": 69.68, | |
| "step": 135, | |
| "train_speed(iter/s)": 0.050853 | |
| }, | |
| { | |
| "epoch": 0.7368421052631579, | |
| "grad_norm": 0.37709314975078, | |
| "learning_rate": 1.4016954246529697e-05, | |
| "loss": 0.7493702888488769, | |
| "memory(GiB)": 69.68, | |
| "step": 140, | |
| "train_speed(iter/s)": 0.050943 | |
| }, | |
| { | |
| "epoch": 0.7578947368421053, | |
| "eval_loss": 0.706990122795105, | |
| "eval_runtime": 9.807, | |
| "eval_samples_per_second": 6.424, | |
| "eval_steps_per_second": 0.204, | |
| "eval_token_acc": 0.7914865456761441, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.7631578947368421, | |
| "grad_norm": 0.39953813706935337, | |
| "learning_rate": 1.3635079705638298e-05, | |
| "loss": 0.7736559391021729, | |
| "memory(GiB)": 69.68, | |
| "step": 145, | |
| "train_speed(iter/s)": 0.050829 | |
| }, | |
| { | |
| "epoch": 0.7894736842105263, | |
| "grad_norm": 0.3830666041149983, | |
| "learning_rate": 1.3246994692046837e-05, | |
| "loss": 0.7789090156555176, | |
| "memory(GiB)": 69.68, | |
| "step": 150, | |
| "train_speed(iter/s)": 0.050937 | |
| }, | |
| { | |
| "epoch": 0.8157894736842105, | |
| "grad_norm": 0.421924385981196, | |
| "learning_rate": 1.2853362242491054e-05, | |
| "loss": 0.7540519714355469, | |
| "memory(GiB)": 69.68, | |
| "step": 155, | |
| "train_speed(iter/s)": 0.050961 | |
| }, | |
| { | |
| "epoch": 0.8421052631578947, | |
| "grad_norm": 0.4181935666985859, | |
| "learning_rate": 1.2454854871407993e-05, | |
| "loss": 0.7660489082336426, | |
| "memory(GiB)": 69.68, | |
| "step": 160, | |
| "train_speed(iter/s)": 0.051056 | |
| }, | |
| { | |
| "epoch": 0.8421052631578947, | |
| "eval_loss": 0.7034122347831726, | |
| "eval_runtime": 9.8416, | |
| "eval_samples_per_second": 6.401, | |
| "eval_steps_per_second": 0.203, | |
| "eval_token_acc": 0.7925434670830873, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.868421052631579, | |
| "grad_norm": 0.404345099061113, | |
| "learning_rate": 1.2052153421956343e-05, | |
| "loss": 0.7705528259277343, | |
| "memory(GiB)": 69.68, | |
| "step": 165, | |
| "train_speed(iter/s)": 0.050964 | |
| }, | |
| { | |
| "epoch": 0.8947368421052632, | |
| "grad_norm": 0.4116934893822773, | |
| "learning_rate": 1.164594590280734e-05, | |
| "loss": 0.7849107265472413, | |
| "memory(GiB)": 69.68, | |
| "step": 170, | |
| "train_speed(iter/s)": 0.051044 | |
| }, | |
| { | |
| "epoch": 0.9210526315789473, | |
| "grad_norm": 0.4090058942775372, | |
| "learning_rate": 1.123692631269348e-05, | |
| "loss": 0.757164478302002, | |
| "memory(GiB)": 69.68, | |
| "step": 175, | |
| "train_speed(iter/s)": 0.051102 | |
| }, | |
| { | |
| "epoch": 0.9263157894736842, | |
| "eval_loss": 0.7005300521850586, | |
| "eval_runtime": 9.7897, | |
| "eval_samples_per_second": 6.435, | |
| "eval_steps_per_second": 0.204, | |
| "eval_token_acc": 0.7931933309211403, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.9473684210526315, | |
| "grad_norm": 0.3661599756724907, | |
| "learning_rate": 1.0825793454723325e-05, | |
| "loss": 0.7720006942749024, | |
| "memory(GiB)": 69.68, | |
| "step": 180, | |
| "train_speed(iter/s)": 0.051016 | |
| }, | |
| { | |
| "epoch": 0.9736842105263158, | |
| "grad_norm": 0.40343584878225963, | |
| "learning_rate": 1.0413249742488132e-05, | |
| "loss": 0.7824891090393067, | |
| "memory(GiB)": 69.68, | |
| "step": 185, | |
| "train_speed(iter/s)": 0.051107 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.41506034439679096, | |
| "learning_rate": 1e-05, | |
| "loss": 0.7296091079711914, | |
| "memory(GiB)": 69.68, | |
| "step": 190, | |
| "train_speed(iter/s)": 0.051173 | |
| }, | |
| { | |
| "epoch": 1.0105263157894737, | |
| "eval_loss": 0.7000582218170166, | |
| "eval_runtime": 9.8558, | |
| "eval_samples_per_second": 6.392, | |
| "eval_steps_per_second": 0.203, | |
| "eval_token_acc": 0.7939455542648207, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 1.0263157894736843, | |
| "grad_norm": 0.38246407542421074, | |
| "learning_rate": 9.586750257511868e-06, | |
| "loss": 0.6210448265075683, | |
| "memory(GiB)": 69.68, | |
| "step": 195, | |
| "train_speed(iter/s)": 0.05107 | |
| }, | |
| { | |
| "epoch": 1.0526315789473684, | |
| "grad_norm": 0.4086238383459442, | |
| "learning_rate": 9.174206545276678e-06, | |
| "loss": 0.6003309726715088, | |
| "memory(GiB)": 69.68, | |
| "step": 200, | |
| "train_speed(iter/s)": 0.051127 | |
| }, | |
| { | |
| "epoch": 1.0789473684210527, | |
| "grad_norm": 0.5181020519262887, | |
| "learning_rate": 8.763073687306523e-06, | |
| "loss": 0.607297945022583, | |
| "memory(GiB)": 69.68, | |
| "step": 205, | |
| "train_speed(iter/s)": 0.050639 | |
| }, | |
| { | |
| "epoch": 1.0947368421052632, | |
| "eval_loss": 0.7147046327590942, | |
| "eval_runtime": 10.3765, | |
| "eval_samples_per_second": 6.071, | |
| "eval_steps_per_second": 0.193, | |
| "eval_token_acc": 0.7924292053093638, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 1.1052631578947367, | |
| "grad_norm": 0.37648543638620585, | |
| "learning_rate": 8.35405409719266e-06, | |
| "loss": 0.6064910888671875, | |
| "memory(GiB)": 69.68, | |
| "step": 210, | |
| "train_speed(iter/s)": 0.050556 | |
| }, | |
| { | |
| "epoch": 1.131578947368421, | |
| "grad_norm": 0.49314042147480297, | |
| "learning_rate": 7.947846578043658e-06, | |
| "loss": 0.5927825927734375, | |
| "memory(GiB)": 69.68, | |
| "step": 215, | |
| "train_speed(iter/s)": 0.050629 | |
| }, | |
| { | |
| "epoch": 1.1578947368421053, | |
| "grad_norm": 0.3822092612971886, | |
| "learning_rate": 7.545145128592009e-06, | |
| "loss": 0.5842184543609619, | |
| "memory(GiB)": 69.68, | |
| "step": 220, | |
| "train_speed(iter/s)": 0.050696 | |
| }, | |
| { | |
| "epoch": 1.1789473684210527, | |
| "eval_loss": 0.7113193273544312, | |
| "eval_runtime": 9.9005, | |
| "eval_samples_per_second": 6.363, | |
| "eval_steps_per_second": 0.202, | |
| "eval_token_acc": 0.7929076764868314, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 1.1842105263157894, | |
| "grad_norm": 0.4045395338102251, | |
| "learning_rate": 7.14663775750895e-06, | |
| "loss": 0.6170342445373536, | |
| "memory(GiB)": 69.68, | |
| "step": 225, | |
| "train_speed(iter/s)": 0.050639 | |
| }, | |
| { | |
| "epoch": 1.2105263157894737, | |
| "grad_norm": 0.3917516152680252, | |
| "learning_rate": 6.7530053079531664e-06, | |
| "loss": 0.6077111244201661, | |
| "memory(GiB)": 69.68, | |
| "step": 230, | |
| "train_speed(iter/s)": 0.0507 | |
| }, | |
| { | |
| "epoch": 1.236842105263158, | |
| "grad_norm": 0.37092803642068417, | |
| "learning_rate": 6.364920294361701e-06, | |
| "loss": 0.5995924472808838, | |
| "memory(GiB)": 69.68, | |
| "step": 235, | |
| "train_speed(iter/s)": 0.050762 | |
| }, | |
| { | |
| "epoch": 1.263157894736842, | |
| "grad_norm": 0.3904513301842613, | |
| "learning_rate": 5.983045753470308e-06, | |
| "loss": 0.5835226058959961, | |
| "memory(GiB)": 69.68, | |
| "step": 240, | |
| "train_speed(iter/s)": 0.050813 | |
| }, | |
| { | |
| "epoch": 1.263157894736842, | |
| "eval_loss": 0.7080445289611816, | |
| "eval_runtime": 9.9199, | |
| "eval_samples_per_second": 6.351, | |
| "eval_steps_per_second": 0.202, | |
| "eval_token_acc": 0.7932052331892365, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.2894736842105263, | |
| "grad_norm": 0.3419804528738108, | |
| "learning_rate": 5.608034111526298e-06, | |
| "loss": 0.5964583396911621, | |
| "memory(GiB)": 69.68, | |
| "step": 245, | |
| "train_speed(iter/s)": 0.05074 | |
| }, | |
| { | |
| "epoch": 1.3157894736842106, | |
| "grad_norm": 0.3778594571130598, | |
| "learning_rate": 5.240526069629265e-06, | |
| "loss": 0.6174930572509766, | |
| "memory(GiB)": 69.68, | |
| "step": 250, | |
| "train_speed(iter/s)": 0.050809 | |
| }, | |
| { | |
| "epoch": 1.3421052631578947, | |
| "grad_norm": 0.3454812999080356, | |
| "learning_rate": 4.881149509103993e-06, | |
| "loss": 0.6054057121276856, | |
| "memory(GiB)": 69.68, | |
| "step": 255, | |
| "train_speed(iter/s)": 0.050865 | |
| }, | |
| { | |
| "epoch": 1.3473684210526315, | |
| "eval_loss": 0.7069370746612549, | |
| "eval_runtime": 9.9384, | |
| "eval_samples_per_second": 6.339, | |
| "eval_steps_per_second": 0.201, | |
| "eval_token_acc": 0.7936860848203233, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 1.368421052631579, | |
| "grad_norm": 0.3459021630061238, | |
| "learning_rate": 4.530518418775734e-06, | |
| "loss": 0.6047042846679688, | |
| "memory(GiB)": 69.68, | |
| "step": 260, | |
| "train_speed(iter/s)": 0.050799 | |
| }, | |
| { | |
| "epoch": 1.3947368421052633, | |
| "grad_norm": 0.3670462674615966, | |
| "learning_rate": 4.189231845980618e-06, | |
| "loss": 0.5852503776550293, | |
| "memory(GiB)": 69.68, | |
| "step": 265, | |
| "train_speed(iter/s)": 0.050844 | |
| }, | |
| { | |
| "epoch": 1.4210526315789473, | |
| "grad_norm": 0.36764701477111333, | |
| "learning_rate": 3.857872873103322e-06, | |
| "loss": 0.6259764671325684, | |
| "memory(GiB)": 69.68, | |
| "step": 270, | |
| "train_speed(iter/s)": 0.050909 | |
| }, | |
| { | |
| "epoch": 1.431578947368421, | |
| "eval_loss": 0.7064932584762573, | |
| "eval_runtime": 9.8662, | |
| "eval_samples_per_second": 6.385, | |
| "eval_steps_per_second": 0.203, | |
| "eval_token_acc": 0.7939836415227286, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 1.4473684210526316, | |
| "grad_norm": 0.3707913076135043, | |
| "learning_rate": 3.5370076213905904e-06, | |
| "loss": 0.5928922653198242, | |
| "memory(GiB)": 69.68, | |
| "step": 275, | |
| "train_speed(iter/s)": 0.050851 | |
| }, | |
| { | |
| "epoch": 1.4736842105263157, | |
| "grad_norm": 0.3282385891332036, | |
| "learning_rate": 3.2271842837425917e-06, | |
| "loss": 0.5990563869476319, | |
| "memory(GiB)": 69.68, | |
| "step": 280, | |
| "train_speed(iter/s)": 0.050922 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 0.30485631330802376, | |
| "learning_rate": 2.9289321881345257e-06, | |
| "loss": 0.5840599060058593, | |
| "memory(GiB)": 69.68, | |
| "step": 285, | |
| "train_speed(iter/s)": 0.050976 | |
| }, | |
| { | |
| "epoch": 1.5157894736842106, | |
| "eval_loss": 0.7049282789230347, | |
| "eval_runtime": 9.8976, | |
| "eval_samples_per_second": 6.365, | |
| "eval_steps_per_second": 0.202, | |
| "eval_token_acc": 0.7940907619355945, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 1.526315789473684, | |
| "grad_norm": 0.3541905065359078, | |
| "learning_rate": 2.642760893268684e-06, | |
| "loss": 0.6048089027404785, | |
| "memory(GiB)": 69.68, | |
| "step": 290, | |
| "train_speed(iter/s)": 0.050888 | |
| }, | |
| { | |
| "epoch": 1.5526315789473686, | |
| "grad_norm": 0.33886535203860046, | |
| "learning_rate": 2.369159318001937e-06, | |
| "loss": 0.6013635635375977, | |
| "memory(GiB)": 69.68, | |
| "step": 295, | |
| "train_speed(iter/s)": 0.050935 | |
| }, | |
| { | |
| "epoch": 1.5789473684210527, | |
| "grad_norm": 0.3560564200551741, | |
| "learning_rate": 2.1085949060360654e-06, | |
| "loss": 0.5912067413330078, | |
| "memory(GiB)": 69.68, | |
| "step": 300, | |
| "train_speed(iter/s)": 0.050982 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_loss": 0.7051891088485718, | |
| "eval_runtime": 10.0734, | |
| "eval_samples_per_second": 6.254, | |
| "eval_steps_per_second": 0.199, | |
| "eval_token_acc": 0.7941740778122679, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 1.6052631578947367, | |
| "grad_norm": 0.4101933630853145, | |
| "learning_rate": 1.861512827298051e-06, | |
| "loss": 0.6162192344665527, | |
| "memory(GiB)": 69.68, | |
| "step": 305, | |
| "train_speed(iter/s)": 0.05055 | |
| }, | |
| { | |
| "epoch": 1.631578947368421, | |
| "grad_norm": 0.35731263913788486, | |
| "learning_rate": 1.6283352173747148e-06, | |
| "loss": 0.5980302810668945, | |
| "memory(GiB)": 69.68, | |
| "step": 310, | |
| "train_speed(iter/s)": 0.050574 | |
| }, | |
| { | |
| "epoch": 1.6578947368421053, | |
| "grad_norm": 0.3254781881255832, | |
| "learning_rate": 1.409460456301147e-06, | |
| "loss": 0.6072481155395508, | |
| "memory(GiB)": 69.68, | |
| "step": 315, | |
| "train_speed(iter/s)": 0.050633 | |
| }, | |
| { | |
| "epoch": 1.6842105263157894, | |
| "grad_norm": 0.34740054250648605, | |
| "learning_rate": 1.2052624879351105e-06, | |
| "loss": 0.590502119064331, | |
| "memory(GiB)": 69.68, | |
| "step": 320, | |
| "train_speed(iter/s)": 0.050687 | |
| }, | |
| { | |
| "epoch": 1.6842105263157894, | |
| "eval_loss": 0.705406665802002, | |
| "eval_runtime": 9.9732, | |
| "eval_samples_per_second": 6.317, | |
| "eval_steps_per_second": 0.201, | |
| "eval_token_acc": 0.7945025804117233, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.7105263157894737, | |
| "grad_norm": 0.3669361360683435, | |
| "learning_rate": 1.0160901810802114e-06, | |
| "loss": 0.6109643936157226, | |
| "memory(GiB)": 69.68, | |
| "step": 325, | |
| "train_speed(iter/s)": 0.050646 | |
| }, | |
| { | |
| "epoch": 1.736842105263158, | |
| "grad_norm": 0.35255244252893525, | |
| "learning_rate": 8.42266733449425e-07, | |
| "loss": 0.598551607131958, | |
| "memory(GiB)": 69.68, | |
| "step": 330, | |
| "train_speed(iter/s)": 0.050696 | |
| }, | |
| { | |
| "epoch": 1.763157894736842, | |
| "grad_norm": 0.40857052465226457, | |
| "learning_rate": 6.840891194872112e-07, | |
| "loss": 0.5822429656982422, | |
| "memory(GiB)": 69.68, | |
| "step": 335, | |
| "train_speed(iter/s)": 0.050739 | |
| }, | |
| { | |
| "epoch": 1.768421052631579, | |
| "eval_loss": 0.7043415307998657, | |
| "eval_runtime": 10.0333, | |
| "eval_samples_per_second": 6.279, | |
| "eval_steps_per_second": 0.199, | |
| "eval_token_acc": 0.7944002209060959, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 1.7894736842105263, | |
| "grad_norm": 0.33686853385429094, | |
| "learning_rate": 5.418275829936537e-07, | |
| "loss": 0.5957964897155762, | |
| "memory(GiB)": 69.68, | |
| "step": 340, | |
| "train_speed(iter/s)": 0.05072 | |
| }, | |
| { | |
| "epoch": 1.8157894736842106, | |
| "grad_norm": 0.3347165001043132, | |
| "learning_rate": 4.1572517541747294e-07, | |
| "loss": 0.5949300765991211, | |
| "memory(GiB)": 69.68, | |
| "step": 345, | |
| "train_speed(iter/s)": 0.050761 | |
| }, | |
| { | |
| "epoch": 1.8421052631578947, | |
| "grad_norm": 0.3526011133437637, | |
| "learning_rate": 3.059973406066963e-07, | |
| "loss": 0.6076947689056397, | |
| "memory(GiB)": 69.68, | |
| "step": 350, | |
| "train_speed(iter/s)": 0.050779 | |
| }, | |
| { | |
| "epoch": 1.8526315789473684, | |
| "eval_loss": 0.7042702436447144, | |
| "eval_runtime": 9.8892, | |
| "eval_samples_per_second": 6.371, | |
| "eval_steps_per_second": 0.202, | |
| "eval_token_acc": 0.7943549922873303, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 1.868421052631579, | |
| "grad_norm": 0.34715798268979603, | |
| "learning_rate": 2.1283154672645522e-07, | |
| "loss": 0.5999661445617676, | |
| "memory(GiB)": 69.68, | |
| "step": 355, | |
| "train_speed(iter/s)": 0.050733 | |
| }, | |
| { | |
| "epoch": 1.8947368421052633, | |
| "grad_norm": 0.3528812895767884, | |
| "learning_rate": 1.3638696597277678e-07, | |
| "loss": 0.583561372756958, | |
| "memory(GiB)": 69.68, | |
| "step": 360, | |
| "train_speed(iter/s)": 0.050768 | |
| }, | |
| { | |
| "epoch": 1.9210526315789473, | |
| "grad_norm": 0.330353105837426, | |
| "learning_rate": 7.679420262954984e-08, | |
| "loss": 0.5857907295227051, | |
| "memory(GiB)": 69.68, | |
| "step": 365, | |
| "train_speed(iter/s)": 0.050822 | |
| }, | |
| { | |
| "epoch": 1.936842105263158, | |
| "eval_loss": 0.7041947841644287, | |
| "eval_runtime": 9.9348, | |
| "eval_samples_per_second": 6.341, | |
| "eval_steps_per_second": 0.201, | |
| "eval_token_acc": 0.7944621127001962, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 1.9473684210526314, | |
| "grad_norm": 0.3549687395322014, | |
| "learning_rate": 3.4155069933301535e-08, | |
| "loss": 0.6037016868591308, | |
| "memory(GiB)": 69.68, | |
| "step": 370, | |
| "train_speed(iter/s)": 0.050781 | |
| }, | |
| { | |
| "epoch": 1.973684210526316, | |
| "grad_norm": 0.3394111981962662, | |
| "learning_rate": 8.542416126989805e-09, | |
| "loss": 0.6164089202880859, | |
| "memory(GiB)": 69.68, | |
| "step": 375, | |
| "train_speed(iter/s)": 0.050825 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.3439629679872151, | |
| "learning_rate": 0.0, | |
| "loss": 0.6200397491455079, | |
| "memory(GiB)": 69.68, | |
| "step": 380, | |
| "train_speed(iter/s)": 0.050867 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.7041846513748169, | |
| "eval_runtime": 9.8952, | |
| "eval_samples_per_second": 6.367, | |
| "eval_steps_per_second": 0.202, | |
| "eval_token_acc": 0.7943787968235226, | |
| "step": 380 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 380, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2673065754361856.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |