{ "best_global_step": 192, "best_metric": 0.70005822, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 16, "global_step": 380, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005263157894736842, "grad_norm": 9.152583101647197, "learning_rate": 1.9999658256641746e-05, "loss": 1.3120107650756836, "memory(GiB)": 57.01, "step": 1, "train_speed(iter/s)": 0.036088 }, { "epoch": 0.02631578947368421, "grad_norm": 2.1260795018452656, "learning_rate": 1.999145758387301e-05, "loss": 0.9991766810417175, "memory(GiB)": 57.01, "step": 5, "train_speed(iter/s)": 0.048355 }, { "epoch": 0.05263157894736842, "grad_norm": 0.7836553551546025, "learning_rate": 1.99658449300667e-05, "loss": 0.9186409950256348, "memory(GiB)": 57.01, "step": 10, "train_speed(iter/s)": 0.049883 }, { "epoch": 0.07894736842105263, "grad_norm": 0.6412562781618624, "learning_rate": 1.992320579737045e-05, "loss": 0.8510271072387695, "memory(GiB)": 57.01, "step": 15, "train_speed(iter/s)": 0.051154 }, { "epoch": 0.08421052631578947, "eval_loss": 0.7916765213012695, "eval_runtime": 9.8416, "eval_samples_per_second": 6.401, "eval_steps_per_second": 0.203, "eval_token_acc": 0.7726357334653692, "step": 16 }, { "epoch": 0.10526315789473684, "grad_norm": 0.49696683676371833, "learning_rate": 1.9863613034027224e-05, "loss": 0.8826911926269532, "memory(GiB)": 69.3, "step": 20, "train_speed(iter/s)": 0.050284 }, { "epoch": 0.13157894736842105, "grad_norm": 0.5207448076725332, "learning_rate": 1.9787168453273546e-05, "loss": 0.831389045715332, "memory(GiB)": 69.3, "step": 25, "train_speed(iter/s)": 0.050966 }, { "epoch": 0.15789473684210525, "grad_norm": 0.5062200984947397, "learning_rate": 1.9694002659393306e-05, "loss": 0.8272812843322754, "memory(GiB)": 69.3, "step": 30, "train_speed(iter/s)": 0.051443 }, { "epoch": 0.16842105263157894, "eval_loss": 0.7559030652046204, "eval_runtime": 9.8411, "eval_samples_per_second": 6.402, "eval_steps_per_second": 0.203, "eval_token_acc": 0.7801532059949344, "step": 32 }, { "epoch": 0.18421052631578946, "grad_norm": 0.4939312053721415, "learning_rate": 1.958427482458253e-05, "loss": 0.8164405822753906, "memory(GiB)": 69.37, "step": 35, "train_speed(iter/s)": 0.051059 }, { "epoch": 0.21052631578947367, "grad_norm": 0.7658040629902148, "learning_rate": 1.9458172417006347e-05, "loss": 0.8088149070739746, "memory(GiB)": 69.37, "step": 40, "train_speed(iter/s)": 0.051441 }, { "epoch": 0.23684210526315788, "grad_norm": 0.4138057058676066, "learning_rate": 1.9315910880512792e-05, "loss": 0.7972204208374023, "memory(GiB)": 69.37, "step": 45, "train_speed(iter/s)": 0.051684 }, { "epoch": 0.25263157894736843, "eval_loss": 0.7412319183349609, "eval_runtime": 9.8493, "eval_samples_per_second": 6.396, "eval_steps_per_second": 0.203, "eval_token_acc": 0.7837714954961817, "step": 48 }, { "epoch": 0.2631578947368421, "grad_norm": 0.40921663738924585, "learning_rate": 1.9157733266550577e-05, "loss": 0.8212543487548828, "memory(GiB)": 69.37, "step": 50, "train_speed(iter/s)": 0.051332 }, { "epoch": 0.2894736842105263, "grad_norm": 0.42411505277750844, "learning_rate": 1.898390981891979e-05, "loss": 0.820932674407959, "memory(GiB)": 69.37, "step": 55, "train_speed(iter/s)": 0.051591 }, { "epoch": 0.3157894736842105, "grad_norm": 0.4372810679248111, "learning_rate": 1.879473751206489e-05, "loss": 0.7952181816101074, "memory(GiB)": 69.37, "step": 60, "train_speed(iter/s)": 0.051797 }, { "epoch": 0.3368421052631579, "eval_loss": 0.7309797406196594, "eval_runtime": 9.8337, "eval_samples_per_second": 6.407, "eval_steps_per_second": 0.203, "eval_token_acc": 0.7857353697320562, "step": 64 }, { "epoch": 0.34210526315789475, "grad_norm": 0.4228185459938873, "learning_rate": 1.8590539543698852e-05, "loss": 0.8058796882629394, "memory(GiB)": 69.37, "step": 65, "train_speed(iter/s)": 0.051508 }, { "epoch": 0.3684210526315789, "grad_norm": 0.42601403175611896, "learning_rate": 1.8371664782625287e-05, "loss": 0.794715690612793, "memory(GiB)": 69.37, "step": 70, "train_speed(iter/s)": 0.051591 }, { "epoch": 0.39473684210526316, "grad_norm": 0.4262322625536522, "learning_rate": 1.813848717270195e-05, "loss": 0.7831051349639893, "memory(GiB)": 69.37, "step": 75, "train_speed(iter/s)": 0.051639 }, { "epoch": 0.42105263157894735, "grad_norm": 0.48084059766704834, "learning_rate": 1.789140509396394e-05, "loss": 0.77615327835083, "memory(GiB)": 69.37, "step": 80, "train_speed(iter/s)": 0.051759 }, { "epoch": 0.42105263157894735, "eval_loss": 0.7247982621192932, "eval_runtime": 9.8915, "eval_samples_per_second": 6.369, "eval_steps_per_second": 0.202, "eval_token_acc": 0.7875040467711527, "step": 80 }, { "epoch": 0.4473684210526316, "grad_norm": 0.4217600796086705, "learning_rate": 1.7630840681998068e-05, "loss": 0.7695322036743164, "memory(GiB)": 69.37, "step": 85, "train_speed(iter/s)": 0.051515 }, { "epoch": 0.47368421052631576, "grad_norm": 0.4445281215969849, "learning_rate": 1.735723910673132e-05, "loss": 0.7695138454437256, "memory(GiB)": 69.37, "step": 90, "train_speed(iter/s)": 0.051651 }, { "epoch": 0.5, "grad_norm": 0.49202631008863995, "learning_rate": 1.7071067811865477e-05, "loss": 0.7878283023834228, "memory(GiB)": 69.37, "step": 95, "train_speed(iter/s)": 0.051798 }, { "epoch": 0.5052631578947369, "eval_loss": 0.7192742824554443, "eval_runtime": 9.8387, "eval_samples_per_second": 6.403, "eval_steps_per_second": 0.203, "eval_token_acc": 0.7890584829845175, "step": 96 }, { "epoch": 0.5263157894736842, "grad_norm": 0.5136302863742109, "learning_rate": 1.6772815716257414e-05, "loss": 0.8238545417785644, "memory(GiB)": 69.37, "step": 100, "train_speed(iter/s)": 0.051543 }, { "epoch": 0.5526315789473685, "grad_norm": 0.40357521887804787, "learning_rate": 1.646299237860941e-05, "loss": 0.7880066871643067, "memory(GiB)": 69.37, "step": 105, "train_speed(iter/s)": 0.050576 }, { "epoch": 0.5789473684210527, "grad_norm": 0.42779690136435417, "learning_rate": 1.6142127126896682e-05, "loss": 0.7965791702270508, "memory(GiB)": 69.37, "step": 110, "train_speed(iter/s)": 0.050695 }, { "epoch": 0.5894736842105263, "eval_loss": 0.7136698961257935, "eval_runtime": 9.8339, "eval_samples_per_second": 6.406, "eval_steps_per_second": 0.203, "eval_token_acc": 0.7901154043914609, "step": 112 }, { "epoch": 0.6052631578947368, "grad_norm": 0.405474461817072, "learning_rate": 1.5810768154019386e-05, "loss": 0.7739765167236328, "memory(GiB)": 69.68, "step": 115, "train_speed(iter/s)": 0.050597 }, { "epoch": 0.631578947368421, "grad_norm": 0.44011902270506637, "learning_rate": 1.5469481581224274e-05, "loss": 0.7681276321411132, "memory(GiB)": 69.68, "step": 120, "train_speed(iter/s)": 0.050764 }, { "epoch": 0.6578947368421053, "grad_norm": 0.40246980138168537, "learning_rate": 1.5118850490896012e-05, "loss": 0.7708123683929443, "memory(GiB)": 69.68, "step": 125, "train_speed(iter/s)": 0.050868 }, { "epoch": 0.6736842105263158, "eval_loss": 0.7104138135910034, "eval_runtime": 9.7939, "eval_samples_per_second": 6.433, "eval_steps_per_second": 0.204, "eval_token_acc": 0.790924758622003, "step": 128 }, { "epoch": 0.6842105263157895, "grad_norm": 0.3791182451908356, "learning_rate": 1.4759473930370738e-05, "loss": 0.7606860160827636, "memory(GiB)": 69.68, "step": 130, "train_speed(iter/s)": 0.05075 }, { "epoch": 0.7105263157894737, "grad_norm": 0.3794671895337737, "learning_rate": 1.4391965888473705e-05, "loss": 0.7771711349487305, "memory(GiB)": 69.68, "step": 135, "train_speed(iter/s)": 0.050853 }, { "epoch": 0.7368421052631579, "grad_norm": 0.37709314975078, "learning_rate": 1.4016954246529697e-05, "loss": 0.7493702888488769, "memory(GiB)": 69.68, "step": 140, "train_speed(iter/s)": 0.050943 }, { "epoch": 0.7578947368421053, "eval_loss": 0.706990122795105, "eval_runtime": 9.807, "eval_samples_per_second": 6.424, "eval_steps_per_second": 0.204, "eval_token_acc": 0.7914865456761441, "step": 144 }, { "epoch": 0.7631578947368421, "grad_norm": 0.39953813706935337, "learning_rate": 1.3635079705638298e-05, "loss": 0.7736559391021729, "memory(GiB)": 69.68, "step": 145, "train_speed(iter/s)": 0.050829 }, { "epoch": 0.7894736842105263, "grad_norm": 0.3830666041149983, "learning_rate": 1.3246994692046837e-05, "loss": 0.7789090156555176, "memory(GiB)": 69.68, "step": 150, "train_speed(iter/s)": 0.050937 }, { "epoch": 0.8157894736842105, "grad_norm": 0.421924385981196, "learning_rate": 1.2853362242491054e-05, "loss": 0.7540519714355469, "memory(GiB)": 69.68, "step": 155, "train_speed(iter/s)": 0.050961 }, { "epoch": 0.8421052631578947, "grad_norm": 0.4181935666985859, "learning_rate": 1.2454854871407993e-05, "loss": 0.7660489082336426, "memory(GiB)": 69.68, "step": 160, "train_speed(iter/s)": 0.051056 }, { "epoch": 0.8421052631578947, "eval_loss": 0.7034122347831726, "eval_runtime": 9.8416, "eval_samples_per_second": 6.401, "eval_steps_per_second": 0.203, "eval_token_acc": 0.7925434670830873, "step": 160 }, { "epoch": 0.868421052631579, "grad_norm": 0.404345099061113, "learning_rate": 1.2052153421956343e-05, "loss": 0.7705528259277343, "memory(GiB)": 69.68, "step": 165, "train_speed(iter/s)": 0.050964 }, { "epoch": 0.8947368421052632, "grad_norm": 0.4116934893822773, "learning_rate": 1.164594590280734e-05, "loss": 0.7849107265472413, "memory(GiB)": 69.68, "step": 170, "train_speed(iter/s)": 0.051044 }, { "epoch": 0.9210526315789473, "grad_norm": 0.4090058942775372, "learning_rate": 1.123692631269348e-05, "loss": 0.757164478302002, "memory(GiB)": 69.68, "step": 175, "train_speed(iter/s)": 0.051102 }, { "epoch": 0.9263157894736842, "eval_loss": 0.7005300521850586, "eval_runtime": 9.7897, "eval_samples_per_second": 6.435, "eval_steps_per_second": 0.204, "eval_token_acc": 0.7931933309211403, "step": 176 }, { "epoch": 0.9473684210526315, "grad_norm": 0.3661599756724907, "learning_rate": 1.0825793454723325e-05, "loss": 0.7720006942749024, "memory(GiB)": 69.68, "step": 180, "train_speed(iter/s)": 0.051016 }, { "epoch": 0.9736842105263158, "grad_norm": 0.40343584878225963, "learning_rate": 1.0413249742488132e-05, "loss": 0.7824891090393067, "memory(GiB)": 69.68, "step": 185, "train_speed(iter/s)": 0.051107 }, { "epoch": 1.0, "grad_norm": 0.41506034439679096, "learning_rate": 1e-05, "loss": 0.7296091079711914, "memory(GiB)": 69.68, "step": 190, "train_speed(iter/s)": 0.051173 }, { "epoch": 1.0105263157894737, "eval_loss": 0.7000582218170166, "eval_runtime": 9.8558, "eval_samples_per_second": 6.392, "eval_steps_per_second": 0.203, "eval_token_acc": 0.7939455542648207, "step": 192 }, { "epoch": 1.0263157894736843, "grad_norm": 0.38246407542421074, "learning_rate": 9.586750257511868e-06, "loss": 0.6210448265075683, "memory(GiB)": 69.68, "step": 195, "train_speed(iter/s)": 0.05107 }, { "epoch": 1.0526315789473684, "grad_norm": 0.4086238383459442, "learning_rate": 9.174206545276678e-06, "loss": 0.6003309726715088, "memory(GiB)": 69.68, "step": 200, "train_speed(iter/s)": 0.051127 }, { "epoch": 1.0789473684210527, "grad_norm": 0.5181020519262887, "learning_rate": 8.763073687306523e-06, "loss": 0.607297945022583, "memory(GiB)": 69.68, "step": 205, "train_speed(iter/s)": 0.050639 }, { "epoch": 1.0947368421052632, "eval_loss": 0.7147046327590942, "eval_runtime": 10.3765, "eval_samples_per_second": 6.071, "eval_steps_per_second": 0.193, "eval_token_acc": 0.7924292053093638, "step": 208 }, { "epoch": 1.1052631578947367, "grad_norm": 0.37648543638620585, "learning_rate": 8.35405409719266e-06, "loss": 0.6064910888671875, "memory(GiB)": 69.68, "step": 210, "train_speed(iter/s)": 0.050556 }, { "epoch": 1.131578947368421, "grad_norm": 0.49314042147480297, "learning_rate": 7.947846578043658e-06, "loss": 0.5927825927734375, "memory(GiB)": 69.68, "step": 215, "train_speed(iter/s)": 0.050629 }, { "epoch": 1.1578947368421053, "grad_norm": 0.3822092612971886, "learning_rate": 7.545145128592009e-06, "loss": 0.5842184543609619, "memory(GiB)": 69.68, "step": 220, "train_speed(iter/s)": 0.050696 }, { "epoch": 1.1789473684210527, "eval_loss": 0.7113193273544312, "eval_runtime": 9.9005, "eval_samples_per_second": 6.363, "eval_steps_per_second": 0.202, "eval_token_acc": 0.7929076764868314, "step": 224 }, { "epoch": 1.1842105263157894, "grad_norm": 0.4045395338102251, "learning_rate": 7.14663775750895e-06, "loss": 0.6170342445373536, "memory(GiB)": 69.68, "step": 225, "train_speed(iter/s)": 0.050639 }, { "epoch": 1.2105263157894737, "grad_norm": 0.3917516152680252, "learning_rate": 6.7530053079531664e-06, "loss": 0.6077111244201661, "memory(GiB)": 69.68, "step": 230, "train_speed(iter/s)": 0.0507 }, { "epoch": 1.236842105263158, "grad_norm": 0.37092803642068417, "learning_rate": 6.364920294361701e-06, "loss": 0.5995924472808838, "memory(GiB)": 69.68, "step": 235, "train_speed(iter/s)": 0.050762 }, { "epoch": 1.263157894736842, "grad_norm": 0.3904513301842613, "learning_rate": 5.983045753470308e-06, "loss": 0.5835226058959961, "memory(GiB)": 69.68, "step": 240, "train_speed(iter/s)": 0.050813 }, { "epoch": 1.263157894736842, "eval_loss": 0.7080445289611816, "eval_runtime": 9.9199, "eval_samples_per_second": 6.351, "eval_steps_per_second": 0.202, "eval_token_acc": 0.7932052331892365, "step": 240 }, { "epoch": 1.2894736842105263, "grad_norm": 0.3419804528738108, "learning_rate": 5.608034111526298e-06, "loss": 0.5964583396911621, "memory(GiB)": 69.68, "step": 245, "train_speed(iter/s)": 0.05074 }, { "epoch": 1.3157894736842106, "grad_norm": 0.3778594571130598, "learning_rate": 5.240526069629265e-06, "loss": 0.6174930572509766, "memory(GiB)": 69.68, "step": 250, "train_speed(iter/s)": 0.050809 }, { "epoch": 1.3421052631578947, "grad_norm": 0.3454812999080356, "learning_rate": 4.881149509103993e-06, "loss": 0.6054057121276856, "memory(GiB)": 69.68, "step": 255, "train_speed(iter/s)": 0.050865 }, { "epoch": 1.3473684210526315, "eval_loss": 0.7069370746612549, "eval_runtime": 9.9384, "eval_samples_per_second": 6.339, "eval_steps_per_second": 0.201, "eval_token_acc": 0.7936860848203233, "step": 256 }, { "epoch": 1.368421052631579, "grad_norm": 0.3459021630061238, "learning_rate": 4.530518418775734e-06, "loss": 0.6047042846679688, "memory(GiB)": 69.68, "step": 260, "train_speed(iter/s)": 0.050799 }, { "epoch": 1.3947368421052633, "grad_norm": 0.3670462674615966, "learning_rate": 4.189231845980618e-06, "loss": 0.5852503776550293, "memory(GiB)": 69.68, "step": 265, "train_speed(iter/s)": 0.050844 }, { "epoch": 1.4210526315789473, "grad_norm": 0.36764701477111333, "learning_rate": 3.857872873103322e-06, "loss": 0.6259764671325684, "memory(GiB)": 69.68, "step": 270, "train_speed(iter/s)": 0.050909 }, { "epoch": 1.431578947368421, "eval_loss": 0.7064932584762573, "eval_runtime": 9.8662, "eval_samples_per_second": 6.385, "eval_steps_per_second": 0.203, "eval_token_acc": 0.7939836415227286, "step": 272 }, { "epoch": 1.4473684210526316, "grad_norm": 0.3707913076135043, "learning_rate": 3.5370076213905904e-06, "loss": 0.5928922653198242, "memory(GiB)": 69.68, "step": 275, "train_speed(iter/s)": 0.050851 }, { "epoch": 1.4736842105263157, "grad_norm": 0.3282385891332036, "learning_rate": 3.2271842837425917e-06, "loss": 0.5990563869476319, "memory(GiB)": 69.68, "step": 280, "train_speed(iter/s)": 0.050922 }, { "epoch": 1.5, "grad_norm": 0.30485631330802376, "learning_rate": 2.9289321881345257e-06, "loss": 0.5840599060058593, "memory(GiB)": 69.68, "step": 285, "train_speed(iter/s)": 0.050976 }, { "epoch": 1.5157894736842106, "eval_loss": 0.7049282789230347, "eval_runtime": 9.8976, "eval_samples_per_second": 6.365, "eval_steps_per_second": 0.202, "eval_token_acc": 0.7940907619355945, "step": 288 }, { "epoch": 1.526315789473684, "grad_norm": 0.3541905065359078, "learning_rate": 2.642760893268684e-06, "loss": 0.6048089027404785, "memory(GiB)": 69.68, "step": 290, "train_speed(iter/s)": 0.050888 }, { "epoch": 1.5526315789473686, "grad_norm": 0.33886535203860046, "learning_rate": 2.369159318001937e-06, "loss": 0.6013635635375977, "memory(GiB)": 69.68, "step": 295, "train_speed(iter/s)": 0.050935 }, { "epoch": 1.5789473684210527, "grad_norm": 0.3560564200551741, "learning_rate": 2.1085949060360654e-06, "loss": 0.5912067413330078, "memory(GiB)": 69.68, "step": 300, "train_speed(iter/s)": 0.050982 }, { "epoch": 1.6, "eval_loss": 0.7051891088485718, "eval_runtime": 10.0734, "eval_samples_per_second": 6.254, "eval_steps_per_second": 0.199, "eval_token_acc": 0.7941740778122679, "step": 304 }, { "epoch": 1.6052631578947367, "grad_norm": 0.4101933630853145, "learning_rate": 1.861512827298051e-06, "loss": 0.6162192344665527, "memory(GiB)": 69.68, "step": 305, "train_speed(iter/s)": 0.05055 }, { "epoch": 1.631578947368421, "grad_norm": 0.35731263913788486, "learning_rate": 1.6283352173747148e-06, "loss": 0.5980302810668945, "memory(GiB)": 69.68, "step": 310, "train_speed(iter/s)": 0.050574 }, { "epoch": 1.6578947368421053, "grad_norm": 0.3254781881255832, "learning_rate": 1.409460456301147e-06, "loss": 0.6072481155395508, "memory(GiB)": 69.68, "step": 315, "train_speed(iter/s)": 0.050633 }, { "epoch": 1.6842105263157894, "grad_norm": 0.34740054250648605, "learning_rate": 1.2052624879351105e-06, "loss": 0.590502119064331, "memory(GiB)": 69.68, "step": 320, "train_speed(iter/s)": 0.050687 }, { "epoch": 1.6842105263157894, "eval_loss": 0.705406665802002, "eval_runtime": 9.9732, "eval_samples_per_second": 6.317, "eval_steps_per_second": 0.201, "eval_token_acc": 0.7945025804117233, "step": 320 }, { "epoch": 1.7105263157894737, "grad_norm": 0.3669361360683435, "learning_rate": 1.0160901810802114e-06, "loss": 0.6109643936157226, "memory(GiB)": 69.68, "step": 325, "train_speed(iter/s)": 0.050646 }, { "epoch": 1.736842105263158, "grad_norm": 0.35255244252893525, "learning_rate": 8.42266733449425e-07, "loss": 0.598551607131958, "memory(GiB)": 69.68, "step": 330, "train_speed(iter/s)": 0.050696 }, { "epoch": 1.763157894736842, "grad_norm": 0.40857052465226457, "learning_rate": 6.840891194872112e-07, "loss": 0.5822429656982422, "memory(GiB)": 69.68, "step": 335, "train_speed(iter/s)": 0.050739 }, { "epoch": 1.768421052631579, "eval_loss": 0.7043415307998657, "eval_runtime": 10.0333, "eval_samples_per_second": 6.279, "eval_steps_per_second": 0.199, "eval_token_acc": 0.7944002209060959, "step": 336 }, { "epoch": 1.7894736842105263, "grad_norm": 0.33686853385429094, "learning_rate": 5.418275829936537e-07, "loss": 0.5957964897155762, "memory(GiB)": 69.68, "step": 340, "train_speed(iter/s)": 0.05072 }, { "epoch": 1.8157894736842106, "grad_norm": 0.3347165001043132, "learning_rate": 4.1572517541747294e-07, "loss": 0.5949300765991211, "memory(GiB)": 69.68, "step": 345, "train_speed(iter/s)": 0.050761 }, { "epoch": 1.8421052631578947, "grad_norm": 0.3526011133437637, "learning_rate": 3.059973406066963e-07, "loss": 0.6076947689056397, "memory(GiB)": 69.68, "step": 350, "train_speed(iter/s)": 0.050779 }, { "epoch": 1.8526315789473684, "eval_loss": 0.7042702436447144, "eval_runtime": 9.8892, "eval_samples_per_second": 6.371, "eval_steps_per_second": 0.202, "eval_token_acc": 0.7943549922873303, "step": 352 }, { "epoch": 1.868421052631579, "grad_norm": 0.34715798268979603, "learning_rate": 2.1283154672645522e-07, "loss": 0.5999661445617676, "memory(GiB)": 69.68, "step": 355, "train_speed(iter/s)": 0.050733 }, { "epoch": 1.8947368421052633, "grad_norm": 0.3528812895767884, "learning_rate": 1.3638696597277678e-07, "loss": 0.583561372756958, "memory(GiB)": 69.68, "step": 360, "train_speed(iter/s)": 0.050768 }, { "epoch": 1.9210526315789473, "grad_norm": 0.330353105837426, "learning_rate": 7.679420262954984e-08, "loss": 0.5857907295227051, "memory(GiB)": 69.68, "step": 365, "train_speed(iter/s)": 0.050822 }, { "epoch": 1.936842105263158, "eval_loss": 0.7041947841644287, "eval_runtime": 9.9348, "eval_samples_per_second": 6.341, "eval_steps_per_second": 0.201, "eval_token_acc": 0.7944621127001962, "step": 368 }, { "epoch": 1.9473684210526314, "grad_norm": 0.3549687395322014, "learning_rate": 3.4155069933301535e-08, "loss": 0.6037016868591308, "memory(GiB)": 69.68, "step": 370, "train_speed(iter/s)": 0.050781 }, { "epoch": 1.973684210526316, "grad_norm": 0.3394111981962662, "learning_rate": 8.542416126989805e-09, "loss": 0.6164089202880859, "memory(GiB)": 69.68, "step": 375, "train_speed(iter/s)": 0.050825 }, { "epoch": 2.0, "grad_norm": 0.3439629679872151, "learning_rate": 0.0, "loss": 0.6200397491455079, "memory(GiB)": 69.68, "step": 380, "train_speed(iter/s)": 0.050867 }, { "epoch": 2.0, "eval_loss": 0.7041846513748169, "eval_runtime": 9.8952, "eval_samples_per_second": 6.367, "eval_steps_per_second": 0.202, "eval_token_acc": 0.7943787968235226, "step": 380 } ], "logging_steps": 5, "max_steps": 380, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2673065754361856.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }