Toucan-Qwen2.5-32B-Instruct-v0.1 / trainer_state.json
zhangchenxu's picture
Upload folder using huggingface_hub
c9bd9ac verified
raw
history blame
26.2 kB
{
"best_global_step": 192,
"best_metric": 0.70005822,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 16,
"global_step": 380,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005263157894736842,
"grad_norm": 9.152583101647197,
"learning_rate": 1.9999658256641746e-05,
"loss": 1.3120107650756836,
"memory(GiB)": 57.01,
"step": 1,
"train_speed(iter/s)": 0.036088
},
{
"epoch": 0.02631578947368421,
"grad_norm": 2.1260795018452656,
"learning_rate": 1.999145758387301e-05,
"loss": 0.9991766810417175,
"memory(GiB)": 57.01,
"step": 5,
"train_speed(iter/s)": 0.048355
},
{
"epoch": 0.05263157894736842,
"grad_norm": 0.7836553551546025,
"learning_rate": 1.99658449300667e-05,
"loss": 0.9186409950256348,
"memory(GiB)": 57.01,
"step": 10,
"train_speed(iter/s)": 0.049883
},
{
"epoch": 0.07894736842105263,
"grad_norm": 0.6412562781618624,
"learning_rate": 1.992320579737045e-05,
"loss": 0.8510271072387695,
"memory(GiB)": 57.01,
"step": 15,
"train_speed(iter/s)": 0.051154
},
{
"epoch": 0.08421052631578947,
"eval_loss": 0.7916765213012695,
"eval_runtime": 9.8416,
"eval_samples_per_second": 6.401,
"eval_steps_per_second": 0.203,
"eval_token_acc": 0.7726357334653692,
"step": 16
},
{
"epoch": 0.10526315789473684,
"grad_norm": 0.49696683676371833,
"learning_rate": 1.9863613034027224e-05,
"loss": 0.8826911926269532,
"memory(GiB)": 69.3,
"step": 20,
"train_speed(iter/s)": 0.050284
},
{
"epoch": 0.13157894736842105,
"grad_norm": 0.5207448076725332,
"learning_rate": 1.9787168453273546e-05,
"loss": 0.831389045715332,
"memory(GiB)": 69.3,
"step": 25,
"train_speed(iter/s)": 0.050966
},
{
"epoch": 0.15789473684210525,
"grad_norm": 0.5062200984947397,
"learning_rate": 1.9694002659393306e-05,
"loss": 0.8272812843322754,
"memory(GiB)": 69.3,
"step": 30,
"train_speed(iter/s)": 0.051443
},
{
"epoch": 0.16842105263157894,
"eval_loss": 0.7559030652046204,
"eval_runtime": 9.8411,
"eval_samples_per_second": 6.402,
"eval_steps_per_second": 0.203,
"eval_token_acc": 0.7801532059949344,
"step": 32
},
{
"epoch": 0.18421052631578946,
"grad_norm": 0.4939312053721415,
"learning_rate": 1.958427482458253e-05,
"loss": 0.8164405822753906,
"memory(GiB)": 69.37,
"step": 35,
"train_speed(iter/s)": 0.051059
},
{
"epoch": 0.21052631578947367,
"grad_norm": 0.7658040629902148,
"learning_rate": 1.9458172417006347e-05,
"loss": 0.8088149070739746,
"memory(GiB)": 69.37,
"step": 40,
"train_speed(iter/s)": 0.051441
},
{
"epoch": 0.23684210526315788,
"grad_norm": 0.4138057058676066,
"learning_rate": 1.9315910880512792e-05,
"loss": 0.7972204208374023,
"memory(GiB)": 69.37,
"step": 45,
"train_speed(iter/s)": 0.051684
},
{
"epoch": 0.25263157894736843,
"eval_loss": 0.7412319183349609,
"eval_runtime": 9.8493,
"eval_samples_per_second": 6.396,
"eval_steps_per_second": 0.203,
"eval_token_acc": 0.7837714954961817,
"step": 48
},
{
"epoch": 0.2631578947368421,
"grad_norm": 0.40921663738924585,
"learning_rate": 1.9157733266550577e-05,
"loss": 0.8212543487548828,
"memory(GiB)": 69.37,
"step": 50,
"train_speed(iter/s)": 0.051332
},
{
"epoch": 0.2894736842105263,
"grad_norm": 0.42411505277750844,
"learning_rate": 1.898390981891979e-05,
"loss": 0.820932674407959,
"memory(GiB)": 69.37,
"step": 55,
"train_speed(iter/s)": 0.051591
},
{
"epoch": 0.3157894736842105,
"grad_norm": 0.4372810679248111,
"learning_rate": 1.879473751206489e-05,
"loss": 0.7952181816101074,
"memory(GiB)": 69.37,
"step": 60,
"train_speed(iter/s)": 0.051797
},
{
"epoch": 0.3368421052631579,
"eval_loss": 0.7309797406196594,
"eval_runtime": 9.8337,
"eval_samples_per_second": 6.407,
"eval_steps_per_second": 0.203,
"eval_token_acc": 0.7857353697320562,
"step": 64
},
{
"epoch": 0.34210526315789475,
"grad_norm": 0.4228185459938873,
"learning_rate": 1.8590539543698852e-05,
"loss": 0.8058796882629394,
"memory(GiB)": 69.37,
"step": 65,
"train_speed(iter/s)": 0.051508
},
{
"epoch": 0.3684210526315789,
"grad_norm": 0.42601403175611896,
"learning_rate": 1.8371664782625287e-05,
"loss": 0.794715690612793,
"memory(GiB)": 69.37,
"step": 70,
"train_speed(iter/s)": 0.051591
},
{
"epoch": 0.39473684210526316,
"grad_norm": 0.4262322625536522,
"learning_rate": 1.813848717270195e-05,
"loss": 0.7831051349639893,
"memory(GiB)": 69.37,
"step": 75,
"train_speed(iter/s)": 0.051639
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.48084059766704834,
"learning_rate": 1.789140509396394e-05,
"loss": 0.77615327835083,
"memory(GiB)": 69.37,
"step": 80,
"train_speed(iter/s)": 0.051759
},
{
"epoch": 0.42105263157894735,
"eval_loss": 0.7247982621192932,
"eval_runtime": 9.8915,
"eval_samples_per_second": 6.369,
"eval_steps_per_second": 0.202,
"eval_token_acc": 0.7875040467711527,
"step": 80
},
{
"epoch": 0.4473684210526316,
"grad_norm": 0.4217600796086705,
"learning_rate": 1.7630840681998068e-05,
"loss": 0.7695322036743164,
"memory(GiB)": 69.37,
"step": 85,
"train_speed(iter/s)": 0.051515
},
{
"epoch": 0.47368421052631576,
"grad_norm": 0.4445281215969849,
"learning_rate": 1.735723910673132e-05,
"loss": 0.7695138454437256,
"memory(GiB)": 69.37,
"step": 90,
"train_speed(iter/s)": 0.051651
},
{
"epoch": 0.5,
"grad_norm": 0.49202631008863995,
"learning_rate": 1.7071067811865477e-05,
"loss": 0.7878283023834228,
"memory(GiB)": 69.37,
"step": 95,
"train_speed(iter/s)": 0.051798
},
{
"epoch": 0.5052631578947369,
"eval_loss": 0.7192742824554443,
"eval_runtime": 9.8387,
"eval_samples_per_second": 6.403,
"eval_steps_per_second": 0.203,
"eval_token_acc": 0.7890584829845175,
"step": 96
},
{
"epoch": 0.5263157894736842,
"grad_norm": 0.5136302863742109,
"learning_rate": 1.6772815716257414e-05,
"loss": 0.8238545417785644,
"memory(GiB)": 69.37,
"step": 100,
"train_speed(iter/s)": 0.051543
},
{
"epoch": 0.5526315789473685,
"grad_norm": 0.40357521887804787,
"learning_rate": 1.646299237860941e-05,
"loss": 0.7880066871643067,
"memory(GiB)": 69.37,
"step": 105,
"train_speed(iter/s)": 0.050576
},
{
"epoch": 0.5789473684210527,
"grad_norm": 0.42779690136435417,
"learning_rate": 1.6142127126896682e-05,
"loss": 0.7965791702270508,
"memory(GiB)": 69.37,
"step": 110,
"train_speed(iter/s)": 0.050695
},
{
"epoch": 0.5894736842105263,
"eval_loss": 0.7136698961257935,
"eval_runtime": 9.8339,
"eval_samples_per_second": 6.406,
"eval_steps_per_second": 0.203,
"eval_token_acc": 0.7901154043914609,
"step": 112
},
{
"epoch": 0.6052631578947368,
"grad_norm": 0.405474461817072,
"learning_rate": 1.5810768154019386e-05,
"loss": 0.7739765167236328,
"memory(GiB)": 69.68,
"step": 115,
"train_speed(iter/s)": 0.050597
},
{
"epoch": 0.631578947368421,
"grad_norm": 0.44011902270506637,
"learning_rate": 1.5469481581224274e-05,
"loss": 0.7681276321411132,
"memory(GiB)": 69.68,
"step": 120,
"train_speed(iter/s)": 0.050764
},
{
"epoch": 0.6578947368421053,
"grad_norm": 0.40246980138168537,
"learning_rate": 1.5118850490896012e-05,
"loss": 0.7708123683929443,
"memory(GiB)": 69.68,
"step": 125,
"train_speed(iter/s)": 0.050868
},
{
"epoch": 0.6736842105263158,
"eval_loss": 0.7104138135910034,
"eval_runtime": 9.7939,
"eval_samples_per_second": 6.433,
"eval_steps_per_second": 0.204,
"eval_token_acc": 0.790924758622003,
"step": 128
},
{
"epoch": 0.6842105263157895,
"grad_norm": 0.3791182451908356,
"learning_rate": 1.4759473930370738e-05,
"loss": 0.7606860160827636,
"memory(GiB)": 69.68,
"step": 130,
"train_speed(iter/s)": 0.05075
},
{
"epoch": 0.7105263157894737,
"grad_norm": 0.3794671895337737,
"learning_rate": 1.4391965888473705e-05,
"loss": 0.7771711349487305,
"memory(GiB)": 69.68,
"step": 135,
"train_speed(iter/s)": 0.050853
},
{
"epoch": 0.7368421052631579,
"grad_norm": 0.37709314975078,
"learning_rate": 1.4016954246529697e-05,
"loss": 0.7493702888488769,
"memory(GiB)": 69.68,
"step": 140,
"train_speed(iter/s)": 0.050943
},
{
"epoch": 0.7578947368421053,
"eval_loss": 0.706990122795105,
"eval_runtime": 9.807,
"eval_samples_per_second": 6.424,
"eval_steps_per_second": 0.204,
"eval_token_acc": 0.7914865456761441,
"step": 144
},
{
"epoch": 0.7631578947368421,
"grad_norm": 0.39953813706935337,
"learning_rate": 1.3635079705638298e-05,
"loss": 0.7736559391021729,
"memory(GiB)": 69.68,
"step": 145,
"train_speed(iter/s)": 0.050829
},
{
"epoch": 0.7894736842105263,
"grad_norm": 0.3830666041149983,
"learning_rate": 1.3246994692046837e-05,
"loss": 0.7789090156555176,
"memory(GiB)": 69.68,
"step": 150,
"train_speed(iter/s)": 0.050937
},
{
"epoch": 0.8157894736842105,
"grad_norm": 0.421924385981196,
"learning_rate": 1.2853362242491054e-05,
"loss": 0.7540519714355469,
"memory(GiB)": 69.68,
"step": 155,
"train_speed(iter/s)": 0.050961
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.4181935666985859,
"learning_rate": 1.2454854871407993e-05,
"loss": 0.7660489082336426,
"memory(GiB)": 69.68,
"step": 160,
"train_speed(iter/s)": 0.051056
},
{
"epoch": 0.8421052631578947,
"eval_loss": 0.7034122347831726,
"eval_runtime": 9.8416,
"eval_samples_per_second": 6.401,
"eval_steps_per_second": 0.203,
"eval_token_acc": 0.7925434670830873,
"step": 160
},
{
"epoch": 0.868421052631579,
"grad_norm": 0.404345099061113,
"learning_rate": 1.2052153421956343e-05,
"loss": 0.7705528259277343,
"memory(GiB)": 69.68,
"step": 165,
"train_speed(iter/s)": 0.050964
},
{
"epoch": 0.8947368421052632,
"grad_norm": 0.4116934893822773,
"learning_rate": 1.164594590280734e-05,
"loss": 0.7849107265472413,
"memory(GiB)": 69.68,
"step": 170,
"train_speed(iter/s)": 0.051044
},
{
"epoch": 0.9210526315789473,
"grad_norm": 0.4090058942775372,
"learning_rate": 1.123692631269348e-05,
"loss": 0.757164478302002,
"memory(GiB)": 69.68,
"step": 175,
"train_speed(iter/s)": 0.051102
},
{
"epoch": 0.9263157894736842,
"eval_loss": 0.7005300521850586,
"eval_runtime": 9.7897,
"eval_samples_per_second": 6.435,
"eval_steps_per_second": 0.204,
"eval_token_acc": 0.7931933309211403,
"step": 176
},
{
"epoch": 0.9473684210526315,
"grad_norm": 0.3661599756724907,
"learning_rate": 1.0825793454723325e-05,
"loss": 0.7720006942749024,
"memory(GiB)": 69.68,
"step": 180,
"train_speed(iter/s)": 0.051016
},
{
"epoch": 0.9736842105263158,
"grad_norm": 0.40343584878225963,
"learning_rate": 1.0413249742488132e-05,
"loss": 0.7824891090393067,
"memory(GiB)": 69.68,
"step": 185,
"train_speed(iter/s)": 0.051107
},
{
"epoch": 1.0,
"grad_norm": 0.41506034439679096,
"learning_rate": 1e-05,
"loss": 0.7296091079711914,
"memory(GiB)": 69.68,
"step": 190,
"train_speed(iter/s)": 0.051173
},
{
"epoch": 1.0105263157894737,
"eval_loss": 0.7000582218170166,
"eval_runtime": 9.8558,
"eval_samples_per_second": 6.392,
"eval_steps_per_second": 0.203,
"eval_token_acc": 0.7939455542648207,
"step": 192
},
{
"epoch": 1.0263157894736843,
"grad_norm": 0.38246407542421074,
"learning_rate": 9.586750257511868e-06,
"loss": 0.6210448265075683,
"memory(GiB)": 69.68,
"step": 195,
"train_speed(iter/s)": 0.05107
},
{
"epoch": 1.0526315789473684,
"grad_norm": 0.4086238383459442,
"learning_rate": 9.174206545276678e-06,
"loss": 0.6003309726715088,
"memory(GiB)": 69.68,
"step": 200,
"train_speed(iter/s)": 0.051127
},
{
"epoch": 1.0789473684210527,
"grad_norm": 0.5181020519262887,
"learning_rate": 8.763073687306523e-06,
"loss": 0.607297945022583,
"memory(GiB)": 69.68,
"step": 205,
"train_speed(iter/s)": 0.050639
},
{
"epoch": 1.0947368421052632,
"eval_loss": 0.7147046327590942,
"eval_runtime": 10.3765,
"eval_samples_per_second": 6.071,
"eval_steps_per_second": 0.193,
"eval_token_acc": 0.7924292053093638,
"step": 208
},
{
"epoch": 1.1052631578947367,
"grad_norm": 0.37648543638620585,
"learning_rate": 8.35405409719266e-06,
"loss": 0.6064910888671875,
"memory(GiB)": 69.68,
"step": 210,
"train_speed(iter/s)": 0.050556
},
{
"epoch": 1.131578947368421,
"grad_norm": 0.49314042147480297,
"learning_rate": 7.947846578043658e-06,
"loss": 0.5927825927734375,
"memory(GiB)": 69.68,
"step": 215,
"train_speed(iter/s)": 0.050629
},
{
"epoch": 1.1578947368421053,
"grad_norm": 0.3822092612971886,
"learning_rate": 7.545145128592009e-06,
"loss": 0.5842184543609619,
"memory(GiB)": 69.68,
"step": 220,
"train_speed(iter/s)": 0.050696
},
{
"epoch": 1.1789473684210527,
"eval_loss": 0.7113193273544312,
"eval_runtime": 9.9005,
"eval_samples_per_second": 6.363,
"eval_steps_per_second": 0.202,
"eval_token_acc": 0.7929076764868314,
"step": 224
},
{
"epoch": 1.1842105263157894,
"grad_norm": 0.4045395338102251,
"learning_rate": 7.14663775750895e-06,
"loss": 0.6170342445373536,
"memory(GiB)": 69.68,
"step": 225,
"train_speed(iter/s)": 0.050639
},
{
"epoch": 1.2105263157894737,
"grad_norm": 0.3917516152680252,
"learning_rate": 6.7530053079531664e-06,
"loss": 0.6077111244201661,
"memory(GiB)": 69.68,
"step": 230,
"train_speed(iter/s)": 0.0507
},
{
"epoch": 1.236842105263158,
"grad_norm": 0.37092803642068417,
"learning_rate": 6.364920294361701e-06,
"loss": 0.5995924472808838,
"memory(GiB)": 69.68,
"step": 235,
"train_speed(iter/s)": 0.050762
},
{
"epoch": 1.263157894736842,
"grad_norm": 0.3904513301842613,
"learning_rate": 5.983045753470308e-06,
"loss": 0.5835226058959961,
"memory(GiB)": 69.68,
"step": 240,
"train_speed(iter/s)": 0.050813
},
{
"epoch": 1.263157894736842,
"eval_loss": 0.7080445289611816,
"eval_runtime": 9.9199,
"eval_samples_per_second": 6.351,
"eval_steps_per_second": 0.202,
"eval_token_acc": 0.7932052331892365,
"step": 240
},
{
"epoch": 1.2894736842105263,
"grad_norm": 0.3419804528738108,
"learning_rate": 5.608034111526298e-06,
"loss": 0.5964583396911621,
"memory(GiB)": 69.68,
"step": 245,
"train_speed(iter/s)": 0.05074
},
{
"epoch": 1.3157894736842106,
"grad_norm": 0.3778594571130598,
"learning_rate": 5.240526069629265e-06,
"loss": 0.6174930572509766,
"memory(GiB)": 69.68,
"step": 250,
"train_speed(iter/s)": 0.050809
},
{
"epoch": 1.3421052631578947,
"grad_norm": 0.3454812999080356,
"learning_rate": 4.881149509103993e-06,
"loss": 0.6054057121276856,
"memory(GiB)": 69.68,
"step": 255,
"train_speed(iter/s)": 0.050865
},
{
"epoch": 1.3473684210526315,
"eval_loss": 0.7069370746612549,
"eval_runtime": 9.9384,
"eval_samples_per_second": 6.339,
"eval_steps_per_second": 0.201,
"eval_token_acc": 0.7936860848203233,
"step": 256
},
{
"epoch": 1.368421052631579,
"grad_norm": 0.3459021630061238,
"learning_rate": 4.530518418775734e-06,
"loss": 0.6047042846679688,
"memory(GiB)": 69.68,
"step": 260,
"train_speed(iter/s)": 0.050799
},
{
"epoch": 1.3947368421052633,
"grad_norm": 0.3670462674615966,
"learning_rate": 4.189231845980618e-06,
"loss": 0.5852503776550293,
"memory(GiB)": 69.68,
"step": 265,
"train_speed(iter/s)": 0.050844
},
{
"epoch": 1.4210526315789473,
"grad_norm": 0.36764701477111333,
"learning_rate": 3.857872873103322e-06,
"loss": 0.6259764671325684,
"memory(GiB)": 69.68,
"step": 270,
"train_speed(iter/s)": 0.050909
},
{
"epoch": 1.431578947368421,
"eval_loss": 0.7064932584762573,
"eval_runtime": 9.8662,
"eval_samples_per_second": 6.385,
"eval_steps_per_second": 0.203,
"eval_token_acc": 0.7939836415227286,
"step": 272
},
{
"epoch": 1.4473684210526316,
"grad_norm": 0.3707913076135043,
"learning_rate": 3.5370076213905904e-06,
"loss": 0.5928922653198242,
"memory(GiB)": 69.68,
"step": 275,
"train_speed(iter/s)": 0.050851
},
{
"epoch": 1.4736842105263157,
"grad_norm": 0.3282385891332036,
"learning_rate": 3.2271842837425917e-06,
"loss": 0.5990563869476319,
"memory(GiB)": 69.68,
"step": 280,
"train_speed(iter/s)": 0.050922
},
{
"epoch": 1.5,
"grad_norm": 0.30485631330802376,
"learning_rate": 2.9289321881345257e-06,
"loss": 0.5840599060058593,
"memory(GiB)": 69.68,
"step": 285,
"train_speed(iter/s)": 0.050976
},
{
"epoch": 1.5157894736842106,
"eval_loss": 0.7049282789230347,
"eval_runtime": 9.8976,
"eval_samples_per_second": 6.365,
"eval_steps_per_second": 0.202,
"eval_token_acc": 0.7940907619355945,
"step": 288
},
{
"epoch": 1.526315789473684,
"grad_norm": 0.3541905065359078,
"learning_rate": 2.642760893268684e-06,
"loss": 0.6048089027404785,
"memory(GiB)": 69.68,
"step": 290,
"train_speed(iter/s)": 0.050888
},
{
"epoch": 1.5526315789473686,
"grad_norm": 0.33886535203860046,
"learning_rate": 2.369159318001937e-06,
"loss": 0.6013635635375977,
"memory(GiB)": 69.68,
"step": 295,
"train_speed(iter/s)": 0.050935
},
{
"epoch": 1.5789473684210527,
"grad_norm": 0.3560564200551741,
"learning_rate": 2.1085949060360654e-06,
"loss": 0.5912067413330078,
"memory(GiB)": 69.68,
"step": 300,
"train_speed(iter/s)": 0.050982
},
{
"epoch": 1.6,
"eval_loss": 0.7051891088485718,
"eval_runtime": 10.0734,
"eval_samples_per_second": 6.254,
"eval_steps_per_second": 0.199,
"eval_token_acc": 0.7941740778122679,
"step": 304
},
{
"epoch": 1.6052631578947367,
"grad_norm": 0.4101933630853145,
"learning_rate": 1.861512827298051e-06,
"loss": 0.6162192344665527,
"memory(GiB)": 69.68,
"step": 305,
"train_speed(iter/s)": 0.05055
},
{
"epoch": 1.631578947368421,
"grad_norm": 0.35731263913788486,
"learning_rate": 1.6283352173747148e-06,
"loss": 0.5980302810668945,
"memory(GiB)": 69.68,
"step": 310,
"train_speed(iter/s)": 0.050574
},
{
"epoch": 1.6578947368421053,
"grad_norm": 0.3254781881255832,
"learning_rate": 1.409460456301147e-06,
"loss": 0.6072481155395508,
"memory(GiB)": 69.68,
"step": 315,
"train_speed(iter/s)": 0.050633
},
{
"epoch": 1.6842105263157894,
"grad_norm": 0.34740054250648605,
"learning_rate": 1.2052624879351105e-06,
"loss": 0.590502119064331,
"memory(GiB)": 69.68,
"step": 320,
"train_speed(iter/s)": 0.050687
},
{
"epoch": 1.6842105263157894,
"eval_loss": 0.705406665802002,
"eval_runtime": 9.9732,
"eval_samples_per_second": 6.317,
"eval_steps_per_second": 0.201,
"eval_token_acc": 0.7945025804117233,
"step": 320
},
{
"epoch": 1.7105263157894737,
"grad_norm": 0.3669361360683435,
"learning_rate": 1.0160901810802114e-06,
"loss": 0.6109643936157226,
"memory(GiB)": 69.68,
"step": 325,
"train_speed(iter/s)": 0.050646
},
{
"epoch": 1.736842105263158,
"grad_norm": 0.35255244252893525,
"learning_rate": 8.42266733449425e-07,
"loss": 0.598551607131958,
"memory(GiB)": 69.68,
"step": 330,
"train_speed(iter/s)": 0.050696
},
{
"epoch": 1.763157894736842,
"grad_norm": 0.40857052465226457,
"learning_rate": 6.840891194872112e-07,
"loss": 0.5822429656982422,
"memory(GiB)": 69.68,
"step": 335,
"train_speed(iter/s)": 0.050739
},
{
"epoch": 1.768421052631579,
"eval_loss": 0.7043415307998657,
"eval_runtime": 10.0333,
"eval_samples_per_second": 6.279,
"eval_steps_per_second": 0.199,
"eval_token_acc": 0.7944002209060959,
"step": 336
},
{
"epoch": 1.7894736842105263,
"grad_norm": 0.33686853385429094,
"learning_rate": 5.418275829936537e-07,
"loss": 0.5957964897155762,
"memory(GiB)": 69.68,
"step": 340,
"train_speed(iter/s)": 0.05072
},
{
"epoch": 1.8157894736842106,
"grad_norm": 0.3347165001043132,
"learning_rate": 4.1572517541747294e-07,
"loss": 0.5949300765991211,
"memory(GiB)": 69.68,
"step": 345,
"train_speed(iter/s)": 0.050761
},
{
"epoch": 1.8421052631578947,
"grad_norm": 0.3526011133437637,
"learning_rate": 3.059973406066963e-07,
"loss": 0.6076947689056397,
"memory(GiB)": 69.68,
"step": 350,
"train_speed(iter/s)": 0.050779
},
{
"epoch": 1.8526315789473684,
"eval_loss": 0.7042702436447144,
"eval_runtime": 9.8892,
"eval_samples_per_second": 6.371,
"eval_steps_per_second": 0.202,
"eval_token_acc": 0.7943549922873303,
"step": 352
},
{
"epoch": 1.868421052631579,
"grad_norm": 0.34715798268979603,
"learning_rate": 2.1283154672645522e-07,
"loss": 0.5999661445617676,
"memory(GiB)": 69.68,
"step": 355,
"train_speed(iter/s)": 0.050733
},
{
"epoch": 1.8947368421052633,
"grad_norm": 0.3528812895767884,
"learning_rate": 1.3638696597277678e-07,
"loss": 0.583561372756958,
"memory(GiB)": 69.68,
"step": 360,
"train_speed(iter/s)": 0.050768
},
{
"epoch": 1.9210526315789473,
"grad_norm": 0.330353105837426,
"learning_rate": 7.679420262954984e-08,
"loss": 0.5857907295227051,
"memory(GiB)": 69.68,
"step": 365,
"train_speed(iter/s)": 0.050822
},
{
"epoch": 1.936842105263158,
"eval_loss": 0.7041947841644287,
"eval_runtime": 9.9348,
"eval_samples_per_second": 6.341,
"eval_steps_per_second": 0.201,
"eval_token_acc": 0.7944621127001962,
"step": 368
},
{
"epoch": 1.9473684210526314,
"grad_norm": 0.3549687395322014,
"learning_rate": 3.4155069933301535e-08,
"loss": 0.6037016868591308,
"memory(GiB)": 69.68,
"step": 370,
"train_speed(iter/s)": 0.050781
},
{
"epoch": 1.973684210526316,
"grad_norm": 0.3394111981962662,
"learning_rate": 8.542416126989805e-09,
"loss": 0.6164089202880859,
"memory(GiB)": 69.68,
"step": 375,
"train_speed(iter/s)": 0.050825
},
{
"epoch": 2.0,
"grad_norm": 0.3439629679872151,
"learning_rate": 0.0,
"loss": 0.6200397491455079,
"memory(GiB)": 69.68,
"step": 380,
"train_speed(iter/s)": 0.050867
},
{
"epoch": 2.0,
"eval_loss": 0.7041846513748169,
"eval_runtime": 9.8952,
"eval_samples_per_second": 6.367,
"eval_steps_per_second": 0.202,
"eval_token_acc": 0.7943787968235226,
"step": 380
}
],
"logging_steps": 5,
"max_steps": 380,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2673065754361856.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}