ErrorAI committed
Commit bb018df · verified · 1 Parent(s): f21ea7d

Training in progress, step 553, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b213ff613aa4d225b5aebb407711c2c1f75037b6993c7723f7f9337e984fcf37
+ oid sha256:b677e4bf5c8e6273a99bda7d3d2c0afce4b991f97e4ce838664059bb222700c6
  size 125918320
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6ff0efbdaf1525ac3ba4b42e957345b41a0771671f527103ec00de1122755ab3
+ oid sha256:5fea8135af20d4c63d861a4c8774fe66918b6f8f8d4020335bf9dae0a8919ad8
  size 64684244
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f9d3085fcb96b042d4a5c8a2af90988d91e76b0d2c9417d36d7d8547d31c5122
+ oid sha256:74efbaaf0803a77ee24332433c97b4c4efa9d7113b5aa6975ff10a0c932be806
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3858aeabcc7bea20553f1848582bcfd9209b90f240be2ce1e8e02e28e10d1519
+ oid sha256:b4b0627147991988ca81f95580a67cfcca6614bdcbd8a39f73ebb79a57cc0418
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.7544097693351425,
  "eval_steps": 500,
- "global_step": 417,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -2926,6 +2926,966 @@
  "learning_rate": 1.444255676716637e-05,
  "loss": 3.0206,
  "step": 417
  }
  ],
  "logging_steps": 1,
@@ -2940,12 +3900,12 @@
  "should_evaluate": false,
  "should_log": false,
  "should_save": true,
- "should_training_stop": false
  },
  "attributes": {}
  }
  },
- "total_flos": 4.841700083399393e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
 
  {
  "best_metric": null,
  "best_model_checkpoint": null,
+ "epoch": 1.0009045680687472,
  "eval_steps": 500,
+ "global_step": 553,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
 
2926
  "learning_rate": 1.444255676716637e-05,
2927
  "loss": 3.0206,
2928
  "step": 417
2929
+ },
2930
+ {
2931
+ "epoch": 0.7562189054726368,
2932
+ "grad_norm": 6.555543422698975,
2933
+ "learning_rate": 1.424162119926336e-05,
2934
+ "loss": 3.9264,
2935
+ "step": 418
2936
+ },
2937
+ {
2938
+ "epoch": 0.7580280416101312,
2939
+ "grad_norm": 6.464334487915039,
2940
+ "learning_rate": 1.40418608407689e-05,
2941
+ "loss": 3.7147,
2942
+ "step": 419
2943
+ },
2944
+ {
2945
+ "epoch": 0.7598371777476255,
2946
+ "grad_norm": 6.810556411743164,
2947
+ "learning_rate": 1.3843282256864599e-05,
2948
+ "loss": 3.7548,
2949
+ "step": 420
2950
+ },
2951
+ {
2952
+ "epoch": 0.7616463138851198,
2953
+ "grad_norm": 6.693656921386719,
2954
+ "learning_rate": 1.3645891973892772e-05,
2955
+ "loss": 3.8381,
2956
+ "step": 421
2957
+ },
2958
+ {
2959
+ "epoch": 0.7634554500226142,
2960
+ "grad_norm": 6.932521820068359,
2961
+ "learning_rate": 1.3449696479141854e-05,
2962
+ "loss": 4.0131,
2963
+ "step": 422
2964
+ },
2965
+ {
2966
+ "epoch": 0.7652645861601085,
2967
+ "grad_norm": 6.466965675354004,
2968
+ "learning_rate": 1.325470222063327e-05,
2969
+ "loss": 3.4764,
2970
+ "step": 423
2971
+ },
2972
+ {
2973
+ "epoch": 0.7670737222976028,
2974
+ "grad_norm": 7.379868507385254,
2975
+ "learning_rate": 1.3060915606909413e-05,
2976
+ "loss": 3.9783,
2977
+ "step": 424
2978
+ },
2979
+ {
2980
+ "epoch": 0.7688828584350973,
2981
+ "grad_norm": 7.2505083084106445,
2982
+ "learning_rate": 1.2868343006823114e-05,
2983
+ "loss": 3.2758,
2984
+ "step": 425
2985
+ },
2986
+ {
2987
+ "epoch": 0.7706919945725916,
2988
+ "grad_norm": 6.697902202606201,
2989
+ "learning_rate": 1.2676990749328254e-05,
2990
+ "loss": 3.3263,
2991
+ "step": 426
2992
+ },
2993
+ {
2994
+ "epoch": 0.772501130710086,
2995
+ "grad_norm": 7.053414821624756,
2996
+ "learning_rate": 1.2486865123271868e-05,
2997
+ "loss": 3.518,
2998
+ "step": 427
2999
+ },
3000
+ {
3001
+ "epoch": 0.7743102668475803,
3002
+ "grad_norm": 7.461160182952881,
3003
+ "learning_rate": 1.2297972377187361e-05,
3004
+ "loss": 3.8753,
3005
+ "step": 428
3006
+ },
3007
+ {
3008
+ "epoch": 0.7761194029850746,
3009
+ "grad_norm": 7.320030212402344,
3010
+ "learning_rate": 1.2110318719089158e-05,
3011
+ "loss": 3.9377,
3012
+ "step": 429
3013
+ },
3014
+ {
3015
+ "epoch": 0.777928539122569,
3016
+ "grad_norm": 6.707526683807373,
3017
+ "learning_rate": 1.1923910316268782e-05,
3018
+ "loss": 3.577,
3019
+ "step": 430
3020
+ },
3021
+ {
3022
+ "epoch": 0.7797376752600633,
3023
+ "grad_norm": 7.633321285247803,
3024
+ "learning_rate": 1.1738753295091986e-05,
3025
+ "loss": 3.8751,
3026
+ "step": 431
3027
+ },
3028
+ {
3029
+ "epoch": 0.7815468113975577,
3030
+ "grad_norm": 7.908304691314697,
3031
+ "learning_rate": 1.1554853740797555e-05,
3032
+ "loss": 3.6976,
3033
+ "step": 432
3034
+ },
3035
+ {
3036
+ "epoch": 0.783355947535052,
3037
+ "grad_norm": 7.982555866241455,
3038
+ "learning_rate": 1.1372217697297249e-05,
3039
+ "loss": 3.6444,
3040
+ "step": 433
3041
+ },
3042
+ {
3043
+ "epoch": 0.7851650836725463,
3044
+ "grad_norm": 7.220830917358398,
3045
+ "learning_rate": 1.1190851166977217e-05,
3046
+ "loss": 3.5019,
3047
+ "step": 434
3048
+ },
3049
+ {
3050
+ "epoch": 0.7869742198100407,
3051
+ "grad_norm": 6.941662311553955,
3052
+ "learning_rate": 1.101076011050065e-05,
3053
+ "loss": 3.5344,
3054
+ "step": 435
3055
+ },
3056
+ {
3057
+ "epoch": 0.788783355947535,
3058
+ "grad_norm": 6.613343715667725,
3059
+ "learning_rate": 1.0831950446611949e-05,
3060
+ "loss": 3.4078,
3061
+ "step": 436
3062
+ },
3063
+ {
3064
+ "epoch": 0.7905924920850294,
3065
+ "grad_norm": 7.152531147003174,
3066
+ "learning_rate": 1.065442805194214e-05,
3067
+ "loss": 3.7873,
3068
+ "step": 437
3069
+ },
3070
+ {
3071
+ "epoch": 0.7924016282225237,
3072
+ "grad_norm": 6.536508083343506,
3073
+ "learning_rate": 1.0478198760815832e-05,
3074
+ "loss": 2.7608,
3075
+ "step": 438
3076
+ },
3077
+ {
3078
+ "epoch": 0.7942107643600181,
3079
+ "grad_norm": 6.4156904220581055,
3080
+ "learning_rate": 1.0303268365059382e-05,
3081
+ "loss": 2.9791,
3082
+ "step": 439
3083
+ },
3084
+ {
3085
+ "epoch": 0.7960199004975125,
3086
+ "grad_norm": 5.869282245635986,
3087
+ "learning_rate": 1.0129642613810576e-05,
3088
+ "loss": 3.5902,
3089
+ "step": 440
3090
+ },
3091
+ {
3092
+ "epoch": 0.7978290366350068,
3093
+ "grad_norm": 6.527062892913818,
3094
+ "learning_rate": 9.957327213329687e-06,
3095
+ "loss": 5.1202,
3096
+ "step": 441
3097
+ },
3098
+ {
3099
+ "epoch": 0.7996381727725012,
3100
+ "grad_norm": 9.36404800415039,
3101
+ "learning_rate": 9.786327826811942e-06,
3102
+ "loss": 5.934,
3103
+ "step": 442
3104
+ },
3105
+ {
3106
+ "epoch": 0.8014473089099955,
3107
+ "grad_norm": 8.085516929626465,
3108
+ "learning_rate": 9.616650074201383e-06,
3109
+ "loss": 5.8671,
3110
+ "step": 443
3111
+ },
3112
+ {
3113
+ "epoch": 0.8032564450474898,
3114
+ "grad_norm": 8.88477611541748,
3115
+ "learning_rate": 9.448299532006149e-06,
3116
+ "loss": 6.4374,
3117
+ "step": 444
3118
+ },
3119
+ {
3120
+ "epoch": 0.8050655811849842,
3121
+ "grad_norm": 10.586947441101074,
3122
+ "learning_rate": 9.281281733115288e-06,
3123
+ "loss": 5.8352,
3124
+ "step": 445
3125
+ },
3126
+ {
3127
+ "epoch": 0.8068747173224785,
3128
+ "grad_norm": 9.525203704833984,
3129
+ "learning_rate": 9.115602166616805e-06,
3130
+ "loss": 5.6282,
3131
+ "step": 446
3132
+ },
3133
+ {
3134
+ "epoch": 0.8086838534599728,
3135
+ "grad_norm": 8.886335372924805,
3136
+ "learning_rate": 8.951266277617326e-06,
3137
+ "loss": 4.3723,
3138
+ "step": 447
3139
+ },
3140
+ {
3141
+ "epoch": 0.8104929895974672,
3142
+ "grad_norm": 9.162277221679688,
3143
+ "learning_rate": 8.78827946706311e-06,
3144
+ "loss": 4.5638,
3145
+ "step": 448
3146
+ },
3147
+ {
3148
+ "epoch": 0.8123021257349615,
3149
+ "grad_norm": 11.042312622070312,
3150
+ "learning_rate": 8.626647091562612e-06,
3151
+ "loss": 5.3548,
3152
+ "step": 449
3153
+ },
3154
+ {
3155
+ "epoch": 0.8141112618724559,
3156
+ "grad_norm": 15.882914543151855,
3157
+ "learning_rate": 8.466374463210346e-06,
3158
+ "loss": 4.3994,
3159
+ "step": 450
3160
+ },
3161
+ {
3162
+ "epoch": 0.8159203980099502,
3163
+ "grad_norm": 6.315022945404053,
3164
+ "learning_rate": 8.307466849412366e-06,
3165
+ "loss": 3.6107,
3166
+ "step": 451
3167
+ },
3168
+ {
3169
+ "epoch": 0.8177295341474446,
3170
+ "grad_norm": 6.3623175621032715,
3171
+ "learning_rate": 8.149929472713125e-06,
3172
+ "loss": 3.3937,
3173
+ "step": 452
3174
+ },
3175
+ {
3176
+ "epoch": 0.819538670284939,
3177
+ "grad_norm": 8.390593528747559,
3178
+ "learning_rate": 7.993767510623834e-06,
3179
+ "loss": 4.0698,
3180
+ "step": 453
3181
+ },
3182
+ {
3183
+ "epoch": 0.8213478064224333,
3184
+ "grad_norm": 8.719853401184082,
3185
+ "learning_rate": 7.838986095452311e-06,
3186
+ "loss": 3.5636,
3187
+ "step": 454
3188
+ },
3189
+ {
3190
+ "epoch": 0.8231569425599277,
3191
+ "grad_norm": 8.96474838256836,
3192
+ "learning_rate": 7.685590314134294e-06,
3193
+ "loss": 3.81,
3194
+ "step": 455
3195
+ },
3196
+ {
3197
+ "epoch": 0.824966078697422,
3198
+ "grad_norm": 7.717404842376709,
3199
+ "learning_rate": 7.533585208066301e-06,
3200
+ "loss": 3.1148,
3201
+ "step": 456
3202
+ },
3203
+ {
3204
+ "epoch": 0.8267752148349163,
3205
+ "grad_norm": 6.736077308654785,
3206
+ "learning_rate": 7.382975772939865e-06,
3207
+ "loss": 2.6002,
3208
+ "step": 457
3209
+ },
3210
+ {
3211
+ "epoch": 0.8285843509724107,
3212
+ "grad_norm": 6.166501998901367,
3213
+ "learning_rate": 7.2337669585774205e-06,
3214
+ "loss": 2.667,
3215
+ "step": 458
3216
+ },
3217
+ {
3218
+ "epoch": 0.830393487109905,
3219
+ "grad_norm": 6.2878851890563965,
3220
+ "learning_rate": 7.085963668769552e-06,
3221
+ "loss": 2.9261,
3222
+ "step": 459
3223
+ },
3224
+ {
3225
+ "epoch": 0.8322026232473994,
3226
+ "grad_norm": 6.576757431030273,
3227
+ "learning_rate": 6.939570761113939e-06,
3228
+ "loss": 3.0651,
3229
+ "step": 460
3230
+ },
3231
+ {
3232
+ "epoch": 0.8340117593848937,
3233
+ "grad_norm": 7.321299076080322,
3234
+ "learning_rate": 6.794593046855613e-06,
3235
+ "loss": 3.4093,
3236
+ "step": 461
3237
+ },
3238
+ {
3239
+ "epoch": 0.835820895522388,
3240
+ "grad_norm": 6.695417404174805,
3241
+ "learning_rate": 6.651035290728858e-06,
3242
+ "loss": 3.0874,
3243
+ "step": 462
3244
+ },
3245
+ {
3246
+ "epoch": 0.8376300316598824,
3247
+ "grad_norm": 6.7737956047058105,
3248
+ "learning_rate": 6.508902210800649e-06,
3249
+ "loss": 3.2408,
3250
+ "step": 463
3251
+ },
3252
+ {
3253
+ "epoch": 0.8394391677973767,
3254
+ "grad_norm": 6.985015869140625,
3255
+ "learning_rate": 6.36819847831554e-06,
3256
+ "loss": 3.3483,
3257
+ "step": 464
3258
+ },
3259
+ {
3260
+ "epoch": 0.841248303934871,
3261
+ "grad_norm": 7.275407791137695,
3262
+ "learning_rate": 6.228928717542204e-06,
3263
+ "loss": 3.6385,
3264
+ "step": 465
3265
+ },
3266
+ {
3267
+ "epoch": 0.8430574400723655,
3268
+ "grad_norm": 7.622275352478027,
3269
+ "learning_rate": 6.091097505621374e-06,
3270
+ "loss": 3.5877,
3271
+ "step": 466
3272
+ },
3273
+ {
3274
+ "epoch": 0.8448665762098598,
3275
+ "grad_norm": 7.0419840812683105,
3276
+ "learning_rate": 5.9547093724155235e-06,
3277
+ "loss": 3.6186,
3278
+ "step": 467
3279
+ },
3280
+ {
3281
+ "epoch": 0.8466757123473542,
3282
+ "grad_norm": 6.253328800201416,
3283
+ "learning_rate": 5.8197688003598815e-06,
3284
+ "loss": 3.0429,
3285
+ "step": 468
3286
+ },
3287
+ {
3288
+ "epoch": 0.8484848484848485,
3289
+ "grad_norm": 7.221704006195068,
3290
+ "learning_rate": 5.686280224315188e-06,
3291
+ "loss": 3.9403,
3292
+ "step": 469
3293
+ },
3294
+ {
3295
+ "epoch": 0.8502939846223428,
3296
+ "grad_norm": 6.812623977661133,
3297
+ "learning_rate": 5.554248031421871e-06,
3298
+ "loss": 3.6381,
3299
+ "step": 470
3300
+ },
3301
+ {
3302
+ "epoch": 0.8521031207598372,
3303
+ "grad_norm": 6.868687152862549,
3304
+ "learning_rate": 5.423676560955976e-06,
3305
+ "loss": 3.789,
3306
+ "step": 471
3307
+ },
3308
+ {
3309
+ "epoch": 0.8539122568973315,
3310
+ "grad_norm": 6.682875156402588,
3311
+ "learning_rate": 5.294570104186436e-06,
3312
+ "loss": 3.7681,
3313
+ "step": 472
3314
+ },
3315
+ {
3316
+ "epoch": 0.8557213930348259,
3317
+ "grad_norm": 6.816806793212891,
3318
+ "learning_rate": 5.166932904234101e-06,
3319
+ "loss": 3.6686,
3320
+ "step": 473
3321
+ },
3322
+ {
3323
+ "epoch": 0.8575305291723202,
3324
+ "grad_norm": 6.691549301147461,
3325
+ "learning_rate": 5.040769155932284e-06,
3326
+ "loss": 3.9365,
3327
+ "step": 474
3328
+ },
3329
+ {
3330
+ "epoch": 0.8593396653098145,
3331
+ "grad_norm": 7.533728122711182,
3332
+ "learning_rate": 4.916083005688865e-06,
3333
+ "loss": 3.7974,
3334
+ "step": 475
3335
+ },
3336
+ {
3337
+ "epoch": 0.8611488014473089,
3338
+ "grad_norm": 6.8411993980407715,
3339
+ "learning_rate": 4.792878551350055e-06,
3340
+ "loss": 3.7125,
3341
+ "step": 476
3342
+ },
3343
+ {
3344
+ "epoch": 0.8629579375848032,
3345
+ "grad_norm": 6.767866611480713,
3346
+ "learning_rate": 4.671159842065698e-06,
3347
+ "loss": 3.4574,
3348
+ "step": 477
3349
+ },
3350
+ {
3351
+ "epoch": 0.8647670737222976,
3352
+ "grad_norm": 7.180441856384277,
3353
+ "learning_rate": 4.550930878156185e-06,
3354
+ "loss": 4.3838,
3355
+ "step": 478
3356
+ },
3357
+ {
3358
+ "epoch": 0.866576209859792,
3359
+ "grad_norm": 6.1762237548828125,
3360
+ "learning_rate": 4.432195610981032e-06,
3361
+ "loss": 3.9066,
3362
+ "step": 479
3363
+ },
3364
+ {
3365
+ "epoch": 0.8683853459972863,
3366
+ "grad_norm": 6.982791423797607,
3367
+ "learning_rate": 4.314957942808956e-06,
3368
+ "loss": 3.5071,
3369
+ "step": 480
3370
+ },
3371
+ {
3372
+ "epoch": 0.8701944821347807,
3373
+ "grad_norm": 7.1507368087768555,
3374
+ "learning_rate": 4.199221726689634e-06,
3375
+ "loss": 4.2304,
3376
+ "step": 481
3377
+ },
3378
+ {
3379
+ "epoch": 0.872003618272275,
3380
+ "grad_norm": 6.495567321777344,
3381
+ "learning_rate": 4.084990766327135e-06,
3382
+ "loss": 3.4303,
3383
+ "step": 482
3384
+ },
3385
+ {
3386
+ "epoch": 0.8738127544097694,
3387
+ "grad_norm": 6.444892406463623,
3388
+ "learning_rate": 3.972268815954832e-06,
3389
+ "loss": 3.7368,
3390
+ "step": 483
3391
+ },
3392
+ {
3393
+ "epoch": 0.8756218905472637,
3394
+ "grad_norm": 6.858259201049805,
3395
+ "learning_rate": 3.861059580212056e-06,
3396
+ "loss": 3.6686,
3397
+ "step": 484
3398
+ },
3399
+ {
3400
+ "epoch": 0.877431026684758,
3401
+ "grad_norm": 6.297997951507568,
3402
+ "learning_rate": 3.7513667140223417e-06,
3403
+ "loss": 3.3313,
3404
+ "step": 485
3405
+ },
3406
+ {
3407
+ "epoch": 0.8792401628222524,
3408
+ "grad_norm": 6.799905300140381,
3409
+ "learning_rate": 3.6431938224733008e-06,
3410
+ "loss": 3.4037,
3411
+ "step": 486
3412
+ },
3413
+ {
3414
+ "epoch": 0.8810492989597467,
3415
+ "grad_norm": 7.7271318435668945,
3416
+ "learning_rate": 3.5365444606981435e-06,
3417
+ "loss": 3.579,
3418
+ "step": 487
3419
+ },
3420
+ {
3421
+ "epoch": 0.882858435097241,
3422
+ "grad_norm": 6.616701126098633,
3423
+ "learning_rate": 3.4314221337588217e-06,
3424
+ "loss": 2.8946,
3425
+ "step": 488
3426
+ },
3427
+ {
3428
+ "epoch": 0.8846675712347354,
3429
+ "grad_norm": 6.063207626342773,
3430
+ "learning_rate": 3.3278302965308596e-06,
3431
+ "loss": 3.6569,
3432
+ "step": 489
3433
+ },
3434
+ {
3435
+ "epoch": 0.8864767073722297,
3436
+ "grad_norm": 6.46480131149292,
3437
+ "learning_rate": 3.2257723535898175e-06,
3438
+ "loss": 4.9097,
3439
+ "step": 490
3440
+ },
3441
+ {
3442
+ "epoch": 0.8882858435097241,
3443
+ "grad_norm": 6.829686164855957,
3444
+ "learning_rate": 3.125251659099332e-06,
3445
+ "loss": 5.4115,
3446
+ "step": 491
3447
+ },
3448
+ {
3449
+ "epoch": 0.8900949796472184,
3450
+ "grad_norm": 7.085306167602539,
3451
+ "learning_rate": 3.0262715167009458e-06,
3452
+ "loss": 5.6045,
3453
+ "step": 492
3454
+ },
3455
+ {
3456
+ "epoch": 0.8919041157847128,
3457
+ "grad_norm": 7.94856071472168,
3458
+ "learning_rate": 2.928835179405548e-06,
3459
+ "loss": 5.6366,
3460
+ "step": 493
3461
+ },
3462
+ {
3463
+ "epoch": 0.8937132519222072,
3464
+ "grad_norm": 8.26148796081543,
3465
+ "learning_rate": 2.8329458494863847e-06,
3466
+ "loss": 5.7356,
3467
+ "step": 494
3468
+ },
3469
+ {
3470
+ "epoch": 0.8955223880597015,
3471
+ "grad_norm": 8.643485069274902,
3472
+ "learning_rate": 2.738606678373873e-06,
3473
+ "loss": 5.2438,
3474
+ "step": 495
3475
+ },
3476
+ {
3477
+ "epoch": 0.8973315241971959,
3478
+ "grad_norm": 9.256296157836914,
3479
+ "learning_rate": 2.645820766552026e-06,
3480
+ "loss": 5.6607,
3481
+ "step": 496
3482
+ },
3483
+ {
3484
+ "epoch": 0.8991406603346902,
3485
+ "grad_norm": 10.048086166381836,
3486
+ "learning_rate": 2.554591163456527e-06,
3487
+ "loss": 5.3024,
3488
+ "step": 497
3489
+ },
3490
+ {
3491
+ "epoch": 0.9009497964721845,
3492
+ "grad_norm": 10.239492416381836,
3493
+ "learning_rate": 2.4649208673745316e-06,
3494
+ "loss": 4.6682,
3495
+ "step": 498
3496
+ },
3497
+ {
3498
+ "epoch": 0.9027589326096789,
3499
+ "grad_norm": 12.140185356140137,
3500
+ "learning_rate": 2.3768128253461253e-06,
3501
+ "loss": 5.7405,
3502
+ "step": 499
3503
+ },
3504
+ {
3505
+ "epoch": 0.9045680687471732,
3506
+ "grad_norm": 11.735483169555664,
3507
+ "learning_rate": 2.2902699330674573e-06,
3508
+ "loss": 4.4613,
3509
+ "step": 500
3510
+ },
3511
+ {
3512
+ "epoch": 0.9063772048846676,
3513
+ "grad_norm": 5.878971099853516,
3514
+ "learning_rate": 2.205295034795596e-06,
3515
+ "loss": 3.6071,
3516
+ "step": 501
3517
+ },
3518
+ {
3519
+ "epoch": 0.9081863410221619,
3520
+ "grad_norm": 7.937101364135742,
3521
+ "learning_rate": 2.1218909232550155e-06,
3522
+ "loss": 3.7888,
3523
+ "step": 502
3524
+ },
3525
+ {
3526
+ "epoch": 0.9099954771596562,
3527
+ "grad_norm": 9.610072135925293,
3528
+ "learning_rate": 2.0400603395458407e-06,
3529
+ "loss": 3.6901,
3530
+ "step": 503
3531
+ },
3532
+ {
3533
+ "epoch": 0.9118046132971506,
3534
+ "grad_norm": 7.69047737121582,
3535
+ "learning_rate": 1.9598059730537466e-06,
3536
+ "loss": 3.14,
3537
+ "step": 504
3538
+ },
3539
+ {
3540
+ "epoch": 0.9136137494346449,
3541
+ "grad_norm": 7.912228107452393,
3542
+ "learning_rate": 1.8811304613615909e-06,
3543
+ "loss": 2.9848,
3544
+ "step": 505
3545
+ },
3546
+ {
3547
+ "epoch": 0.9154228855721394,
3548
+ "grad_norm": 7.0957489013671875,
3549
+ "learning_rate": 1.8040363901627e-06,
3550
+ "loss": 2.4616,
3551
+ "step": 506
3552
+ },
3553
+ {
3554
+ "epoch": 0.9172320217096337,
3555
+ "grad_norm": 6.868281364440918,
3556
+ "learning_rate": 1.7285262931759082e-06,
3557
+ "loss": 2.9809,
3558
+ "step": 507
3559
+ },
3560
+ {
3561
+ "epoch": 0.919041157847128,
3562
+ "grad_norm": 7.486063480377197,
3563
+ "learning_rate": 1.6546026520622759e-06,
3564
+ "loss": 2.928,
3565
+ "step": 508
3566
+ },
3567
+ {
3568
+ "epoch": 0.9208502939846224,
3569
+ "grad_norm": 7.200881481170654,
3570
+ "learning_rate": 1.5822678963435478e-06,
3571
+ "loss": 3.2972,
3572
+ "step": 509
3573
+ },
3574
+ {
3575
+ "epoch": 0.9226594301221167,
3576
+ "grad_norm": 6.531789779663086,
3577
+ "learning_rate": 1.5115244033222731e-06,
3578
+ "loss": 2.9837,
3579
+ "step": 510
3580
+ },
3581
+ {
3582
+ "epoch": 0.924468566259611,
3583
+ "grad_norm": 6.44420051574707,
3584
+ "learning_rate": 1.4423744980037068e-06,
3585
+ "loss": 3.4836,
3586
+ "step": 511
3587
+ },
3588
+ {
3589
+ "epoch": 0.9262777023971054,
3590
+ "grad_norm": 6.985484600067139,
3591
+ "learning_rate": 1.3748204530193987e-06,
3592
+ "loss": 3.5155,
3593
+ "step": 512
3594
+ },
3595
+ {
3596
+ "epoch": 0.9280868385345997,
3597
+ "grad_norm": 6.48081111907959,
3598
+ "learning_rate": 1.3088644885524637e-06,
3599
+ "loss": 2.7183,
3600
+ "step": 513
3601
+ },
3602
+ {
3603
+ "epoch": 0.9298959746720941,
3604
+ "grad_norm": 6.328634738922119,
3605
+ "learning_rate": 1.2445087722646575e-06,
3606
+ "loss": 3.8345,
3607
+ "step": 514
3608
+ },
3609
+ {
3610
+ "epoch": 0.9317051108095884,
3611
+ "grad_norm": 6.782224655151367,
3612
+ "learning_rate": 1.1817554192251e-06,
3613
+ "loss": 4.047,
3614
+ "step": 515
3615
+ },
3616
+ {
3617
+ "epoch": 0.9335142469470827,
3618
+ "grad_norm": 7.253306865692139,
3619
+ "learning_rate": 1.1206064918408143e-06,
3620
+ "loss": 3.5562,
3621
+ "step": 516
3622
+ },
3623
+ {
3624
+ "epoch": 0.9353233830845771,
3625
+ "grad_norm": 7.641778469085693,
3626
+ "learning_rate": 1.0610639997888916e-06,
3627
+ "loss": 3.4378,
3628
+ "step": 517
3629
+ },
3630
+ {
3631
+ "epoch": 0.9371325192220714,
3632
+ "grad_norm": 8.075942039489746,
3633
+ "learning_rate": 1.0031298999504558e-06,
3634
+ "loss": 4.6599,
3635
+ "step": 518
3636
+ },
3637
+ {
3638
+ "epoch": 0.9389416553595658,
3639
+ "grad_norm": 6.147914409637451,
3640
+ "learning_rate": 9.468060963463755e-07,
3641
+ "loss": 3.3544,
3642
+ "step": 519
3643
+ },
3644
+ {
3645
+ "epoch": 0.9407507914970602,
3646
+ "grad_norm": 6.727973937988281,
3647
+ "learning_rate": 8.920944400746589e-07,
3648
+ "loss": 3.6667,
3649
+ "step": 520
3650
+ },
3651
+ {
3652
+ "epoch": 0.9425599276345545,
3653
+ "grad_norm": 6.554884433746338,
3654
+ "learning_rate": 8.389967292496359e-07,
3655
+ "loss": 2.9346,
3656
+ "step": 521
3657
+ },
3658
+ {
3659
+ "epoch": 0.9443690637720489,
3660
+ "grad_norm": 6.31584358215332,
3661
+ "learning_rate": 7.875147089428437e-07,
3662
+ "loss": 3.7725,
3663
+ "step": 522
3664
+ },
3665
+ {
3666
+ "epoch": 0.9461781999095432,
3667
+ "grad_norm": 7.042573928833008,
3668
+ "learning_rate": 7.376500711257061e-07,
3669
+ "loss": 4.3886,
3670
+ "step": 523
3671
+ },
3672
+ {
3673
+ "epoch": 0.9479873360470376,
3674
+ "grad_norm": 6.561697959899902,
3675
+ "learning_rate": 6.894044546138845e-07,
3676
+ "loss": 3.9783,
3677
+ "step": 524
3678
+ },
3679
+ {
3680
+ "epoch": 0.9497964721845319,
3681
+ "grad_norm": 6.423247814178467,
3682
+ "learning_rate": 6.427794450134528e-07,
3683
+ "loss": 3.5429,
3684
+ "step": 525
3685
+ },
3686
+ {
3687
+ "epoch": 0.9516056083220262,
3688
+ "grad_norm": 8.323712348937988,
3689
+ "learning_rate": 5.977765746687569e-07,
3690
+ "loss": 3.9779,
3691
+ "step": 526
3692
+ },
3693
+ {
3694
+ "epoch": 0.9534147444595206,
3695
+ "grad_norm": 7.722854137420654,
3696
+ "learning_rate": 5.543973226120935e-07,
3697
+ "loss": 4.0921,
3698
+ "step": 527
3699
+ },
3700
+ {
3701
+ "epoch": 0.9552238805970149,
3702
+ "grad_norm": 6.630421161651611,
3703
+ "learning_rate": 5.126431145150546e-07,
3704
+ "loss": 4.1686,
3705
+ "step": 528
3706
+ },
3707
+ {
3708
+ "epoch": 0.9570330167345092,
3709
+ "grad_norm": 6.825472831726074,
3710
+ "learning_rate": 4.7251532264170895e-07,
3711
+ "loss": 4.5243,
3712
+ "step": 529
3713
+ },
3714
+ {
3715
+ "epoch": 0.9588421528720036,
3716
+ "grad_norm": 6.3734965324401855,
3717
+ "learning_rate": 4.340152658034835e-07,
3718
+ "loss": 3.5058,
3719
+ "step": 530
3720
+ },
3721
+ {
3722
+ "epoch": 0.9606512890094979,
3723
+ "grad_norm": 6.643232345581055,
3724
+ "learning_rate": 3.971442093158195e-07,
3725
+ "loss": 3.8592,
3726
+ "step": 531
3727
+ },
3728
+ {
3729
+ "epoch": 0.9624604251469923,
3730
+ "grad_norm": 6.958840847015381,
3731
+ "learning_rate": 3.61903364956595e-07,
3732
+ "loss": 3.8099,
3733
+ "step": 532
3734
+ },
3735
+ {
3736
+ "epoch": 0.9642695612844867,
3737
+ "grad_norm": 6.364724159240723,
3738
+ "learning_rate": 3.282938909263122e-07,
3739
+ "loss": 3.5475,
3740
+ "step": 533
3741
+ },
3742
+ {
3743
+ "epoch": 0.966078697421981,
3744
+ "grad_norm": 6.911684513092041,
3745
+ "learning_rate": 2.9631689180999457e-07,
3746
+ "loss": 3.4872,
3747
+ "step": 534
3748
+ },
3749
+ {
3750
+ "epoch": 0.9678878335594754,
3751
+ "grad_norm": 6.476950645446777,
3752
+ "learning_rate": 2.6597341854092684e-07,
3753
+ "loss": 3.4591,
3754
+ "step": 535
3755
+ },
3756
+ {
3757
+ "epoch": 0.9696969696969697,
3758
+ "grad_norm": 7.094813823699951,
3759
+ "learning_rate": 2.3726446836608296e-07,
3760
+ "loss": 3.6974,
3761
+ "step": 536
3762
+ },
3763
+ {
3764
+ "epoch": 0.9715061058344641,
3765
+ "grad_norm": 8.420538902282715,
3766
+ "learning_rate": 2.101909848133743e-07,
3767
+ "loss": 3.4565,
3768
+ "step": 537
3769
+ },
3770
+ {
3771
+ "epoch": 0.9733152419719584,
3772
+ "grad_norm": 7.818291187286377,
3773
+ "learning_rate": 1.8475385766063003e-07,
3774
+ "loss": 3.5051,
3775
+ "step": 538
3776
+ },
3777
+ {
3778
+ "epoch": 0.9751243781094527,
3779
+ "grad_norm": 7.150783538818359,
3780
+ "learning_rate": 1.6095392290635393e-07,
3781
+ "loss": 3.7192,
3782
+ "step": 539
3783
+ },
3784
+ {
3785
+ "epoch": 0.9769335142469471,
3786
+ "grad_norm": 6.040256023406982,
3787
+ "learning_rate": 1.3879196274224627e-07,
3788
+ "loss": 3.8009,
3789
+ "step": 540
3790
+ },
3791
+ {
3792
+ "epoch": 0.9787426503844414,
3793
+ "grad_norm": 6.803682327270508,
3794
+ "learning_rate": 1.1826870552749669e-07,
3795
+ "loss": 5.3465,
3796
+ "step": 541
3797
+ },
3798
+ {
3799
+ "epoch": 0.9805517865219358,
3800
+ "grad_norm": 6.929571151733398,
3801
+ "learning_rate": 9.938482576487552e-08,
3802
+ "loss": 5.3678,
3803
+ "step": 542
3804
+ },
3805
+ {
3806
+ "epoch": 0.9823609226594301,
3807
+ "grad_norm": 7.471601486206055,
3808
+ "learning_rate": 8.214094407851813e-08,
3809
+ "loss": 5.8603,
3810
+ "step": 543
3811
+ },
3812
+ {
3813
+ "epoch": 0.9841700587969244,
3814
+ "grad_norm": 8.225199699401855,
3815
+ "learning_rate": 6.653762719355805e-08,
3816
+ "loss": 5.9743,
3817
+ "step": 544
3818
+ },
3819
+ {
3820
+ "epoch": 0.9859791949344188,
3821
+ "grad_norm": 8.476972579956055,
3822
+ "learning_rate": 5.257538791749172e-08,
3823
+ "loss": 5.3998,
3824
+ "step": 545
3825
+ },
3826
+ {
3827
+ "epoch": 0.9877883310719131,
3828
+ "grad_norm": 9.56104564666748,
3829
+ "learning_rate": 4.025468512333097e-08,
3830
+ "loss": 5.3685,
3831
+ "step": 546
3832
+ },
3833
+ {
3834
+ "epoch": 0.9895974672094076,
3835
+ "grad_norm": 8.92081356048584,
3836
+ "learning_rate": 2.957592373452056e-08,
3837
+ "loss": 4.7335,
3838
+ "step": 547
3839
+ },
3840
+ {
3841
+ "epoch": 0.9914066033469019,
3842
+ "grad_norm": 10.06761360168457,
3843
+ "learning_rate": 2.053945471162666e-08,
3844
+ "loss": 5.0433,
3845
+ "step": 548
3846
+ },
3847
+ {
3848
+ "epoch": 0.9932157394843962,
3849
+ "grad_norm": 10.523520469665527,
3850
+ "learning_rate": 1.3145575040801606e-08,
3851
+ "loss": 5.8829,
3852
+ "step": 549
3853
+ },
3854
+ {
3855
+ "epoch": 0.9950248756218906,
3856
+ "grad_norm": 11.902076721191406,
3857
+ "learning_rate": 7.394527724030598e-09,
3858
+ "loss": 4.1261,
3859
+ "step": 550
3860
+ },
3861
+ {
3862
+ "epoch": 0.9968340117593849,
3863
+ "grad_norm": 7.296747207641602,
3864
+ "learning_rate": 3.286501771138095e-09,
3865
+ "loss": 3.5647,
3866
+ "step": 551
3867
+ },
3868
+ {
3869
+ "epoch": 0.9986431478968792,
3870
+ "grad_norm": 6.893759250640869,
3871
+ "learning_rate": 8.216321935816674e-10,
3872
+ "loss": 3.7598,
3873
+ "step": 552
3874
+ },
3875
+ {
3876
+ "epoch": 0.9986431478968792,
3877
+ "eval_loss": 0.9817073345184326,
3878
+ "eval_runtime": 38.8717,
3879
+ "eval_samples_per_second": 11.988,
3880
+ "eval_steps_per_second": 3.01,
3881
+ "step": 552
3882
+ },
3883
+ {
3884
+ "epoch": 1.0009045680687472,
3885
+ "grad_norm": 7.406569957733154,
3886
+ "learning_rate": 0.0,
3887
+ "loss": 4.3257,
3888
+ "step": 553
3889
  }
3890
  ],
3891
  "logging_steps": 1,
 
  "should_evaluate": false,
  "should_log": false,
  "should_save": true,
+ "should_training_stop": true
  },
  "attributes": {}
  }
  },
+ "total_flos": 6.383309931864392e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null