GaborMadarasz committed on
Commit 04e5209 · verified · 1 parent: dc0a6ae

Upload HuMBERT model


The first Hungarian ModernBERT

config.json ADDED
@@ -0,0 +1,47 @@
+ {
+   "_name_or_path": "answerdotai/ModernBERT-base",
+   "architectures": [
+     "ModernBertForMaskedLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": null,
+   "classifier_activation": "gelu",
+   "classifier_bias": false,
+   "classifier_dropout": 0.0,
+   "classifier_pooling": "mean",
+   "cls_token_id": 50281,
+   "decoder_bias": true,
+   "deterministic_flash_attn": false,
+   "embedding_dropout": 0.0,
+   "eos_token_id": null,
+   "global_attn_every_n_layers": 3,
+   "global_rope_theta": 160000.0,
+   "gradient_checkpointing": false,
+   "hidden_activation": "gelu",
+   "hidden_size": 768,
+   "initializer_cutoff_factor": 2.0,
+   "initializer_range": 0.02,
+   "intermediate_size": 1152,
+   "layer_norm_eps": 1e-05,
+   "local_attention": 128,
+   "local_rope_theta": 10000.0,
+   "max_position_embeddings": 8192,
+   "mlp_bias": false,
+   "mlp_dropout": 0.0,
+   "model_type": "modernbert",
+   "norm_bias": false,
+   "norm_eps": 1e-05,
+   "num_attention_heads": 12,
+   "num_hidden_layers": 22,
+   "pad_token_id": 50283,
+   "position_embedding_type": "absolute",
+   "reference_compile": true,
+   "repad_logits_with_grad": false,
+   "sep_token_id": 50282,
+   "sparse_pred_ignore_index": -100,
+   "sparse_prediction": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.48.3",
+   "vocab_size": 52000
+ }
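
The config above is a standard ModernBERT masked-LM configuration initialized from answerdotai/ModernBERT-base: 22 layers, 12 attention heads, hidden size 768, an 8192-token context window, and a 52,000-token Hungarian vocabulary. As a minimal usage sketch (not part of this commit), the snippet below loads the checkpoint for fill-mask inference; the repository id is a placeholder and should be replaced with this repo's actual path. ModernBERT needs transformers >= 4.48, consistent with the "transformers_version" recorded above.

```python
# Minimal sketch, not part of this commit: load the uploaded checkpoint for
# masked-LM inference. "GaborMadarasz/HuMBERT" is a placeholder repo id --
# point it at this repository or at a local clone of these files.
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

repo = "GaborMadarasz/HuMBERT"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForMaskedLM.from_pretrained(repo)
model.eval()

# Fill-mask example using the [MASK] token defined in special_tokens_map.json.
text = f"Magyarország fővárosa {tokenizer.mask_token}."
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

mask_pos = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
top5 = logits[0, mask_pos[0]].topk(5).indices.tolist()
print(tokenizer.convert_ids_to_tokens(top5))
```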
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:12223fde4b5b7faa4707932cb33e904fdb96a8bdec71c6ee17364281b8ec9eb4
+ size 603655064
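
model.safetensors is tracked with Git LFS, so the diff shows only the three-line pointer above; the actual weight file is about 604 MB, which at float32 corresponds to roughly 151 million parameters. A minimal sketch (assuming the file has been downloaded locally) for inspecting the tensors with the safetensors library:

```python
# Minimal sketch: inspect the LFS-tracked weights after downloading them
# (the three lines above are only the Git LFS pointer, not the weights).
from safetensors import safe_open

with safe_open("model.safetensors", framework="pt", device="cpu") as f:
    n_params = 0
    for name in f.keys():
        shape = f.get_slice(name).get_shape()
        count = 1
        for dim in shape:
            count *= dim
        n_params += count
    print(f"{len(f.keys())} tensors, {n_params / 1e6:.1f}M parameters")
```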
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2256c6848c20bc86005d7d5f232e62b8387cd260dd85250b5c4b9eacb36d91b0
+ size 14244
scheduler.pt ADDED
Binary file (1.06 kB).
 
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,73 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<|padding|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "6": {
+       "content": "[MASK]",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "model_input_names": [
+     "input_ids",
+     "attention_mask"
+   ],
+   "model_max_length": 8192,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "tokenizer_class": "PreTrainedTokenizerFast",
+   "unk_token": "[UNK]"
+ }
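
The tokenizer is a PreTrainedTokenizerFast whose special tokens sit at low ids ([UNK] = 2, [CLS] = 3, [SEP] = 4, [PAD] = 5, [MASK] = 6), while config.json above still records the ModernBERT-base ids (cls_token_id 50281, sep_token_id 50282, pad_token_id 50283). A quick consistency check along the lines below may be worth running before fine-tuning; the repo id is again a placeholder:

```python
# Minimal sketch: compare the tokenizer's special-token ids with the ids that
# config.json records. "GaborMadarasz/HuMBERT" is a placeholder repo id.
from transformers import AutoConfig, AutoTokenizer

repo = "GaborMadarasz/HuMBERT"  # placeholder
cfg = AutoConfig.from_pretrained(repo)
tok = AutoTokenizer.from_pretrained(repo)

for name in ("cls_token_id", "sep_token_id", "pad_token_id"):
    config_id = getattr(cfg, name)
    tokenizer_id = getattr(tok, name)
    status = "OK" if config_id == tokenizer_id else "MISMATCH"
    print(f"{name}: config={config_id} tokenizer={tokenizer_id} {status}")
```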
trainer_state.json ADDED
@@ -0,0 +1,1437 @@
1
+ {
2
+ "best_metric": 2.840968608856201,
3
+ "best_model_checkpoint": "/home/jovyan/work/jupytershared/madaraszg/hun_modernBERT-base/checkpoint-900000",
4
+ "epoch": 0.7064444611025793,
5
+ "eval_steps": 50000,
6
+ "global_step": 900000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0039246914505698845,
13
+ "grad_norm": 5.427852153778076,
14
+ "learning_rate": 0.0004998,
15
+ "loss": 12.7757,
16
+ "step": 5000
17
+ },
18
+ {
19
+ "epoch": 0.007849382901139769,
20
+ "grad_norm": 3.9721639156341553,
21
+ "learning_rate": 0.0004999952362977937,
22
+ "loss": 9.605,
23
+ "step": 10000
24
+ },
25
+ {
26
+ "epoch": 0.011774074351709655,
27
+ "grad_norm": 4.18765115737915,
28
+ "learning_rate": 0.0004999809491856918,
29
+ "loss": 8.2774,
30
+ "step": 15000
31
+ },
32
+ {
33
+ "epoch": 0.015698765802279538,
34
+ "grad_norm": 3.2071168422698975,
35
+ "learning_rate": 0.0004999571563624537,
36
+ "loss": 7.7417,
37
+ "step": 20000
38
+ },
39
+ {
40
+ "epoch": 0.019623457252849426,
41
+ "grad_norm": 3.8722894191741943,
42
+ "learning_rate": 0.0004999237996462354,
43
+ "loss": 7.5097,
44
+ "step": 25000
45
+ },
46
+ {
47
+ "epoch": 0.02354814870341931,
48
+ "grad_norm": 3.7934718132019043,
49
+ "learning_rate": 0.0004998809069899119,
50
+ "loss": 7.2104,
51
+ "step": 30000
52
+ },
53
+ {
54
+ "epoch": 0.027472840153989195,
55
+ "grad_norm": 3.849618673324585,
56
+ "learning_rate": 0.0004998285486553762,
57
+ "loss": 7.3994,
58
+ "step": 35000
59
+ },
60
+ {
61
+ "epoch": 0.031397531604559076,
62
+ "grad_norm": 4.657052516937256,
63
+ "learning_rate": 0.0004997666008298054,
64
+ "loss": 6.9989,
65
+ "step": 40000
66
+ },
67
+ {
68
+ "epoch": 0.035322223055128964,
69
+ "grad_norm": 4.1764092445373535,
70
+ "learning_rate": 0.0004996951230621116,
71
+ "loss": 6.8514,
72
+ "step": 45000
73
+ },
74
+ {
75
+ "epoch": 0.03924691450569885,
76
+ "grad_norm": 3.2545883655548096,
77
+ "learning_rate": 0.000499614118079557,
78
+ "loss": 6.7546,
79
+ "step": 50000
80
+ },
81
+ {
82
+ "epoch": 0.03924691450569885,
83
+ "eval_loss": 3.438565969467163,
84
+ "eval_runtime": 920.3232,
85
+ "eval_samples_per_second": 234.641,
86
+ "eval_steps_per_second": 29.331,
87
+ "step": 50000
88
+ },
89
+ {
90
+ "epoch": 0.04317160595626873,
91
+ "grad_norm": 2.773167610168457,
92
+ "learning_rate": 0.0004995236461452331,
93
+ "loss": 6.6936,
94
+ "step": 55000
95
+ },
96
+ {
97
+ "epoch": 0.04709629740683862,
98
+ "grad_norm": 3.7268900871276855,
99
+ "learning_rate": 0.0004994236230404189,
100
+ "loss": 6.6314,
101
+ "step": 60000
102
+ },
103
+ {
104
+ "epoch": 0.0510209888574085,
105
+ "grad_norm": 4.384599685668945,
106
+ "learning_rate": 0.0004993141554730779,
107
+ "loss": 6.5624,
108
+ "step": 65000
109
+ },
110
+ {
111
+ "epoch": 0.05494568030797839,
112
+ "grad_norm": 3.721292018890381,
113
+ "learning_rate": 0.0004991950913888839,
114
+ "loss": 6.5376,
115
+ "step": 70000
116
+ },
117
+ {
118
+ "epoch": 0.05887037175854827,
119
+ "grad_norm": 4.107857704162598,
120
+ "learning_rate": 0.0004990665724978456,
121
+ "loss": 6.4852,
122
+ "step": 75000
123
+ },
124
+ {
125
+ "epoch": 0.06279506320911815,
126
+ "grad_norm": 4.358203887939453,
127
+ "learning_rate": 0.0004989285579755796,
128
+ "loss": 6.4618,
129
+ "step": 80000
130
+ },
131
+ {
132
+ "epoch": 0.06671975465968805,
133
+ "grad_norm": 2.996066093444824,
134
+ "learning_rate": 0.0004987810835455134,
135
+ "loss": 6.4299,
136
+ "step": 85000
137
+ },
138
+ {
139
+ "epoch": 0.07064444611025793,
140
+ "grad_norm": 3.4833667278289795,
141
+ "learning_rate": 0.0004986240634463282,
142
+ "loss": 6.3824,
143
+ "step": 90000
144
+ },
145
+ {
146
+ "epoch": 0.07456913756082781,
147
+ "grad_norm": 4.706465721130371,
148
+ "learning_rate": 0.0004984575950481576,
149
+ "loss": 6.3747,
150
+ "step": 95000
151
+ },
152
+ {
153
+ "epoch": 0.0784938290113977,
154
+ "grad_norm": 5.1914215087890625,
155
+ "learning_rate": 0.0004982817626825089,
156
+ "loss": 6.3426,
157
+ "step": 100000
158
+ },
159
+ {
160
+ "epoch": 0.0784938290113977,
161
+ "eval_loss": 3.232980966567993,
162
+ "eval_runtime": 1042.7899,
163
+ "eval_samples_per_second": 207.085,
164
+ "eval_steps_per_second": 25.886,
165
+ "step": 100000
166
+ },
167
+ {
168
+ "epoch": 0.08241852046196758,
169
+ "grad_norm": 3.467543363571167,
170
+ "learning_rate": 0.0004980962857598535,
171
+ "loss": 6.323,
172
+ "step": 105000
173
+ },
174
+ {
175
+ "epoch": 0.08634321191253747,
176
+ "grad_norm": 4.3943400382995605,
177
+ "learning_rate": 0.000497901382561167,
178
+ "loss": 6.2856,
179
+ "step": 110000
180
+ },
181
+ {
182
+ "epoch": 0.09026790336310735,
183
+ "grad_norm": 5.44612979888916,
184
+ "learning_rate": 0.0004976970661964071,
185
+ "loss": 6.2661,
186
+ "step": 115000
187
+ },
188
+ {
189
+ "epoch": 0.09419259481367724,
190
+ "grad_norm": 4.370021820068359,
191
+ "learning_rate": 0.0004974833064126723,
192
+ "loss": 6.2631,
193
+ "step": 120000
194
+ },
195
+ {
196
+ "epoch": 0.09811728626424712,
197
+ "grad_norm": 5.689448833465576,
198
+ "learning_rate": 0.0004972601113595269,
199
+ "loss": 6.2489,
200
+ "step": 125000
201
+ },
202
+ {
203
+ "epoch": 0.102041977714817,
204
+ "grad_norm": 4.484179496765137,
205
+ "learning_rate": 0.0004970275370315128,
206
+ "loss": 6.2418,
207
+ "step": 130000
208
+ },
209
+ {
210
+ "epoch": 0.10596666916538688,
211
+ "grad_norm": 3.9610681533813477,
212
+ "learning_rate": 0.0004967854498415297,
213
+ "loss": 6.2117,
214
+ "step": 135000
215
+ },
216
+ {
217
+ "epoch": 0.10989136061595678,
218
+ "grad_norm": 3.5981478691101074,
219
+ "learning_rate": 0.0004965340014730877,
220
+ "loss": 6.1937,
221
+ "step": 140000
222
+ },
223
+ {
224
+ "epoch": 0.11381605206652666,
225
+ "grad_norm": 4.028958320617676,
226
+ "learning_rate": 0.0004962731540273645,
227
+ "loss": 6.185,
228
+ "step": 145000
229
+ },
230
+ {
231
+ "epoch": 0.11774074351709654,
232
+ "grad_norm": 3.569103479385376,
233
+ "learning_rate": 0.0004960029174491356,
234
+ "loss": 6.1823,
235
+ "step": 150000
236
+ },
237
+ {
238
+ "epoch": 0.11774074351709654,
239
+ "eval_loss": 3.1528232097625732,
240
+ "eval_runtime": 1097.5085,
241
+ "eval_samples_per_second": 196.76,
242
+ "eval_steps_per_second": 24.596,
243
+ "step": 150000
244
+ },
245
+ {
246
+ "epoch": 0.12166543496766644,
247
+ "grad_norm": 3.469241142272949,
248
+ "learning_rate": 0.0004957233020411361,
249
+ "loss": 6.167,
250
+ "step": 155000
251
+ },
252
+ {
253
+ "epoch": 0.1255901264182363,
254
+ "grad_norm": 5.208609580993652,
255
+ "learning_rate": 0.0004954343772197868,
256
+ "loss": 6.1673,
257
+ "step": 160000
258
+ },
259
+ {
260
+ "epoch": 0.1295148178688062,
261
+ "grad_norm": 4.649272918701172,
262
+ "learning_rate": 0.0004951359777341922,
263
+ "loss": 6.1443,
264
+ "step": 165000
265
+ },
266
+ {
267
+ "epoch": 0.1334395093193761,
268
+ "grad_norm": 4.373592376708984,
269
+ "learning_rate": 0.0004948284162183676,
270
+ "loss": 6.1392,
271
+ "step": 170000
272
+ },
273
+ {
274
+ "epoch": 0.13736420076994596,
275
+ "grad_norm": 4.671726703643799,
276
+ "learning_rate": 0.0004945113350346094,
277
+ "loss": 6.1305,
278
+ "step": 175000
279
+ },
280
+ {
281
+ "epoch": 0.14128889222051585,
282
+ "grad_norm": 3.430539846420288,
283
+ "learning_rate": 0.0004941849281568307,
284
+ "loss": 6.1348,
285
+ "step": 180000
286
+ },
287
+ {
288
+ "epoch": 0.14521358367108575,
289
+ "grad_norm": 4.936987400054932,
290
+ "learning_rate": 0.0004938494122969139,
291
+ "loss": 6.1163,
292
+ "step": 185000
293
+ },
294
+ {
295
+ "epoch": 0.14913827512165562,
296
+ "grad_norm": 7.4127984046936035,
297
+ "learning_rate": 0.0004935043273650284,
298
+ "loss": 6.1055,
299
+ "step": 190000
300
+ },
301
+ {
302
+ "epoch": 0.1530629665722255,
303
+ "grad_norm": 4.886425495147705,
304
+ "learning_rate": 0.0004931501668390127,
305
+ "loss": 6.1162,
306
+ "step": 195000
307
+ },
308
+ {
309
+ "epoch": 0.1569876580227954,
310
+ "grad_norm": 5.542958736419678,
311
+ "learning_rate": 0.000492786518985573,
312
+ "loss": 6.0902,
313
+ "step": 200000
314
+ },
315
+ {
316
+ "epoch": 0.1569876580227954,
317
+ "eval_loss": 3.106311321258545,
318
+ "eval_runtime": 1143.7295,
319
+ "eval_samples_per_second": 188.809,
320
+ "eval_steps_per_second": 23.602,
321
+ "step": 200000
322
+ },
323
+ {
324
+ "epoch": 0.16091234947336527,
325
+ "grad_norm": 5.0378923416137695,
326
+ "learning_rate": 0.0004924137585324416,
327
+ "loss": 6.0822,
328
+ "step": 205000
329
+ },
330
+ {
331
+ "epoch": 0.16483704092393517,
332
+ "grad_norm": 3.9391469955444336,
333
+ "learning_rate": 0.0004920318334402453,
334
+ "loss": 6.0806,
335
+ "step": 210000
336
+ },
337
+ {
338
+ "epoch": 0.16876173237450504,
339
+ "grad_norm": 4.512287139892578,
340
+ "learning_rate": 0.0004916405262093358,
341
+ "loss": 6.0895,
342
+ "step": 215000
343
+ },
344
+ {
345
+ "epoch": 0.17268642382507493,
346
+ "grad_norm": 4.639518737792969,
347
+ "learning_rate": 0.0004912400838187446,
348
+ "loss": 6.0726,
349
+ "step": 220000
350
+ },
351
+ {
352
+ "epoch": 0.17661111527564483,
353
+ "grad_norm": 4.6264190673828125,
354
+ "learning_rate": 0.0004908306928184586,
355
+ "loss": 6.075,
356
+ "step": 225000
357
+ },
358
+ {
359
+ "epoch": 0.1805358067262147,
360
+ "grad_norm": 4.821324348449707,
361
+ "learning_rate": 0.000490411707628039,
362
+ "loss": 6.0763,
363
+ "step": 230000
364
+ },
365
+ {
366
+ "epoch": 0.1844604981767846,
367
+ "grad_norm": 4.997890949249268,
368
+ "learning_rate": 0.0004899836359827696,
369
+ "loss": 6.0439,
370
+ "step": 235000
371
+ },
372
+ {
373
+ "epoch": 0.18838518962735448,
374
+ "grad_norm": 5.442014694213867,
375
+ "learning_rate": 0.0004895464997071264,
376
+ "loss": 6.048,
377
+ "step": 240000
378
+ },
379
+ {
380
+ "epoch": 0.19230988107792435,
381
+ "grad_norm": 4.34916353225708,
382
+ "learning_rate": 0.0004891002307518132,
383
+ "loss": 6.059,
384
+ "step": 245000
385
+ },
386
+ {
387
+ "epoch": 0.19623457252849424,
388
+ "grad_norm": 4.754409313201904,
389
+ "learning_rate": 0.0004886448461307771,
390
+ "loss": 6.0483,
391
+ "step": 250000
392
+ },
393
+ {
394
+ "epoch": 0.19623457252849424,
395
+ "eval_loss": 3.0871126651763916,
396
+ "eval_runtime": 1102.0863,
397
+ "eval_samples_per_second": 195.943,
398
+ "eval_steps_per_second": 24.494,
399
+ "step": 250000
400
+ },
401
+ {
402
+ "epoch": 0.20015926397906414,
403
+ "grad_norm": 7.54548978805542,
404
+ "learning_rate": 0.0004881803632054984,
405
+ "loss": 6.0235,
406
+ "step": 255000
407
+ },
408
+ {
409
+ "epoch": 0.204083955429634,
410
+ "grad_norm": 6.219805717468262,
411
+ "learning_rate": 0.00048770689534197696,
412
+ "loss": 6.0259,
413
+ "step": 260000
414
+ },
415
+ {
416
+ "epoch": 0.2080086468802039,
417
+ "grad_norm": 4.785896301269531,
418
+ "learning_rate": 0.00048722417362181855,
419
+ "loss": 6.0152,
420
+ "step": 265000
421
+ },
422
+ {
423
+ "epoch": 0.21193333833077377,
424
+ "grad_norm": 6.293859958648682,
425
+ "learning_rate": 0.000486732602694764,
426
+ "loss": 6.014,
427
+ "step": 270000
428
+ },
429
+ {
430
+ "epoch": 0.21585802978134366,
431
+ "grad_norm": 5.359744548797607,
432
+ "learning_rate": 0.0004862318078178062,
433
+ "loss": 6.0129,
434
+ "step": 275000
435
+ },
436
+ {
437
+ "epoch": 0.21978272123191356,
438
+ "grad_norm": 4.500082015991211,
439
+ "learning_rate": 0.00048572210591012405,
440
+ "loss": 6.0007,
441
+ "step": 280000
442
+ },
443
+ {
444
+ "epoch": 0.22370741268248343,
445
+ "grad_norm": 5.4786906242370605,
446
+ "learning_rate": 0.00048520341712729654,
447
+ "loss": 5.9918,
448
+ "step": 285000
449
+ },
450
+ {
451
+ "epoch": 0.22763210413305332,
452
+ "grad_norm": 5.049560546875,
453
+ "learning_rate": 0.0004846758677132269,
454
+ "loss": 6.0041,
455
+ "step": 290000
456
+ },
457
+ {
458
+ "epoch": 0.23155679558362322,
459
+ "grad_norm": 4.548033714294434,
460
+ "learning_rate": 0.00048413915837784905,
461
+ "loss": 6.009,
462
+ "step": 295000
463
+ },
464
+ {
465
+ "epoch": 0.23548148703419308,
466
+ "grad_norm": 5.688722133636475,
467
+ "learning_rate": 0.00048359362898595776,
468
+ "loss": 5.9754,
469
+ "step": 300000
470
+ },
471
+ {
472
+ "epoch": 0.23548148703419308,
473
+ "eval_loss": 3.070676803588867,
474
+ "eval_runtime": 1156.3208,
475
+ "eval_samples_per_second": 186.753,
476
+ "eval_steps_per_second": 23.345,
477
+ "step": 300000
478
+ },
479
+ {
480
+ "epoch": 0.23940617848476298,
481
+ "grad_norm": 3.3441436290740967,
482
+ "learning_rate": 0.0004830391938668317,
483
+ "loss": 5.9819,
484
+ "step": 305000
485
+ },
486
+ {
487
+ "epoch": 0.24333086993533287,
488
+ "grad_norm": 4.076082229614258,
489
+ "learning_rate": 0.0004824759877544745,
490
+ "loss": 5.9817,
491
+ "step": 310000
492
+ },
493
+ {
494
+ "epoch": 0.24725556138590274,
495
+ "grad_norm": 4.681301593780518,
496
+ "learning_rate": 0.00048190380670406807,
497
+ "loss": 5.9629,
498
+ "step": 315000
499
+ },
500
+ {
501
+ "epoch": 0.2511802528364726,
502
+ "grad_norm": 5.435401439666748,
503
+ "learning_rate": 0.0004813229014844921,
504
+ "loss": 5.9558,
505
+ "step": 320000
506
+ },
507
+ {
508
+ "epoch": 0.2551049442870425,
509
+ "grad_norm": 5.168973922729492,
510
+ "learning_rate": 0.0004807330617418338,
511
+ "loss": 5.9585,
512
+ "step": 325000
513
+ },
514
+ {
515
+ "epoch": 0.2590296357376124,
516
+ "grad_norm": 3.879523754119873,
517
+ "learning_rate": 0.0004801344253307013,
518
+ "loss": 5.9367,
519
+ "step": 330000
520
+ },
521
+ {
522
+ "epoch": 0.2629543271881823,
523
+ "grad_norm": 3.8004238605499268,
524
+ "learning_rate": 0.00047952689266699537,
525
+ "loss": 5.9508,
526
+ "step": 335000
527
+ },
528
+ {
529
+ "epoch": 0.2668790186387522,
530
+ "grad_norm": 3.6730406284332275,
531
+ "learning_rate": 0.0004789107299737269,
532
+ "loss": 5.945,
533
+ "step": 340000
534
+ },
535
+ {
536
+ "epoch": 0.270803710089322,
537
+ "grad_norm": 5.348697662353516,
538
+ "learning_rate": 0.00047828621778500213,
539
+ "loss": 5.9411,
540
+ "step": 345000
541
+ },
542
+ {
543
+ "epoch": 0.2747284015398919,
544
+ "grad_norm": 6.138799667358398,
545
+ "learning_rate": 0.00047765250211029227,
546
+ "loss": 5.9372,
547
+ "step": 350000
548
+ },
549
+ {
550
+ "epoch": 0.2747284015398919,
551
+ "eval_loss": 3.0503780841827393,
552
+ "eval_runtime": 1159.4067,
553
+ "eval_samples_per_second": 186.256,
554
+ "eval_steps_per_second": 23.283,
555
+ "step": 350000
556
+ },
557
+ {
558
+ "epoch": 0.2786530929904618,
559
+ "grad_norm": 5.234776020050049,
560
+ "learning_rate": 0.0004770101037396353,
561
+ "loss": 5.9323,
562
+ "step": 355000
563
+ },
564
+ {
565
+ "epoch": 0.2825777844410317,
566
+ "grad_norm": 5.453892707824707,
567
+ "learning_rate": 0.00047635904717418853,
568
+ "loss": 5.9306,
569
+ "step": 360000
570
+ },
571
+ {
572
+ "epoch": 0.2865024758916016,
573
+ "grad_norm": 5.497768402099609,
574
+ "learning_rate": 0.00047569935724533363,
575
+ "loss": 5.9311,
576
+ "step": 365000
577
+ },
578
+ {
579
+ "epoch": 0.2904271673421715,
580
+ "grad_norm": 5.382653713226318,
581
+ "learning_rate": 0.0004750313282043671,
582
+ "loss": 5.9188,
583
+ "step": 370000
584
+ },
585
+ {
586
+ "epoch": 0.29435185879274134,
587
+ "grad_norm": 3.7600419521331787,
588
+ "learning_rate": 0.0004743548595639592,
589
+ "loss": 5.9109,
590
+ "step": 375000
591
+ },
592
+ {
593
+ "epoch": 0.29827655024331123,
594
+ "grad_norm": 6.006251811981201,
595
+ "learning_rate": 0.00047366929239918046,
596
+ "loss": 5.9207,
597
+ "step": 380000
598
+ },
599
+ {
600
+ "epoch": 0.30220124169388113,
601
+ "grad_norm": 6.547440052032471,
602
+ "learning_rate": 0.0004729753307170254,
603
+ "loss": 5.9264,
604
+ "step": 385000
605
+ },
606
+ {
607
+ "epoch": 0.306125933144451,
608
+ "grad_norm": 5.451165199279785,
609
+ "learning_rate": 0.0004722730060967992,
610
+ "loss": 5.9077,
611
+ "step": 390000
612
+ },
613
+ {
614
+ "epoch": 0.3100506245950209,
615
+ "grad_norm": 6.919593811035156,
616
+ "learning_rate": 0.0004715620642838824,
617
+ "loss": 5.9068,
618
+ "step": 395000
619
+ },
620
+ {
621
+ "epoch": 0.3139753160455908,
622
+ "grad_norm": 5.905742645263672,
623
+ "learning_rate": 0.00047084281681554897,
624
+ "loss": 5.9082,
625
+ "step": 400000
626
+ },
627
+ {
628
+ "epoch": 0.3139753160455908,
629
+ "eval_loss": 3.0186421871185303,
630
+ "eval_runtime": 1163.0768,
631
+ "eval_samples_per_second": 185.668,
632
+ "eval_steps_per_second": 23.209,
633
+ "step": 400000
634
+ },
635
+ {
636
+ "epoch": 0.31790000749616065,
637
+ "grad_norm": 7.0828986167907715,
638
+ "learning_rate": 0.00047011514974306426,
639
+ "loss": 5.9015,
640
+ "step": 405000
641
+ },
642
+ {
643
+ "epoch": 0.32182469894673055,
644
+ "grad_norm": 5.617277145385742,
645
+ "learning_rate": 0.0004693792389168259,
646
+ "loss": 5.8899,
647
+ "step": 410000
648
+ },
649
+ {
650
+ "epoch": 0.32574939039730044,
651
+ "grad_norm": 5.834865093231201,
652
+ "learning_rate": 0.0004686349676316719,
653
+ "loss": 5.9025,
654
+ "step": 415000
655
+ },
656
+ {
657
+ "epoch": 0.32967408184787034,
658
+ "grad_norm": 4.863770961761475,
659
+ "learning_rate": 0.00046788190992146675,
660
+ "loss": 5.8755,
661
+ "step": 420000
662
+ },
663
+ {
664
+ "epoch": 0.33359877329844023,
665
+ "grad_norm": 4.0446391105651855,
666
+ "learning_rate": 0.0004671209981513359,
667
+ "loss": 5.8738,
668
+ "step": 425000
669
+ },
670
+ {
671
+ "epoch": 0.33752346474901007,
672
+ "grad_norm": 4.7902045249938965,
673
+ "learning_rate": 0.0004663516572272386,
674
+ "loss": 5.8761,
675
+ "step": 430000
676
+ },
677
+ {
678
+ "epoch": 0.34144815619957997,
679
+ "grad_norm": 3.621206283569336,
680
+ "learning_rate": 0.00046557406792387514,
681
+ "loss": 5.877,
682
+ "step": 435000
683
+ },
684
+ {
685
+ "epoch": 0.34537284765014986,
686
+ "grad_norm": 4.824281692504883,
687
+ "learning_rate": 0.0004647881018418675,
688
+ "loss": 5.8776,
689
+ "step": 440000
690
+ },
691
+ {
692
+ "epoch": 0.34929753910071976,
693
+ "grad_norm": 5.029583930969238,
694
+ "learning_rate": 0.0004639942630746584,
695
+ "loss": 5.8648,
696
+ "step": 445000
697
+ },
698
+ {
699
+ "epoch": 0.35322223055128965,
700
+ "grad_norm": 4.80470609664917,
701
+ "learning_rate": 0.0004631924303768197,
702
+ "loss": 5.8674,
703
+ "step": 450000
704
+ },
705
+ {
706
+ "epoch": 0.35322223055128965,
707
+ "eval_loss": 3.011624336242676,
708
+ "eval_runtime": 1103.4516,
709
+ "eval_samples_per_second": 195.7,
710
+ "eval_steps_per_second": 24.463,
711
+ "step": 450000
712
+ },
713
+ {
714
+ "epoch": 0.35714692200185955,
715
+ "grad_norm": 4.197587490081787,
716
+ "learning_rate": 0.0004623819874532173,
717
+ "loss": 5.8754,
718
+ "step": 455000
719
+ },
720
+ {
721
+ "epoch": 0.3610716134524294,
722
+ "grad_norm": 4.719716548919678,
723
+ "learning_rate": 0.0004615632797034139,
724
+ "loss": 5.8609,
725
+ "step": 460000
726
+ },
727
+ {
728
+ "epoch": 0.3649963049029993,
729
+ "grad_norm": 5.633116245269775,
730
+ "learning_rate": 0.00046073683199574194,
731
+ "loss": 5.8661,
732
+ "step": 465000
733
+ },
734
+ {
735
+ "epoch": 0.3689209963535692,
736
+ "grad_norm": 4.527806758880615,
737
+ "learning_rate": 0.00045990218220831147,
738
+ "loss": 5.8481,
739
+ "step": 470000
740
+ },
741
+ {
742
+ "epoch": 0.37284568780413907,
743
+ "grad_norm": 7.49449348449707,
744
+ "learning_rate": 0.0004590596960876785,
745
+ "loss": 5.8393,
746
+ "step": 475000
747
+ },
748
+ {
749
+ "epoch": 0.37677037925470896,
750
+ "grad_norm": 5.242143630981445,
751
+ "learning_rate": 0.00045820958150135304,
752
+ "loss": 5.854,
753
+ "step": 480000
754
+ },
755
+ {
756
+ "epoch": 0.3806950707052788,
757
+ "grad_norm": 4.846193313598633,
758
+ "learning_rate": 0.0004573510176867819,
759
+ "loss": 5.8465,
760
+ "step": 485000
761
+ },
762
+ {
763
+ "epoch": 0.3846197621558487,
764
+ "grad_norm": 4.976168632507324,
765
+ "learning_rate": 0.00045648471959526093,
766
+ "loss": 5.8378,
767
+ "step": 490000
768
+ },
769
+ {
770
+ "epoch": 0.3885444536064186,
771
+ "grad_norm": 5.064621448516846,
772
+ "learning_rate": 0.00045561054930020917,
773
+ "loss": 5.8489,
774
+ "step": 495000
775
+ },
776
+ {
777
+ "epoch": 0.3924691450569885,
778
+ "grad_norm": 7.152626991271973,
779
+ "learning_rate": 0.0004547285401292574,
780
+ "loss": 5.8365,
781
+ "step": 500000
782
+ },
783
+ {
784
+ "epoch": 0.3924691450569885,
785
+ "eval_loss": 2.987938642501831,
786
+ "eval_runtime": 1182.504,
787
+ "eval_samples_per_second": 182.618,
788
+ "eval_steps_per_second": 22.828,
789
+ "step": 500000
790
+ },
791
+ {
792
+ "epoch": 0.3963938365075584,
793
+ "grad_norm": 5.304065227508545,
794
+ "learning_rate": 0.0004538385468960594,
795
+ "loss": 5.8472,
796
+ "step": 505000
797
+ },
798
+ {
799
+ "epoch": 0.4003185279581283,
800
+ "grad_norm": 5.780062198638916,
801
+ "learning_rate": 0.00045294077923425393,
802
+ "loss": 5.8492,
803
+ "step": 510000
804
+ },
805
+ {
806
+ "epoch": 0.4042432194086981,
807
+ "grad_norm": 5.706847667694092,
808
+ "learning_rate": 0.0004520356352035454,
809
+ "loss": 5.8333,
810
+ "step": 515000
811
+ },
812
+ {
813
+ "epoch": 0.408167910859268,
814
+ "grad_norm": 5.198765277862549,
815
+ "learning_rate": 0.00045112297511828384,
816
+ "loss": 5.827,
817
+ "step": 520000
818
+ },
819
+ {
820
+ "epoch": 0.4120926023098379,
821
+ "grad_norm": 5.543708324432373,
822
+ "learning_rate": 0.0004502019134736622,
823
+ "loss": 5.8469,
824
+ "step": 525000
825
+ },
826
+ {
827
+ "epoch": 0.4160172937604078,
828
+ "grad_norm": 6.349376678466797,
829
+ "learning_rate": 0.00044927339954835976,
830
+ "loss": 5.8168,
831
+ "step": 530000
832
+ },
833
+ {
834
+ "epoch": 0.4199419852109777,
835
+ "grad_norm": 5.693711280822754,
836
+ "learning_rate": 0.0004483376613453707,
837
+ "loss": 5.8331,
838
+ "step": 535000
839
+ },
840
+ {
841
+ "epoch": 0.42386667666154754,
842
+ "grad_norm": 4.938353538513184,
843
+ "learning_rate": 0.0004473947436224123,
844
+ "loss": 5.8324,
845
+ "step": 540000
846
+ },
847
+ {
848
+ "epoch": 0.42779136811211743,
849
+ "grad_norm": 4.152617454528809,
850
+ "learning_rate": 0.0004464433541422548,
851
+ "loss": 5.8357,
852
+ "step": 545000
853
+ },
854
+ {
855
+ "epoch": 0.4317160595626873,
856
+ "grad_norm": 6.794117450714111,
857
+ "learning_rate": 0.0004454844692892605,
858
+ "loss": 5.8409,
859
+ "step": 550000
860
+ },
861
+ {
862
+ "epoch": 0.4317160595626873,
863
+ "eval_loss": 2.996899127960205,
864
+ "eval_runtime": 1172.5997,
865
+ "eval_samples_per_second": 184.16,
866
+ "eval_steps_per_second": 23.021,
867
+ "step": 550000
868
+ },
869
+ {
870
+ "epoch": 0.4356407510132572,
871
+ "grad_norm": 6.325488567352295,
872
+ "learning_rate": 0.0004445189016962925,
873
+ "loss": 5.8321,
874
+ "step": 555000
875
+ },
876
+ {
877
+ "epoch": 0.4395654424638271,
878
+ "grad_norm": 4.354986667633057,
879
+ "learning_rate": 0.0004435453375563152,
880
+ "loss": 5.8083,
881
+ "step": 560000
882
+ },
883
+ {
884
+ "epoch": 0.443490133914397,
885
+ "grad_norm": 6.791714191436768,
886
+ "learning_rate": 0.00044256458853620686,
887
+ "loss": 5.8181,
888
+ "step": 565000
889
+ },
890
+ {
891
+ "epoch": 0.44741482536496685,
892
+ "grad_norm": 5.404956817626953,
893
+ "learning_rate": 0.00044157629958651906,
894
+ "loss": 5.8209,
895
+ "step": 570000
896
+ },
897
+ {
898
+ "epoch": 0.45133951681553675,
899
+ "grad_norm": 6.600470066070557,
900
+ "learning_rate": 0.00044058110366587233,
901
+ "loss": 5.8144,
902
+ "step": 575000
903
+ },
904
+ {
905
+ "epoch": 0.45526420826610664,
906
+ "grad_norm": 7.110249996185303,
907
+ "learning_rate": 0.00043957844343575327,
908
+ "loss": 5.8177,
909
+ "step": 580000
910
+ },
911
+ {
912
+ "epoch": 0.45918889971667654,
913
+ "grad_norm": 4.988192081451416,
914
+ "learning_rate": 0.00043856896110892765,
915
+ "loss": 5.8048,
916
+ "step": 585000
917
+ },
918
+ {
919
+ "epoch": 0.46311359116724643,
920
+ "grad_norm": 5.140357494354248,
921
+ "learning_rate": 0.00043755168272890363,
922
+ "loss": 5.8167,
923
+ "step": 590000
924
+ },
925
+ {
926
+ "epoch": 0.46703828261781627,
927
+ "grad_norm": 7.784915447235107,
928
+ "learning_rate": 0.00043652786802263427,
929
+ "loss": 5.8068,
930
+ "step": 595000
931
+ },
932
+ {
933
+ "epoch": 0.47096297406838616,
934
+ "grad_norm": 5.686777591705322,
935
+ "learning_rate": 0.000435496326481159,
936
+ "loss": 5.8049,
937
+ "step": 600000
938
+ },
939
+ {
940
+ "epoch": 0.47096297406838616,
941
+ "eval_loss": 2.9820611476898193,
942
+ "eval_runtime": 1175.9282,
943
+ "eval_samples_per_second": 183.639,
944
+ "eval_steps_per_second": 22.955,
945
+ "step": 600000
946
+ },
947
+ {
948
+ "epoch": 0.47488766551895606,
949
+ "grad_norm": 4.776956081390381,
950
+ "learning_rate": 0.00043445771008574104,
951
+ "loss": 5.7853,
952
+ "step": 605000
953
+ },
954
+ {
955
+ "epoch": 0.47881235696952595,
956
+ "grad_norm": 5.395595073699951,
957
+ "learning_rate": 0.00043341205844934426,
958
+ "loss": 5.7943,
959
+ "step": 610000
960
+ },
961
+ {
962
+ "epoch": 0.48273704842009585,
963
+ "grad_norm": 6.324551582336426,
964
+ "learning_rate": 0.0004323598339897348,
965
+ "loss": 5.7885,
966
+ "step": 615000
967
+ },
968
+ {
969
+ "epoch": 0.48666173987066574,
970
+ "grad_norm": 8.08753776550293,
971
+ "learning_rate": 0.0004313002345566292,
972
+ "loss": 5.7802,
973
+ "step": 620000
974
+ },
975
+ {
976
+ "epoch": 0.4905864313212356,
977
+ "grad_norm": 4.667470932006836,
978
+ "learning_rate": 0.0004302343624111244,
979
+ "loss": 5.7936,
980
+ "step": 625000
981
+ },
982
+ {
983
+ "epoch": 0.4945111227718055,
984
+ "grad_norm": 4.392518043518066,
985
+ "learning_rate": 0.00042916076273439863,
986
+ "loss": 5.7818,
987
+ "step": 630000
988
+ },
989
+ {
990
+ "epoch": 0.4984358142223754,
991
+ "grad_norm": 5.52575159072876,
992
+ "learning_rate": 0.00042808076064426405,
993
+ "loss": 5.7728,
994
+ "step": 635000
995
+ },
996
+ {
997
+ "epoch": 0.5023605056729452,
998
+ "grad_norm": 5.269731521606445,
999
+ "learning_rate": 0.0004269941873694671,
1000
+ "loss": 5.7826,
1001
+ "step": 640000
1002
+ },
1003
+ {
1004
+ "epoch": 0.5062851971235152,
1005
+ "grad_norm": 4.810575008392334,
1006
+ "learning_rate": 0.00042590064944661286,
1007
+ "loss": 5.7781,
1008
+ "step": 645000
1009
+ },
1010
+ {
1011
+ "epoch": 0.510209888574085,
1012
+ "grad_norm": 4.98996114730835,
1013
+ "learning_rate": 0.0004248001845288868,
1014
+ "loss": 5.7747,
1015
+ "step": 650000
1016
+ },
1017
+ {
1018
+ "epoch": 0.510209888574085,
1019
+ "eval_loss": 2.961513042449951,
1020
+ "eval_runtime": 1127.3033,
1021
+ "eval_samples_per_second": 191.56,
1022
+ "eval_steps_per_second": 23.946,
1023
+ "step": 650000
1024
+ },
1025
+ {
1026
+ "epoch": 0.514134580024655,
1027
+ "grad_norm": 7.207222938537598,
1028
+ "learning_rate": 0.00042369371911227634,
1029
+ "loss": 5.7746,
1030
+ "step": 655000
1031
+ },
1032
+ {
1033
+ "epoch": 0.5180592714752248,
1034
+ "grad_norm": 5.494042873382568,
1035
+ "learning_rate": 0.0004225801900348928,
1036
+ "loss": 5.7699,
1037
+ "step": 660000
1038
+ },
1039
+ {
1040
+ "epoch": 0.5219839629257946,
1041
+ "grad_norm": 6.930171012878418,
1042
+ "learning_rate": 0.00042145985659294296,
1043
+ "loss": 5.7644,
1044
+ "step": 665000
1045
+ },
1046
+ {
1047
+ "epoch": 0.5259086543763646,
1048
+ "grad_norm": 4.1922760009765625,
1049
+ "learning_rate": 0.00042033320972059365,
1050
+ "loss": 5.7637,
1051
+ "step": 670000
1052
+ },
1053
+ {
1054
+ "epoch": 0.5298333458269344,
1055
+ "grad_norm": 5.89008092880249,
1056
+ "learning_rate": 0.00041920006891640475,
1057
+ "loss": 5.7521,
1058
+ "step": 675000
1059
+ },
1060
+ {
1061
+ "epoch": 0.5337580372775044,
1062
+ "grad_norm": 5.436644077301025,
1063
+ "learning_rate": 0.00041806047738122455,
1064
+ "loss": 5.7454,
1065
+ "step": 680000
1066
+ },
1067
+ {
1068
+ "epoch": 0.5376827287280742,
1069
+ "grad_norm": 6.073439121246338,
1070
+ "learning_rate": 0.00041691470849126257,
1071
+ "loss": 5.752,
1072
+ "step": 685000
1073
+ },
1074
+ {
1075
+ "epoch": 0.541607420178644,
1076
+ "grad_norm": 5.298029899597168,
1077
+ "learning_rate": 0.0004157623473475629,
1078
+ "loss": 5.748,
1079
+ "step": 690000
1080
+ },
1081
+ {
1082
+ "epoch": 0.545532111629214,
1083
+ "grad_norm": 16.55943489074707,
1084
+ "learning_rate": 0.00041460389899361316,
1085
+ "loss": 5.7357,
1086
+ "step": 695000
1087
+ },
1088
+ {
1089
+ "epoch": 0.5494568030797838,
1090
+ "grad_norm": 6.340664863586426,
1091
+ "learning_rate": 0.00041343871022983314,
1092
+ "loss": 5.7385,
1093
+ "step": 700000
1094
+ },
1095
+ {
1096
+ "epoch": 0.5494568030797838,
1097
+ "eval_loss": 2.939816951751709,
1098
+ "eval_runtime": 1153.8155,
1099
+ "eval_samples_per_second": 187.158,
1100
+ "eval_steps_per_second": 23.395,
1101
+ "step": 700000
1102
+ },
1103
+ {
1104
+ "epoch": 0.5533814945303538,
1105
+ "grad_norm": 5.492395877838135,
1106
+ "learning_rate": 0.0004122675228441709,
1107
+ "loss": 5.7247,
1108
+ "step": 705000
1109
+ },
1110
+ {
1111
+ "epoch": 0.5573061859809236,
1112
+ "grad_norm": 5.045653820037842,
1113
+ "learning_rate": 0.000411090385214718,
1114
+ "loss": 5.7244,
1115
+ "step": 710000
1116
+ },
1117
+ {
1118
+ "epoch": 0.5612308774314936,
1119
+ "grad_norm": 5.396609783172607,
1120
+ "learning_rate": 0.00040990663367439613,
1121
+ "loss": 5.7248,
1122
+ "step": 715000
1123
+ },
1124
+ {
1125
+ "epoch": 0.5651555688820634,
1126
+ "grad_norm": 5.786918640136719,
1127
+ "learning_rate": 0.0004087174991472511,
1128
+ "loss": 5.719,
1129
+ "step": 720000
1130
+ },
1131
+ {
1132
+ "epoch": 0.5690802603326333,
1133
+ "grad_norm": 6.569219589233398,
1134
+ "learning_rate": 0.0004075215988876539,
1135
+ "loss": 5.7122,
1136
+ "step": 725000
1137
+ },
1138
+ {
1139
+ "epoch": 0.5730049517832032,
1140
+ "grad_norm": 5.519715309143066,
1141
+ "learning_rate": 0.00040631993176130574,
1142
+ "loss": 5.7093,
1143
+ "step": 730000
1144
+ },
1145
+ {
1146
+ "epoch": 0.576929643233773,
1147
+ "grad_norm": 6.44736385345459,
1148
+ "learning_rate": 0.00040511206274228083,
1149
+ "loss": 5.7029,
1150
+ "step": 735000
1151
+ },
1152
+ {
1153
+ "epoch": 0.580854334684343,
1154
+ "grad_norm": 5.93750524520874,
1155
+ "learning_rate": 0.00040389852112315274,
1156
+ "loss": 5.7025,
1157
+ "step": 740000
1158
+ },
1159
+ {
1160
+ "epoch": 0.5847790261349128,
1161
+ "grad_norm": 5.719981670379639,
1162
+ "learning_rate": 0.0004026793567057317,
1163
+ "loss": 5.7028,
1164
+ "step": 745000
1165
+ },
1166
+ {
1167
+ "epoch": 0.5887037175854827,
1168
+ "grad_norm": 6.105369567871094,
1169
+ "learning_rate": 0.00040145388229039284,
1170
+ "loss": 5.6877,
1171
+ "step": 750000
1172
+ },
1173
+ {
1174
+ "epoch": 0.5887037175854827,
1175
+ "eval_loss": 2.918062925338745,
1176
+ "eval_runtime": 1152.6996,
1177
+ "eval_samples_per_second": 187.339,
1178
+ "eval_steps_per_second": 23.418,
1179
+ "step": 750000
1180
+ },
1181
+ {
1182
+ "epoch": 0.5926284090360526,
1183
+ "grad_norm": 6.412558078765869,
1184
+ "learning_rate": 0.0004002223845301598,
1185
+ "loss": 5.6934,
1186
+ "step": 755000
1187
+ },
1188
+ {
1189
+ "epoch": 0.5965531004866225,
1190
+ "grad_norm": 5.392569541931152,
1191
+ "learning_rate": 0.0003989856510076611,
1192
+ "loss": 5.6804,
1193
+ "step": 760000
1194
+ },
1195
+ {
1196
+ "epoch": 0.6004777919371924,
1197
+ "grad_norm": 4.88016939163208,
1198
+ "learning_rate": 0.0003977434865733831,
1199
+ "loss": 5.6811,
1200
+ "step": 765000
1201
+ },
1202
+ {
1203
+ "epoch": 0.6044024833877623,
1204
+ "grad_norm": 3.566276788711548,
1205
+ "learning_rate": 0.0003964951911528423,
1206
+ "loss": 5.6704,
1207
+ "step": 770000
1208
+ },
1209
+ {
1210
+ "epoch": 0.6083271748383321,
1211
+ "grad_norm": 5.245337963104248,
1212
+ "learning_rate": 0.00039524130838671694,
1213
+ "loss": 5.6752,
1214
+ "step": 775000
1215
+ },
1216
+ {
1217
+ "epoch": 0.612251866288902,
1218
+ "grad_norm": 5.569911003112793,
1219
+ "learning_rate": 0.0003939823910692354,
1220
+ "loss": 5.678,
1221
+ "step": 780000
1222
+ },
1223
+ {
1224
+ "epoch": 0.6161765577394719,
1225
+ "grad_norm": 6.871520042419434,
1226
+ "learning_rate": 0.0003927174794803744,
1227
+ "loss": 5.6667,
1228
+ "step": 785000
1229
+ },
1230
+ {
1231
+ "epoch": 0.6201012491900418,
1232
+ "grad_norm": 4.4467034339904785,
1233
+ "learning_rate": 0.0003914473792912365,
1234
+ "loss": 5.656,
1235
+ "step": 790000
1236
+ },
1237
+ {
1238
+ "epoch": 0.6240259406406117,
1239
+ "grad_norm": 7.6152448654174805,
1240
+ "learning_rate": 0.00039017188643885047,
1241
+ "loss": 5.6461,
1242
+ "step": 795000
1243
+ },
1244
+ {
1245
+ "epoch": 0.6279506320911816,
1246
+ "grad_norm": 6.26963996887207,
1247
+ "learning_rate": 0.0003888910495512182,
1248
+ "loss": 5.6608,
1249
+ "step": 800000
1250
+ },
1251
+ {
1252
+ "epoch": 0.6279506320911816,
1253
+ "eval_loss": 2.8902182579040527,
1254
+ "eval_runtime": 1155.3086,
1255
+ "eval_samples_per_second": 186.916,
1256
+ "eval_steps_per_second": 23.365,
1257
+ "step": 800000
1258
+ },
1259
+ {
1260
+ "epoch": 0.6318753235417515,
1261
+ "grad_norm": 4.018621921539307,
1262
+ "learning_rate": 0.0003876054331712927,
1263
+ "loss": 5.6525,
1264
+ "step": 805000
1265
+ },
1266
+ {
1267
+ "epoch": 0.6358000149923213,
1268
+ "grad_norm": 5.362500190734863,
1269
+ "learning_rate": 0.00038631379809951025,
1270
+ "loss": 5.636,
1271
+ "step": 810000
1272
+ },
1273
+ {
1274
+ "epoch": 0.6397247064428913,
1275
+ "grad_norm": 6.672492027282715,
1276
+ "learning_rate": 0.00038501722393710636,
1277
+ "loss": 5.6273,
1278
+ "step": 815000
1279
+ },
1280
+ {
1281
+ "epoch": 0.6436493978934611,
1282
+ "grad_norm": 4.727617263793945,
1283
+ "learning_rate": 0.0003837152413000372,
1284
+ "loss": 5.6233,
1285
+ "step": 820000
1286
+ },
1287
+ {
1288
+ "epoch": 0.647574089344031,
1289
+ "grad_norm": 6.725893974304199,
1290
+ "learning_rate": 0.0003824086826970783,
1291
+ "loss": 5.6242,
1292
+ "step": 825000
1293
+ },
1294
+ {
1295
+ "epoch": 0.6514987807946009,
1296
+ "grad_norm": 5.975217819213867,
1297
+ "learning_rate": 0.0003810976040094335,
1298
+ "loss": 5.6134,
1299
+ "step": 830000
1300
+ },
1301
+ {
1302
+ "epoch": 0.6554234722451707,
1303
+ "grad_norm": 5.095362663269043,
1304
+ "learning_rate": 0.0003797804773430985,
1305
+ "loss": 5.619,
1306
+ "step": 835000
1307
+ },
1308
+ {
1309
+ "epoch": 0.6593481636957407,
1310
+ "grad_norm": 5.389176845550537,
1311
+ "learning_rate": 0.00037845813587251276,
1312
+ "loss": 5.614,
1313
+ "step": 840000
1314
+ },
1315
+ {
1316
+ "epoch": 0.6632728551463105,
1317
+ "grad_norm": 5.1916351318359375,
1318
+ "learning_rate": 0.000377131690836747,
1319
+ "loss": 5.6095,
1320
+ "step": 845000
1321
+ },
1322
+ {
1323
+ "epoch": 0.6671975465968805,
1324
+ "grad_norm": 6.15248966217041,
1325
+ "learning_rate": 0.0003757998670619693,
1326
+ "loss": 5.6033,
1327
+ "step": 850000
1328
+ },
1329
+ {
1330
+ "epoch": 0.6671975465968805,
1331
+ "eval_loss": 2.8692898750305176,
1332
+ "eval_runtime": 1175.9151,
1333
+ "eval_samples_per_second": 183.641,
1334
+ "eval_steps_per_second": 22.956,
1335
+ "step": 850000
1336
+ },
1337
+ {
1338
+ "epoch": 0.6711222380474503,
1339
+ "grad_norm": 6.101210594177246,
1340
+ "learning_rate": 0.0003744637809722457,
1341
+ "loss": 5.6013,
1342
+ "step": 855000
1343
+ },
1344
+ {
1345
+ "epoch": 0.6750469294980201,
1346
+ "grad_norm": 6.87960958480835,
1347
+ "learning_rate": 0.0003731226828172506,
1348
+ "loss": 5.592,
1349
+ "step": 860000
1350
+ },
1351
+ {
1352
+ "epoch": 0.6789716209485901,
1353
+ "grad_norm": 5.071200370788574,
1354
+ "learning_rate": 0.0003717768906257172,
1355
+ "loss": 5.5893,
1356
+ "step": 865000
1357
+ },
1358
+ {
1359
+ "epoch": 0.6828963123991599,
1360
+ "grad_norm": 7.438723564147949,
1361
+ "learning_rate": 0.0003704264557058006,
1362
+ "loss": 5.598,
1363
+ "step": 870000
1364
+ },
1365
+ {
1366
+ "epoch": 0.6868210038497299,
1367
+ "grad_norm": 6.021530628204346,
1368
+ "learning_rate": 0.00036907170111211227,
1369
+ "loss": 5.59,
1370
+ "step": 875000
1371
+ },
1372
+ {
1373
+ "epoch": 0.6907456953002997,
1374
+ "grad_norm": 5.211944103240967,
1375
+ "learning_rate": 0.000367711863796493,
1376
+ "loss": 5.5696,
1377
+ "step": 880000
1378
+ },
1379
+ {
1380
+ "epoch": 0.6946703867508696,
1381
+ "grad_norm": 6.795500755310059,
1382
+ "learning_rate": 0.00036634863039584095,
1383
+ "loss": 5.5723,
1384
+ "step": 885000
1385
+ },
1386
+ {
1387
+ "epoch": 0.6985950782014395,
1388
+ "grad_norm": 5.494241714477539,
1389
+ "learning_rate": 0.0003649795953068187,
1390
+ "loss": 5.5547,
1391
+ "step": 890000
1392
+ },
1393
+ {
1394
+ "epoch": 0.7025197696520094,
1395
+ "grad_norm": 6.3428544998168945,
1396
+ "learning_rate": 0.00036360672336543015,
1397
+ "loss": 5.5556,
1398
+ "step": 895000
1399
+ },
1400
+ {
1401
+ "epoch": 0.7064444611025793,
1402
+ "grad_norm": 6.741039276123047,
1403
+ "learning_rate": 0.0003622292442014995,
1404
+ "loss": 5.5512,
1405
+ "step": 900000
1406
+ },
1407
+ {
1408
+ "epoch": 0.7064444611025793,
1409
+ "eval_loss": 2.840968608856201,
1410
+ "eval_runtime": 1111.2113,
1411
+ "eval_samples_per_second": 194.334,
1412
+ "eval_steps_per_second": 24.292,
1413
+ "step": 900000
1414
+ }
1415
+ ],
1416
+ "logging_steps": 5000,
1417
+ "max_steps": 2547970,
1418
+ "num_input_tokens_seen": 0,
1419
+ "num_train_epochs": 2,
1420
+ "save_steps": 100000,
1421
+ "stateful_callbacks": {
1422
+ "TrainerControl": {
1423
+ "args": {
1424
+ "should_epoch_stop": false,
1425
+ "should_evaluate": false,
1426
+ "should_log": false,
1427
+ "should_save": true,
1428
+ "should_training_stop": false
1429
+ },
1430
+ "attributes": {}
1431
+ }
1432
+ },
1433
+ "total_flos": 8.752003246057943e+18,
1434
+ "train_batch_size": 8,
1435
+ "trial_name": null,
1436
+ "trial_params": null
1437
+ }
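
trainer_state.json records the pretraining trajectory so far: 900,000 of 2,547,970 planned steps (about 0.71 of 2 epochs, train batch size 8), with the evaluation loss falling from 3.44 at step 50,000 to 2.84 at step 900,000, the current best checkpoint. A minimal sketch for pulling the evaluation-loss curve out of this file, assuming only the layout shown above:

```python
# Minimal sketch: extract the evaluation-loss curve from trainer_state.json.
# Entries with "eval_loss" are evaluation points (every 50,000 steps); the
# others are training log steps carrying loss, grad_norm and learning_rate.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]
for step, loss in evals:
    print(f"step {step:>7}: eval_loss {loss:.3f}")

print("best:", state["best_metric"], "at", state["best_model_checkpoint"])
```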
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f2cc9b694f6483da0c07d8c6c9c2da9a68a7197ae2d74b602b5b263830cfbb5f
+ size 5432
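
training_args.bin is the TrainingArguments object the Trainer pickles next to each checkpoint; it is loaded with torch, so a compatible transformers version must be installed. A hedged sketch for inspecting a downloaded copy (the attribute names are standard TrainingArguments fields, not values confirmed by this commit):

```python
# Minimal sketch: inspect the pickled TrainingArguments. The attribute names
# are standard TrainingArguments fields and are assumptions here, not values
# read from this commit.
import torch

args = torch.load("training_args.bin", weights_only=False)
print(type(args).__name__)
print("learning_rate:", args.learning_rate)
print("per_device_train_batch_size:", args.per_device_train_batch_size)
print("max_steps:", args.max_steps)
```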