[2025-10-10 13:10:41,462] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:24741] baseline 0.000GB (+0.000GB allocated, +0.002GB reserved)
[2025-10-10 13:10:41,462] [INFO] [axolotl.cli.config.load_cfg:248] [PID:24741] config:
{
  "activation_offloading": false,
  "adapter": "qlora",
  "axolotl_config_path": "config.yaml",
  "base_model": "nferruz/ProtGPT2",
  "base_model_config": "nferruz/ProtGPT2",
  "batch_size": 2,
  "bf16": false,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_75",
    "fp8": false,
    "n_gpu": 1,
    "n_node": 1
  },
  "context_parallel_size": 1,
  "dataloader_num_workers": 1,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_processes": 2,
  "datasets": [
    {
      "ds_type": "json",
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "/content/sequences_tokenized.jsonl",
      "trust_remote_code": false
    }
  ],
  "ddp": false,
  "device": "cuda:0",
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "env_capabilities": {
    "torch_version": "2.8.0"
  },
  "eval_batch_size": 2,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_sample_packing": true,
  "eval_steps": 0.01,
  "eval_table_size": 0,
  "experimental_skip_move_to_device": true,
  "fp16": true,
  "gradient_accumulation_steps": 1,
  "gradient_checkpointing": true,
  "gradient_checkpointing_kwargs": {
    "use_reentrant": true
  },
  "group_by_length": false,
  "hub_model_id": "ProtGPT2-Oxido",
  "include_tkps": true,
  "is_falcon_derived_model": false,
  "is_llama_derived_model": false,
  "is_mistral_derived_model": false,
  "learning_rate": 0.002,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 100,
  "lora_alpha": 16,
  "lora_dropout": 0.05,
  "lora_r": 32,
  "lora_target_linear": true,
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "cosine",
  "mean_resizing_embeddings": false,
  "micro_batch_size": 2,
  "model_config_type": "gpt2",
  "num_epochs": 3.0,
  "optimizer": "paged_adamw_32bit",
  "output_dir": "./qlora-out",
  "pad_to_sequence_len": true,
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": true,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_strategy": "epoch",
  "sequence_len": 2048,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "special_tokens": {
    "eos_token": "<|endoftext|>",
    "pad_token": "<|endoftext|>"
  },
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tf32": false,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "nferruz/ProtGPT2",
  "tokenizer_save_jinja_files": true,
  "tokenizer_type": "AutoTokenizer",
  "torch_dtype": "torch.float16",
  "train_on_inputs": false,
  "trl": {
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "scale_rewards": true,
    "sync_ref_model": false,
    "use_vllm": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "type_of_model": "AutoModelForCausalLM",
  "use_ray": false,
  "val_set_size": 0.02,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "warmup_steps": 100,
  "weight_decay": 0.0,
  "world_size": 1
}
[2025-10-10 13:10:42,465] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:24741] EOS: 0 / <|endoftext|>
[2025-10-10 13:10:42,465] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:24741] BOS: 0 / <|endoftext|>
[2025-10-10 13:10:42,465] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:24741] PAD: 0 / <|endoftext|>
[2025-10-10 13:10:42,465] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:24741] UNK: 0 / <|endoftext|>
[2025-10-10 13:10:42,465] [INFO] [axolotl.loaders.tokenizer.load_tokenizer:295] [PID:24741] No Chat template selected. Consider adding a chat template for easier inference.
[2025-10-10 13:10:42,466] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:476] [PID:24741] Unable to find prepared dataset in last_run_prepared/120d8e2ed44f3c537dc9a20773f86561
[2025-10-10 13:10:42,466] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:24741] Loading raw datasets...
[2025-10-10 13:10:42,466] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:24741] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
[2025-10-10 13:10:42,802] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:24741] Loading dataset: /content/sequences_tokenized.jsonl with base_type: None and prompt_style: None
[2025-10-10 13:10:42,820] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:218] [PID:24741] min_input_len: 6
[2025-10-10 13:10:42,821] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:220] [PID:24741] max_input_len: 512
Dropping Long Sequences (>2048) (num_proc=2):   0% 0/6304 [00:00<?, ? examples/s]Dropping Long Sequences (>2048) (num_proc=2):  16% 1000/6304 [00:00<00:01, 2728.50 examples/s]Dropping Long Sequences (>2048) (num_proc=2):  48% 3000/6304 [00:00<00:00, 5756.81 examples/s]Dropping Long Sequences (>2048) (num_proc=2):  79% 5000/6304 [00:00<00:00, 7979.32 examples/s]Dropping Long Sequences (>2048) (num_proc=2): 100% 6304/6304 [00:00<00:00, 6859.52 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=2):   0% 0/6304 [00:00<?, ? examples/s]Drop Samples with Zero Trainable Tokens (num_proc=2):  16% 1000/6304 [00:00<00:01, 3093.94 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=2):  48% 3000/6304 [00:00<00:00, 7754.71 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=2):  79% 5000/6304 [00:00<00:00, 11055.90 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=2): 100% 6304/6304 [00:00<00:00, 8853.66 examples/s] 
Add position_id column (Sample Packing) (num_proc=2):   0% 0/6304 [00:00<?, ? examples/s]Add position_id column (Sample Packing) (num_proc=2):  16% 1000/6304 [00:00<00:01, 3374.43 examples/s]Add position_id column (Sample Packing) (num_proc=2):  48% 3000/6304 [00:00<00:00, 6482.08 examples/s]Add position_id column (Sample Packing) (num_proc=2):  79% 5000/6304 [00:00<00:00, 7737.71 examples/s]Add position_id column (Sample Packing) (num_proc=2): 100% 6304/6304 [00:00<00:00, 6958.78 examples/s]
Saving the dataset (0/2 shards):   0% 0/6304 [00:00<?, ? examples/s]Saving the dataset (0/2 shards):  50% 3152/6304 [00:00<00:00, 29151.65 examples/s]Saving the dataset (1/2 shards): 100% 6304/6304 [00:00<00:00, 29151.65 examples/s]Saving the dataset (2/2 shards): 100% 6304/6304 [00:00<00:00, 29151.65 examples/s]Saving the dataset (2/2 shards): 100% 6304/6304 [00:00<00:00, 33750.47 examples/s]
[2025-10-10 13:10:45,620] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:24741] total_num_tokens: 16_570
[2025-10-10 13:10:45,622] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:24741] `total_supervised_tokens: 16_570`
[2025-10-10 13:10:48,083] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 0.9218025207519531
[2025-10-10 13:10:49,020] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 0.9363107681274414
[2025-10-10 13:10:49,934] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 0.9140019416809082
[2025-10-10 13:10:50,862] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 0.9271283149719238
[2025-10-10 13:10:50,882] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]
[2025-10-10 13:10:50,882] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:24741] data_loader_len: 4
[2025-10-10 13:10:50,883] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:24741] sample_packing_eff_est across ranks: [0.8989800347222222]
[2025-10-10 13:10:50,883] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:24741] sample_packing_eff_est: None
[2025-10-10 13:10:50,883] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:24741] total_num_steps: 12
[2025-10-10 13:10:50,893] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:24741] total_num_tokens: 746_874
[2025-10-10 13:10:50,932] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:24741] `total_supervised_tokens: 746_874`
[2025-10-10 13:10:52,871] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 0.9358129501342773
[2025-10-10 13:10:53,781] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 0.9106135368347168
[2025-10-10 13:10:55,014] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2319858074188232
[2025-10-10 13:10:56,287] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2722358703613281
[2025-10-10 13:10:56,287] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [183]
[2025-10-10 13:10:56,287] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:24741] data_loader_len: 183
[2025-10-10 13:10:56,287] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:24741] sample_packing_eff_est across ranks: [0.9936909272820164]
[2025-10-10 13:10:56,287] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:24741] sample_packing_eff_est: 1.0
[2025-10-10 13:10:56,287] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:24741] total_num_steps: 549
[2025-10-10 13:10:56,287] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:24741] Maximum number of steps set at 549
[2025-10-10 13:10:56,297] [DEBUG] [axolotl.train.setup_model_and_tokenizer:65] [PID:24741] Loading tokenizer... nferruz/ProtGPT2
[2025-10-10 13:10:57,214] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:24741] EOS: 0 / <|endoftext|>
[2025-10-10 13:10:57,214] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:24741] BOS: 0 / <|endoftext|>
[2025-10-10 13:10:57,214] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:24741] PAD: 0 / <|endoftext|>
[2025-10-10 13:10:57,214] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:24741] UNK: 0 / <|endoftext|>
[2025-10-10 13:10:57,214] [INFO] [axolotl.loaders.tokenizer.load_tokenizer:295] [PID:24741] No Chat template selected. Consider adding a chat template for easier inference.
[2025-10-10 13:10:57,215] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:24741] Loading model
[2025-10-10 13:10:57,333] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:24741] Patched Trainer.evaluation_loop with nanmean loss calculation
[2025-10-10 13:10:57,334] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:24741] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2025-10-10 13:10:57,335] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:24741] Applying multipack dataloader patch for sample packing...
[2025-10-10 13:11:27,461] [WARNING] [axolotl.loaders.model._adjust_model_config:273] [PID:24741] increasing model.config.max_position_embeddings from 1024 to 2048
[2025-10-10 13:11:27,467] [INFO] [axolotl.loaders.model._prepare_model_for_quantization:863] [PID:24741] converting PEFT model w/ prepare_model_for_kbit_training
[2025-10-10 13:11:27,479] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:24741] Converting modules to torch.float16
[2025-10-10 13:11:27,481] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:24741] Memory usage after model load 0.849GB (+0.849GB allocated, +0.918GB reserved)
[2025-10-10 13:11:27,482] [INFO] [axolotl.loaders.adapter.load_lora:80] [PID:24741] found linear modules: ['c_attn', 'c_fc', 'c_proj']
trainable params: 23,592,960 || all params: 797,623,040 || trainable%: 2.9579
[2025-10-10 13:11:27,888] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:24741] after adapters 0.618GB (+0.618GB allocated, +1.012GB reserved)
[2025-10-10 13:11:39,738] [INFO] [axolotl.train.save_initial_configs:398] [PID:24741] Pre-saving adapter config to ./qlora-out...
[2025-10-10 13:11:39,738] [INFO] [axolotl.train.save_initial_configs:402] [PID:24741] Pre-saving tokenizer to ./qlora-out...
[2025-10-10 13:11:39,828] [INFO] [axolotl.train.save_initial_configs:407] [PID:24741] Pre-saving model config to ./qlora-out...
[2025-10-10 13:11:39,836] [INFO] [axolotl.train.execute_training:196] [PID:24741] Starting trainer...
[2025-10-10 13:11:45,415] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.8461699485778809
[2025-10-10 13:11:47,771] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 2.3554582595825195
[2025-10-10 13:11:49,329] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.5573019981384277
[2025-10-10 13:11:51,006] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.6764421463012695
[2025-10-10 13:11:51,006] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [183]
  0% 0/549 [00:00<?, ?it/s][2025-10-10 13:11:51,161] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:11:54,146] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2840440273284912
[2025-10-10 13:11:55,365] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.218794584274292
[2025-10-10 13:11:56,599] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2343621253967285
[2025-10-10 13:11:58,099] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.4991233348846436
[2025-10-10 13:11:58,099] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:00<00:00,  3.33it/s][A
 75% 3/4 [00:01<00:00,  1.68it/s][A
100% 4/4 [00:02<00:00,  1.30it/s][A                           
                                 [A{'eval_loss': 8.67106819152832, 'eval_runtime': 7.1874, 'eval_samples_per_second': 17.67, 'eval_steps_per_second': 8.904, 'memory/max_active (GiB)': 2.61, 'memory/max_allocated (GiB)': 2.61, 'memory/device_reserved (GiB)': 3.21, 'epoch': 0}
  0% 0/549 [00:14<?, ?it/s]
100% 4/4 [00:03<00:00,  1.30it/s][A
                                 [A  0% 1/549 [00:21<3:16:49, 21.55s/it]  0% 2/549 [00:25<1:40:35, 11.03s/it]  1% 3/549 [00:28<1:09:56,  7.69s/it]  1% 4/549 [00:32<55:34,  6.12s/it]    1% 5/549 [00:36<47:37,  5.25s/it]  1% 6/549 [00:40<42:50,  4.73s/it][2025-10-10 13:12:31,245] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:12:33,921] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.245626449584961
[2025-10-10 13:12:35,290] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3684089183807373
[2025-10-10 13:12:37,029] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.738837718963623
[2025-10-10 13:12:38,274] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2442030906677246
[2025-10-10 13:12:38,274] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.90it/s][A
 75% 3/4 [00:02<00:00,  1.36it/s][A
100% 4/4 [00:03<00:00,  1.16it/s][A                                   
                                 [A{'eval_loss': 8.665609359741211, 'eval_runtime': 5.6196, 'eval_samples_per_second': 22.599, 'eval_steps_per_second': 11.389, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.03}
  1% 6/549 [00:52<42:50,  4.73s/it]
100% 4/4 [00:03<00:00,  1.16it/s][A
                                 [A  1% 7/549 [00:56<1:17:17,  8.56s/it]  1% 8/549 [01:00<1:03:22,  7.03s/it]  2% 9/549 [01:04<54:04,  6.01s/it]    2% 10/549 [01:07<47:47,  5.32s/it]  2% 11/549 [01:11<43:30,  4.85s/it]  2% 12/549 [01:15<40:33,  4.53s/it][2025-10-10 13:13:06,562] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:13:09,050] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2255823612213135
[2025-10-10 13:13:10,291] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.240588665008545
[2025-10-10 13:13:11,581] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2900962829589844
[2025-10-10 13:13:13,335] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.753103256225586
[2025-10-10 13:13:13,335] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.86it/s][A
 75% 3/4 [00:02<00:00,  1.34it/s][A
100% 4/4 [00:03<00:00,  1.14it/s][A                                    
                                 [A{'eval_loss': 8.614709854125977, 'eval_runtime': 5.8541, 'eval_samples_per_second': 21.694, 'eval_steps_per_second': 10.932, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.07}
  2% 12/549 [01:28<40:33,  4.53s/it]
100% 4/4 [00:03<00:00,  1.14it/s][A
                                 [A  2% 13/549 [01:31<1:12:45,  8.14s/it]  3% 14/549 [01:35<1:00:59,  6.84s/it]  3% 15/549 [01:39<52:49,  5.93s/it]    3% 16/549 [01:43<47:08,  5.31s/it]  3% 17/549 [01:47<43:11,  4.87s/it]  3% 18/549 [01:51<40:28,  4.57s/it][2025-10-10 13:13:42,268] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:13:44,777] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.233976125717163
[2025-10-10 13:13:45,991] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2140934467315674
[2025-10-10 13:13:47,230] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2387192249298096
[2025-10-10 13:13:48,517] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2868869304656982
[2025-10-10 13:13:48,517] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.83it/s][A
 75% 3/4 [00:02<00:00,  1.32it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                    
                                 [A{'eval_loss': 8.517555236816406, 'eval_runtime': 6.4388, 'eval_samples_per_second': 19.724, 'eval_steps_per_second': 9.94, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.1}
  3% 18/549 [02:03<40:28,  4.57s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A  3% 19/549 [02:07<1:12:15,  8.18s/it]  4% 20/549 [02:11<1:00:54,  6.91s/it]  4% 21/549 [02:15<52:54,  6.01s/it]    4% 22/549 [02:19<47:19,  5.39s/it]  4% 23/549 [02:23<43:26,  4.96s/it]  4% 24/549 [02:27<40:38,  4.64s/it][2025-10-10 13:14:18,514] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:14:20,985] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2237637042999268
[2025-10-10 13:14:22,224] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2387983798980713
[2025-10-10 13:14:23,447] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2227163314819336
[2025-10-10 13:14:24,682] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2346465587615967
[2025-10-10 13:14:24,682] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                    
                                 [A{'eval_loss': 8.410308837890625, 'eval_runtime': 6.0594, 'eval_samples_per_second': 20.959, 'eval_steps_per_second': 10.562, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.13}
  4% 24/549 [02:39<40:38,  4.64s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A  5% 25/549 [02:43<1:10:34,  8.08s/it]  5% 26/549 [02:47<59:22,  6.81s/it]    5% 27/549 [02:51<51:30,  5.92s/it]  5% 28/549 [02:54<45:59,  5.30s/it]  5% 29/549 [02:58<42:11,  4.87s/it]  5% 30/549 [03:02<39:29,  4.57s/it][2025-10-10 13:14:53,875] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:14:56,364] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.229734182357788
[2025-10-10 13:14:57,586] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2214250564575195
[2025-10-10 13:14:58,806] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.219433307647705
[2025-10-10 13:15:00,136] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.329803705215454
[2025-10-10 13:15:00,136] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.32it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                    
                                 [A{'eval_loss': 8.288145065307617, 'eval_runtime': 5.7346, 'eval_samples_per_second': 22.146, 'eval_steps_per_second': 11.16, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.16}
  5% 30/549 [03:14<39:29,  4.57s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A  6% 31/549 [03:18<1:08:40,  7.95s/it]  6% 32/549 [03:22<57:59,  6.73s/it]    6% 33/549 [03:26<50:32,  5.88s/it]  6% 34/549 [03:30<45:20,  5.28s/it]  6% 35/549 [03:34<41:45,  4.87s/it]  7% 36/549 [03:38<39:13,  4.59s/it][2025-10-10 13:15:29,235] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:15:31,722] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.212110996246338
[2025-10-10 13:15:32,954] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.231447458267212
[2025-10-10 13:15:34,205] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2508065700531006
[2025-10-10 13:15:35,438] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2330925464630127
[2025-10-10 13:15:35,439] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.84it/s][A
 75% 3/4 [00:02<00:00,  1.32it/s][A
100% 4/4 [00:03<00:00,  1.11it/s][A                                    
                                 [A{'eval_loss': 8.149022102355957, 'eval_runtime': 5.7853, 'eval_samples_per_second': 21.952, 'eval_steps_per_second': 11.062, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.2}
  7% 36/549 [03:50<39:13,  4.59s/it]
100% 4/4 [00:03<00:00,  1.11it/s][A
                                 [A  7% 37/549 [03:53<1:08:00,  7.97s/it]  7% 38/549 [03:57<57:23,  6.74s/it]    7% 39/549 [04:01<50:02,  5.89s/it]  7% 40/549 [04:05<44:51,  5.29s/it]  7% 41/549 [04:09<41:10,  4.86s/it]  8% 42/549 [04:13<38:40,  4.58s/it][2025-10-10 13:16:04,533] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:16:07,240] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2179176807403564
[2025-10-10 13:16:08,479] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.239612340927124
[2025-10-10 13:16:09,694] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2147700786590576
[2025-10-10 13:16:10,944] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2497029304504395
[2025-10-10 13:16:10,945] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.11it/s][A                                    
                                 [A{'eval_loss': 7.923487663269043, 'eval_runtime': 5.7808, 'eval_samples_per_second': 21.969, 'eval_steps_per_second': 11.071, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.23}
  8% 42/549 [04:25<38:40,  4.58s/it]
100% 4/4 [00:03<00:00,  1.11it/s][A
                                 [A  8% 43/549 [04:29<1:07:37,  8.02s/it]  8% 44/549 [04:33<56:57,  6.77s/it]    8% 45/549 [04:37<49:35,  5.90s/it]  8% 46/549 [04:41<44:25,  5.30s/it]  9% 47/549 [04:44<40:49,  4.88s/it]  9% 48/549 [04:48<38:22,  4.60s/it][2025-10-10 13:16:40,037] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:16:43,053] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2188963890075684
[2025-10-10 13:16:44,293] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2395522594451904
[2025-10-10 13:16:45,513] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.21950364112854
[2025-10-10 13:16:46,763] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2497367858886719
[2025-10-10 13:16:46,763] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.84it/s][A
 75% 3/4 [00:02<00:00,  1.32it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                    
                                 [A{'eval_loss': 7.707249641418457, 'eval_runtime': 5.7224, 'eval_samples_per_second': 22.194, 'eval_steps_per_second': 11.184, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.26}
  9% 48/549 [05:01<38:22,  4.60s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A  9% 49/549 [05:05<1:07:36,  8.11s/it]  9% 50/549 [05:09<56:51,  6.84s/it]    9% 51/549 [05:12<49:28,  5.96s/it]  9% 52/549 [05:16<44:15,  5.34s/it] 10% 53/549 [05:20<40:31,  4.90s/it] 10% 54/549 [05:24<36:22,  4.41s/it][2025-10-10 13:17:15,168] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:17:18,275] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.8323063850402832
[2025-10-10 13:17:19,609] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3336091041564941
[2025-10-10 13:17:20,827] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2181472778320312
[2025-10-10 13:17:22,034] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2064990997314453
[2025-10-10 13:17:22,034] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                    
                                 [A{'eval_loss': 7.54141092300415, 'eval_runtime': 5.7423, 'eval_samples_per_second': 22.117, 'eval_steps_per_second': 11.145, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.3}
 10% 54/549 [05:36<36:22,  4.41s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 10% 55/549 [05:40<1:06:03,  8.02s/it] 10% 56/549 [05:44<55:34,  6.76s/it]   10% 57/549 [05:48<48:18,  5.89s/it] 11% 58/549 [05:52<43:15,  5.29s/it] 11% 59/549 [05:55<39:46,  4.87s/it] 11% 60/549 [05:59<37:18,  4.58s/it][2025-10-10 13:17:50,975] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:17:53,534] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2939870357513428
[2025-10-10 13:17:55,286] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.7515311241149902
[2025-10-10 13:17:56,595] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3087666034698486
[2025-10-10 13:17:58,392] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.7966506481170654
[2025-10-10 13:17:58,392] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                    
                                 [A{'eval_loss': 7.278426647186279, 'eval_runtime': 6.4276, 'eval_samples_per_second': 19.758, 'eval_steps_per_second': 9.957, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.33}
 11% 60/549 [06:13<37:18,  4.58s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 11% 61/549 [06:17<1:09:20,  8.52s/it] 11% 62/549 [06:21<57:52,  7.13s/it]   11% 63/549 [06:25<49:53,  6.16s/it] 12% 64/549 [06:29<44:24,  5.49s/it] 12% 65/549 [06:33<40:32,  5.03s/it] 12% 66/549 [06:37<37:50,  4.70s/it][2025-10-10 13:18:28,296] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:18:33,461] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 2.976652145385742
[2025-10-10 13:18:35,243] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.7817871570587158
[2025-10-10 13:18:36,478] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2347698211669922
[2025-10-10 13:18:37,729] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2507171630859375
[2025-10-10 13:18:37,729] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.80it/s][A
 75% 3/4 [00:02<00:00,  1.31it/s][A
100% 4/4 [00:03<00:00,  1.11it/s][A                                    
                                 [A{'eval_loss': 7.130417823791504, 'eval_runtime': 6.5516, 'eval_samples_per_second': 19.385, 'eval_steps_per_second': 9.769, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.36}
 12% 66/549 [06:53<37:50,  4.70s/it]
100% 4/4 [00:03<00:00,  1.11it/s][A
                                 [A 12% 67/549 [06:57<1:14:42,  9.30s/it] 12% 68/549 [07:01<1:01:33,  7.68s/it] 13% 69/549 [07:04<52:22,  6.55s/it]   13% 70/549 [07:08<45:55,  5.75s/it] 13% 71/549 [07:12<41:24,  5.20s/it] 13% 72/549 [07:16<38:15,  4.81s/it][2025-10-10 13:19:07,848] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:19:10,905] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.743417501449585
[2025-10-10 13:19:12,274] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3681507110595703
[2025-10-10 13:19:13,507] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.232816457748413
[2025-10-10 13:19:14,734] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2264103889465332
[2025-10-10 13:19:14,734] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.84it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                    
                                 [A{'eval_loss': 6.794009685516357, 'eval_runtime': 5.7504, 'eval_samples_per_second': 22.086, 'eval_steps_per_second': 11.13, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.39}
 13% 72/549 [07:29<38:15,  4.81s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 13% 73/549 [07:33<1:05:59,  8.32s/it] 13% 74/549 [07:36<53:50,  6.80s/it]   14% 75/549 [07:40<46:47,  5.92s/it] 14% 76/549 [07:44<41:57,  5.32s/it] 14% 77/549 [07:48<38:32,  4.90s/it] 14% 78/549 [07:52<36:06,  4.60s/it][2025-10-10 13:19:43,212] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:19:45,690] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2276828289031982
[2025-10-10 13:19:47,405] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.7151846885681152
[2025-10-10 13:19:48,875] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.4696624279022217
[2025-10-10 13:19:50,118] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.24239182472229
[2025-10-10 13:19:50,118] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                    
                                 [A{'eval_loss': 6.708348751068115, 'eval_runtime': 5.7043, 'eval_samples_per_second': 22.264, 'eval_steps_per_second': 11.22, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.43}
 14% 78/549 [08:04<36:06,  4.60s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 14% 79/549 [08:08<1:03:59,  8.17s/it] 15% 80/549 [08:12<53:44,  6.88s/it]   15% 81/549 [08:16<46:34,  5.97s/it] 15% 82/549 [08:20<41:37,  5.35s/it] 15% 83/549 [08:24<38:10,  4.91s/it] 15% 84/549 [08:27<35:43,  4.61s/it][2025-10-10 13:20:19,124] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:20:21,612] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2227427959442139
[2025-10-10 13:20:22,917] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3052830696105957
[2025-10-10 13:20:24,696] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.7787699699401855
[2025-10-10 13:20:25,947] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2503879070281982
[2025-10-10 13:20:25,947] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                    
                                 [A{'eval_loss': 6.766955852508545, 'eval_runtime': 5.7386, 'eval_samples_per_second': 22.131, 'eval_steps_per_second': 11.152, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.46}
 15% 84/549 [08:40<35:43,  4.61s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 15% 85/549 [08:44<1:03:06,  8.16s/it] 16% 86/549 [08:48<53:01,  6.87s/it]   16% 87/549 [08:52<45:57,  5.97s/it] 16% 88/549 [08:56<41:08,  5.35s/it] 16% 89/549 [08:59<37:43,  4.92s/it] 16% 90/549 [09:03<35:17,  4.61s/it][2025-10-10 13:20:55,024] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:20:57,527] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2476911544799805
[2025-10-10 13:20:58,765] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2383363246917725
[2025-10-10 13:21:00,176] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.4102411270141602
[2025-10-10 13:21:01,923] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.7466182708740234
[2025-10-10 13:21:01,923] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                    
                                 [A{'eval_loss': 6.3829665184021, 'eval_runtime': 5.7206, 'eval_samples_per_second': 22.2, 'eval_steps_per_second': 11.188, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.49}
 16% 90/549 [09:16<35:17,  4.61s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 17% 91/549 [09:20<1:02:22,  8.17s/it] 17% 92/549 [09:24<52:22,  6.88s/it]   17% 93/549 [09:28<45:23,  5.97s/it] 17% 94/549 [09:31<40:32,  5.35s/it] 17% 95/549 [09:35<37:09,  4.91s/it] 17% 96/549 [09:39<34:46,  4.61s/it][2025-10-10 13:21:30,890] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:21:33,421] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2245755195617676
[2025-10-10 13:21:34,670] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2479896545410156
[2025-10-10 13:21:35,919] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2494275569915771
[2025-10-10 13:21:37,555] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.6355178356170654
[2025-10-10 13:21:37,555] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                    
                                 [A{'eval_loss': 6.172480583190918, 'eval_runtime': 6.1695, 'eval_samples_per_second': 20.585, 'eval_steps_per_second': 10.374, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.52}
 17% 96/549 [09:52<34:46,  4.61s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 18% 97/549 [09:56<1:02:02,  8.24s/it] 18% 98/549 [10:00<52:01,  6.92s/it]   18% 99/549 [10:04<45:02,  6.00s/it] 18% 100/549 [10:08<40:10,  5.37s/it]                                     {'loss': 7.8585, 'grad_norm': 1.3858423233032227, 'learning_rate': 0.00192, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'tokens_per_second_per_gpu': 42.0, 'epoch': 0.55}
 18% 100/549 [10:08<40:10,  5.37s/it] 18% 101/549 [10:12<36:58,  4.95s/it] 19% 102/549 [10:15<34:32,  4.64s/it][2025-10-10 13:22:07,077] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:22:09,610] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.245088815689087
[2025-10-10 13:22:10,855] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2448465824127197
[2025-10-10 13:22:12,106] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2508001327514648
[2025-10-10 13:22:13,381] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2749567031860352
[2025-10-10 13:22:13,382] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 5.937534809112549, 'eval_runtime': 6.3177, 'eval_samples_per_second': 20.102, 'eval_steps_per_second': 10.13, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.56}
 19% 102/549 [10:28<34:32,  4.64s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 19% 103/549 [10:32<1:00:52,  8.19s/it] 19% 104/549 [10:36<51:10,  6.90s/it]   19% 105/549 [10:40<44:23,  6.00s/it] 19% 106/549 [10:44<39:36,  5.36s/it] 19% 107/549 [10:48<36:21,  4.94s/it] 20% 108/549 [10:51<34:00,  4.63s/it][2025-10-10 13:22:43,069] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:22:45,569] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.246150016784668
[2025-10-10 13:22:46,822] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2529652118682861
[2025-10-10 13:22:48,105] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2827684879302979
[2025-10-10 13:22:49,347] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2417457103729248
[2025-10-10 13:22:49,347] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 5.884361267089844, 'eval_runtime': 5.9745, 'eval_samples_per_second': 21.257, 'eval_steps_per_second': 10.712, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.59}
 20% 108/549 [11:04<34:00,  4.63s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 20% 109/549 [11:08<59:10,  8.07s/it] 20% 110/549 [11:11<49:49,  6.81s/it] 20% 111/549 [11:15<43:16,  5.93s/it] 20% 112/549 [11:19<38:43,  5.32s/it] 21% 113/549 [11:23<35:29,  4.89s/it] 21% 114/549 [11:27<33:16,  4.59s/it][2025-10-10 13:23:18,582] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:23:21,111] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2601120471954346
[2025-10-10 13:23:22,364] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2526865005493164
[2025-10-10 13:23:23,588] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.223961591720581
[2025-10-10 13:23:24,824] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2359259128570557
[2025-10-10 13:23:24,824] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.32it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 6.100166320800781, 'eval_runtime': 5.769, 'eval_samples_per_second': 22.014, 'eval_steps_per_second': 11.094, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.62}
 21% 114/549 [11:39<33:16,  4.59s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 21% 115/549 [11:43<57:40,  7.97s/it] 21% 116/549 [11:47<48:37,  6.74s/it] 21% 117/549 [11:51<42:19,  5.88s/it] 21% 118/549 [11:54<37:57,  5.28s/it] 22% 119/549 [11:58<34:52,  4.87s/it] 22% 120/549 [12:02<32:40,  4.57s/it][2025-10-10 13:23:53,849] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:23:56,384] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2743051052093506
[2025-10-10 13:23:57,627] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2425775527954102
[2025-10-10 13:23:58,872] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2450978755950928
[2025-10-10 13:24:00,110] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2371137142181396
[2025-10-10 13:24:00,110] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.11it/s][A                                     
                                 [A{'eval_loss': 5.836405277252197, 'eval_runtime': 5.7822, 'eval_samples_per_second': 21.964, 'eval_steps_per_second': 11.069, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.66}
 22% 120/549 [12:14<32:40,  4.57s/it]
100% 4/4 [00:03<00:00,  1.11it/s][A
                                 [A 22% 121/549 [12:18<56:50,  7.97s/it] 22% 122/549 [12:22<47:53,  6.73s/it] 22% 123/549 [12:26<41:44,  5.88s/it] 23% 124/549 [12:30<37:24,  5.28s/it] 23% 125/549 [12:34<34:22,  4.86s/it] 23% 126/549 [12:38<32:18,  4.58s/it][2025-10-10 13:24:29,185] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:24:32,019] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2589247226715088
[2025-10-10 13:24:33,287] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2684495449066162
[2025-10-10 13:24:34,521] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2337257862091064
[2025-10-10 13:24:35,751] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2298574447631836
[2025-10-10 13:24:35,752] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.11it/s][A                                     
                                 [A{'eval_loss': 5.894244194030762, 'eval_runtime': 5.7903, 'eval_samples_per_second': 21.933, 'eval_steps_per_second': 11.053, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.69}
 23% 126/549 [12:50<32:18,  4.58s/it]
100% 4/4 [00:03<00:00,  1.11it/s][A
                                 [A 23% 127/549 [12:54<56:47,  8.08s/it] 23% 128/549 [12:58<47:47,  6.81s/it] 23% 129/549 [13:02<41:35,  5.94s/it] 24% 130/549 [13:05<37:15,  5.33s/it] 24% 131/549 [13:09<34:11,  4.91s/it] 24% 132/549 [13:13<32:04,  4.61s/it][2025-10-10 13:25:04,943] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:25:08,050] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2962446212768555
[2025-10-10 13:25:09,299] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2489659786224365
[2025-10-10 13:25:10,529] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2295067310333252
[2025-10-10 13:25:11,775] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2457163333892822
[2025-10-10 13:25:11,775] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 5.814137935638428, 'eval_runtime': 5.7262, 'eval_samples_per_second': 22.179, 'eval_steps_per_second': 11.177, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.72}
 24% 132/549 [13:26<32:04,  4.61s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 24% 133/549 [13:30<56:33,  8.16s/it] 24% 134/549 [13:34<47:29,  6.87s/it] 25% 135/549 [13:37<41:08,  5.96s/it] 25% 136/549 [13:41<36:43,  5.34s/it] 25% 137/549 [13:45<33:38,  4.90s/it] 25% 138/549 [13:49<31:27,  4.59s/it][2025-10-10 13:25:40,704] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:25:43,878] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.726452112197876
[2025-10-10 13:25:45,124] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.245727777481079
[2025-10-10 13:25:46,372] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2481646537780762
[2025-10-10 13:25:47,600] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2278366088867188
[2025-10-10 13:25:47,601] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 5.674722671508789, 'eval_runtime': 5.7279, 'eval_samples_per_second': 22.172, 'eval_steps_per_second': 11.173, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.75}
 25% 138/549 [14:02<31:27,  4.59s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 25% 139/549 [14:06<55:44,  8.16s/it] 26% 140/549 [14:09<46:48,  6.87s/it] 26% 141/549 [14:13<40:35,  5.97s/it] 26% 142/549 [14:17<36:14,  5.34s/it] 26% 143/549 [14:21<33:13,  4.91s/it] 26% 144/549 [14:25<31:06,  4.61s/it][2025-10-10 13:26:16,597] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:26:19,460] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.5904531478881836
[2025-10-10 13:26:21,049] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.5887093544006348
[2025-10-10 13:26:22,299] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2505967617034912
[2025-10-10 13:26:23,529] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.229478120803833
[2025-10-10 13:26:23,529] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 5.685309410095215, 'eval_runtime': 5.7564, 'eval_samples_per_second': 22.062, 'eval_steps_per_second': 11.118, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.79}
 26% 144/549 [14:38<31:06,  4.61s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 26% 145/549 [14:41<55:08,  8.19s/it] 27% 146/549 [14:45<46:16,  6.89s/it] 27% 147/549 [14:49<40:05,  5.98s/it] 27% 148/549 [14:53<35:50,  5.36s/it] 27% 149/549 [14:57<32:50,  4.93s/it] 27% 150/549 [15:01<30:40,  4.61s/it][2025-10-10 13:26:52,575] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:26:55,133] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.290666103363037
[2025-10-10 13:26:56,896] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.7622182369232178
[2025-10-10 13:26:58,238] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3421812057495117
[2025-10-10 13:26:59,467] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2286624908447266
[2025-10-10 13:26:59,467] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 5.722118377685547, 'eval_runtime': 5.7291, 'eval_samples_per_second': 22.167, 'eval_steps_per_second': 11.171, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.82}
 27% 150/549 [15:14<30:40,  4.61s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 28% 151/549 [15:17<54:17,  8.18s/it] 28% 152/549 [15:21<45:33,  6.89s/it] 28% 153/549 [15:25<39:27,  5.98s/it] 28% 154/549 [15:29<35:14,  5.35s/it] 28% 155/549 [15:33<32:17,  4.92s/it] 28% 156/549 [15:36<28:59,  4.43s/it][2025-10-10 13:27:27,880] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:27:30,445] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2810289859771729
[2025-10-10 13:27:31,672] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2266299724578857
[2025-10-10 13:27:33,392] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.7197673320770264
[2025-10-10 13:27:34,889] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.4970765113830566
[2025-10-10 13:27:34,889] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 5.537834167480469, 'eval_runtime': 5.7439, 'eval_samples_per_second': 22.11, 'eval_steps_per_second': 11.142, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.85}
 28% 156/549 [15:49<28:59,  4.43s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 29% 157/549 [15:53<52:50,  8.09s/it] 29% 158/549 [15:57<44:26,  6.82s/it] 29% 159/549 [16:01<38:37,  5.94s/it] 29% 160/549 [16:04<34:31,  5.33s/it] 29% 161/549 [16:08<31:41,  4.90s/it] 30% 162/549 [16:12<29:40,  4.60s/it][2025-10-10 13:28:03,961] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:28:06,582] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2377257347106934
[2025-10-10 13:28:07,822] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2394213676452637
[2025-10-10 13:28:09,276] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.4542319774627686
[2025-10-10 13:28:10,984] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.7078406810760498
[2025-10-10 13:28:10,984] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 5.492016315460205, 'eval_runtime': 5.8041, 'eval_samples_per_second': 21.881, 'eval_steps_per_second': 11.027, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.89}
 30% 162/549 [16:25<29:40,  4.60s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 30% 163/549 [16:29<52:53,  8.22s/it] 30% 164/549 [16:33<44:20,  6.91s/it] 30% 165/549 [16:37<38:21,  5.99s/it] 30% 166/549 [16:41<34:12,  5.36s/it] 30% 167/549 [16:44<31:21,  4.92s/it] 31% 168/549 [16:48<29:18,  4.62s/it][2025-10-10 13:28:40,029] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:28:42,579] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2564804553985596
[2025-10-10 13:28:43,831] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2514879703521729
[2025-10-10 13:28:45,090] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2588951587677002
[2025-10-10 13:28:46,762] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.6718604564666748
[2025-10-10 13:28:46,762] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 5.551150321960449, 'eval_runtime': 6.0153, 'eval_samples_per_second': 21.113, 'eval_steps_per_second': 10.639, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.92}
 31% 168/549 [17:01<29:18,  4.62s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 31% 169/549 [17:05<52:01,  8.21s/it] 31% 170/549 [17:09<43:37,  6.91s/it] 31% 171/549 [17:13<37:46,  6.00s/it] 31% 172/549 [17:17<33:43,  5.37s/it] 32% 173/549 [17:21<30:52,  4.93s/it] 32% 174/549 [17:24<28:52,  4.62s/it][2025-10-10 13:29:16,068] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:29:18,702] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3219468593597412
[2025-10-10 13:29:20,003] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.301271915435791
[2025-10-10 13:29:21,280] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2761008739471436
[2025-10-10 13:29:22,700] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.4196476936340332
[2025-10-10 13:29:22,700] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 5.445136070251465, 'eval_runtime': 6.3434, 'eval_samples_per_second': 20.021, 'eval_steps_per_second': 10.089, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.95}
 32% 174/549 [17:37<28:52,  4.62s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 32% 175/549 [17:41<51:37,  8.28s/it] 32% 176/549 [17:45<43:14,  6.96s/it] 32% 177/549 [17:49<37:23,  6.03s/it] 32% 178/549 [17:53<33:19,  5.39s/it] 33% 179/549 [17:57<30:29,  4.94s/it] 33% 180/549 [18:01<28:28,  4.63s/it][2025-10-10 13:29:52,329] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:29:54,842] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2502317428588867
[2025-10-10 13:29:56,106] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2638275623321533
[2025-10-10 13:29:57,362] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.255711317062378
[2025-10-10 13:29:58,622] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.260202169418335
[2025-10-10 13:29:58,622] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 5.406078338623047, 'eval_runtime': 6.1725, 'eval_samples_per_second': 20.575, 'eval_steps_per_second': 10.369, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 0.98}
 33% 180/549 [18:13<28:28,  4.63s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 33% 181/549 [18:17<49:55,  8.14s/it] 33% 182/549 [18:21<41:59,  6.86s/it] 33% 183/549 [18:25<36:37,  6.00s/it][2025-10-10 13:30:16,541] [INFO] [axolotl.core.trainers.base._save:671] [PID:24741] Saving model checkpoint to ./qlora-out/checkpoint-183
 34% 184/549 [18:35<43:14,  7.11s/it] 34% 185/549 [18:38<37:16,  6.14s/it] 34% 186/549 [18:42<33:03,  5.46s/it][2025-10-10 13:30:33,999] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:30:37,120] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.5404832363128662
[2025-10-10 13:30:38,958] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.8376410007476807
[2025-10-10 13:30:40,250] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2913427352905273
[2025-10-10 13:30:41,534] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2837259769439697
[2025-10-10 13:30:41,534] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.84it/s][A
 75% 3/4 [00:02<00:00,  1.32it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 5.442931175231934, 'eval_runtime': 5.8182, 'eval_samples_per_second': 21.828, 'eval_steps_per_second': 11.0, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.02}
 34% 186/549 [18:56<33:03,  5.46s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 34% 187/549 [19:00<54:17,  9.00s/it] 34% 188/549 [19:03<44:58,  7.47s/it] 34% 189/549 [19:07<38:27,  6.41s/it] 35% 190/549 [19:11<33:56,  5.67s/it] 35% 191/549 [19:15<30:43,  5.15s/it] 35% 192/549 [19:19<28:25,  4.78s/it][2025-10-10 13:31:10,871] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:31:13,498] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.309769630432129
[2025-10-10 13:31:15,350] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.8515050411224365
[2025-10-10 13:31:16,722] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3719418048858643
[2025-10-10 13:31:18,013] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2908899784088135
[2025-10-10 13:31:18,013] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 5.410170555114746, 'eval_runtime': 5.8179, 'eval_samples_per_second': 21.829, 'eval_steps_per_second': 11.001, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.05}
 35% 192/549 [19:32<28:25,  4.78s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 35% 193/549 [19:36<49:47,  8.39s/it] 35% 194/549 [19:40<41:34,  7.03s/it] 36% 195/549 [19:44<35:50,  6.07s/it] 36% 196/549 [19:48<31:51,  5.42s/it] 36% 197/549 [19:51<29:03,  4.95s/it] 36% 198/549 [19:55<27:04,  4.63s/it][2025-10-10 13:31:47,022] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:31:49,605] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2902817726135254
[2025-10-10 13:31:51,079] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.4735219478607178
[2025-10-10 13:31:52,847] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.7679553031921387
[2025-10-10 13:31:54,139] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2912085056304932
[2025-10-10 13:31:54,139] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 5.381246566772461, 'eval_runtime': 5.8109, 'eval_samples_per_second': 21.856, 'eval_steps_per_second': 11.014, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.08}
 36% 198/549 [20:08<27:04,  4.63s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 36% 199/549 [20:12<48:18,  8.28s/it] 36% 200/549 [20:16<40:27,  6.96s/it]                                     {'loss': 5.9664, 'grad_norm': 0.3477332293987274, 'learning_rate': 0.001787100490808991, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'tokens_per_second_per_gpu': 21.15, 'epoch': 1.09}
 36% 200/549 [20:16<40:27,  6.96s/it] 37% 201/549 [20:20<34:59,  6.03s/it] 37% 202/549 [20:24<31:12,  5.40s/it] 37% 203/549 [20:28<28:34,  4.95s/it] 37% 204/549 [20:32<26:41,  4.64s/it][2025-10-10 13:32:23,315] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:32:25,913] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2995872497558594
[2025-10-10 13:32:27,238] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3243978023529053
[2025-10-10 13:32:29,078] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.840087652206421
[2025-10-10 13:32:30,448] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.369553804397583
[2025-10-10 13:32:30,448] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 5.376354217529297, 'eval_runtime': 5.788, 'eval_samples_per_second': 21.942, 'eval_steps_per_second': 11.057, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.11}
 37% 204/549 [20:45<26:41,  4.64s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 37% 205/549 [20:48<47:32,  8.29s/it] 38% 206/549 [20:52<39:47,  6.96s/it] 38% 207/549 [20:56<34:23,  6.03s/it] 38% 208/549 [21:00<30:40,  5.40s/it] 38% 209/549 [21:04<28:02,  4.95s/it] 38% 210/549 [21:08<26:09,  4.63s/it][2025-10-10 13:32:59,550] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:33:02,198] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3173751831054688
[2025-10-10 13:33:03,523] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3249952793121338
[2025-10-10 13:33:05,165] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.641808271408081
[2025-10-10 13:33:06,800] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.6340107917785645
[2025-10-10 13:33:06,800] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 5.320211410522461, 'eval_runtime': 5.8867, 'eval_samples_per_second': 21.574, 'eval_steps_per_second': 10.872, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.15}
 38% 210/549 [21:21<26:09,  4.63s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 38% 211/549 [21:25<47:01,  8.35s/it] 39% 212/549 [21:29<39:19,  7.00s/it] 39% 213/549 [21:33<33:58,  6.07s/it] 39% 214/549 [21:37<30:13,  5.41s/it] 39% 215/549 [21:40<27:37,  4.96s/it] 39% 216/549 [21:44<25:46,  4.65s/it][2025-10-10 13:33:36,019] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:33:38,696] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3145294189453125
[2025-10-10 13:33:40,014] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3176522254943848
[2025-10-10 13:33:41,479] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.464585781097412
[2025-10-10 13:33:43,345] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.865448236465454
[2025-10-10 13:33:43,345] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 5.3533477783203125, 'eval_runtime': 5.8744, 'eval_samples_per_second': 21.619, 'eval_steps_per_second': 10.895, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.18}
 39% 216/549 [21:58<25:46,  4.65s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 40% 217/549 [22:01<46:22,  8.38s/it] 40% 218/549 [22:05<38:46,  7.03s/it] 40% 219/549 [22:09<33:27,  6.08s/it] 40% 220/549 [22:13<29:44,  5.43s/it] 40% 221/549 [22:17<27:09,  4.97s/it] 40% 222/549 [22:21<25:19,  4.65s/it][2025-10-10 13:34:12,552] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:34:15,141] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2889900207519531
[2025-10-10 13:34:16,438] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2969541549682617
[2025-10-10 13:34:17,729] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2903382778167725
[2025-10-10 13:34:19,533] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.8035340309143066
[2025-10-10 13:34:19,533] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 5.282848358154297, 'eval_runtime': 6.023, 'eval_samples_per_second': 21.086, 'eval_steps_per_second': 10.626, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.21}
 40% 222/549 [22:34<25:19,  4.65s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 41% 223/549 [22:38<45:08,  8.31s/it] 41% 224/549 [22:42<37:45,  6.97s/it] 41% 225/549 [22:45<32:36,  6.04s/it] 41% 226/549 [22:49<29:01,  5.39s/it] 41% 227/549 [22:53<26:31,  4.94s/it] 42% 228/549 [22:57<24:46,  4.63s/it][2025-10-10 13:34:48,798] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:34:51,398] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.286377191543579
[2025-10-10 13:34:52,681] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.282608985900879
[2025-10-10 13:34:53,962] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2809438705444336
[2025-10-10 13:34:55,426] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.4635910987854004
[2025-10-10 13:34:55,427] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 5.275903701782227, 'eval_runtime': 6.2819, 'eval_samples_per_second': 20.217, 'eval_steps_per_second': 10.188, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.25}
 42% 228/549 [23:10<24:46,  4.63s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 42% 229/549 [23:14<44:07,  8.27s/it] 42% 230/549 [23:18<36:56,  6.95s/it] 42% 231/549 [23:22<31:57,  6.03s/it] 42% 232/549 [23:26<28:28,  5.39s/it] 42% 233/549 [23:30<26:07,  4.96s/it] 43% 234/549 [23:33<24:24,  4.65s/it][2025-10-10 13:35:25,089] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:35:27,705] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2982923984527588
[2025-10-10 13:35:28,989] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2843728065490723
[2025-10-10 13:35:30,318] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3283510208129883
[2025-10-10 13:35:31,608] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2893931865692139
[2025-10-10 13:35:31,608] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 5.25028133392334, 'eval_runtime': 6.4496, 'eval_samples_per_second': 19.691, 'eval_steps_per_second': 9.923, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.28}
 43% 234/549 [23:46<24:24,  4.65s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 43% 235/549 [23:50<43:27,  8.30s/it] 43% 236/549 [23:54<36:23,  6.98s/it] 43% 237/549 [23:58<31:26,  6.05s/it] 43% 238/549 [24:02<27:57,  5.40s/it] 44% 239/549 [24:06<25:35,  4.95s/it] 44% 240/549 [24:10<23:53,  4.64s/it][2025-10-10 13:36:01,376] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:36:03,949] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2798094749450684
[2025-10-10 13:36:05,235] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2851686477661133
[2025-10-10 13:36:06,521] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2858819961547852
[2025-10-10 13:36:07,798] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2766339778900146
[2025-10-10 13:36:07,798] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 5.200336456298828, 'eval_runtime': 6.3809, 'eval_samples_per_second': 19.903, 'eval_steps_per_second': 10.03, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.31}
 44% 240/549 [24:23<23:53,  4.64s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 44% 241/549 [24:26<41:23,  8.06s/it] 44% 242/549 [24:30<34:49,  6.81s/it] 44% 243/549 [24:34<30:13,  5.93s/it] 44% 244/549 [24:37<27:00,  5.31s/it] 45% 245/549 [24:41<24:46,  4.89s/it] 45% 246/549 [24:45<23:12,  4.59s/it][2025-10-10 13:36:36,871] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:36:39,586] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.389134168624878
[2025-10-10 13:36:40,884] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2981505393981934
[2025-10-10 13:36:42,175] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2911362648010254
[2025-10-10 13:36:43,457] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2809994220733643
[2025-10-10 13:36:43,457] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.32it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 5.178483009338379, 'eval_runtime': 5.8445, 'eval_samples_per_second': 21.73, 'eval_steps_per_second': 10.95, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.34}
 45% 246/549 [24:58<23:12,  4.59s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 45% 247/549 [25:01<40:46,  8.10s/it] 45% 248/549 [25:05<34:15,  6.83s/it] 45% 249/549 [25:09<29:42,  5.94s/it] 46% 250/549 [25:13<26:32,  5.33s/it] 46% 251/549 [25:17<24:19,  4.90s/it] 46% 252/549 [25:21<22:45,  4.60s/it][2025-10-10 13:37:12,570] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:37:15,140] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2792997360229492
[2025-10-10 13:37:16,426] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2853057384490967
[2025-10-10 13:37:17,683] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2565968036651611
[2025-10-10 13:37:18,950] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2670745849609375
[2025-10-10 13:37:18,950] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.10it/s][A                                     
                                 [A{'eval_loss': 5.2125749588012695, 'eval_runtime': 5.8149, 'eval_samples_per_second': 21.84, 'eval_steps_per_second': 11.006, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.38}
 46% 252/549 [25:33<22:45,  4.60s/it]
100% 4/4 [00:03<00:00,  1.10it/s][A
                                 [A 46% 253/549 [25:37<39:38,  8.03s/it] 46% 254/549 [25:41<33:19,  6.78s/it] 46% 255/549 [25:45<28:58,  5.91s/it] 47% 256/549 [25:49<25:54,  5.31s/it] 47% 257/549 [25:52<23:46,  4.88s/it] 47% 258/549 [25:56<22:17,  4.59s/it][2025-10-10 13:37:48,074] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:37:51,049] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2944858074188232
[2025-10-10 13:37:52,344] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2948596477508545
[2025-10-10 13:37:53,649] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3044750690460205
[2025-10-10 13:37:54,926] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2766392230987549
[2025-10-10 13:37:54,926] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.11it/s][A                                     
                                 [A{'eval_loss': 5.1886444091796875, 'eval_runtime': 5.8293, 'eval_samples_per_second': 21.787, 'eval_steps_per_second': 10.979, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.41}
 47% 258/549 [26:09<22:17,  4.59s/it]
100% 4/4 [00:03<00:00,  1.11it/s][A
                                 [A 47% 259/549 [26:13<39:32,  8.18s/it] 47% 260/549 [26:17<33:09,  6.88s/it] 48% 261/549 [26:21<28:46,  5.99s/it] 48% 262/549 [26:25<25:40,  5.37s/it] 48% 263/549 [26:29<23:29,  4.93s/it] 48% 264/549 [26:32<21:59,  4.63s/it][2025-10-10 13:38:24,141] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:38:27,322] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2961266040802002
[2025-10-10 13:38:28,634] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.311255693435669
[2025-10-10 13:38:29,937] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3029818534851074
[2025-10-10 13:38:31,246] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3082985877990723
[2025-10-10 13:38:31,246] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.11it/s][A                                     
                                 [A{'eval_loss': 5.133144378662109, 'eval_runtime': 5.8367, 'eval_samples_per_second': 21.759, 'eval_steps_per_second': 10.965, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.44}
 48% 264/549 [26:45<21:59,  4.63s/it]
100% 4/4 [00:03<00:00,  1.11it/s][A
                                 [A 48% 265/549 [26:49<39:11,  8.28s/it] 48% 266/549 [26:53<32:49,  6.96s/it] 49% 267/549 [26:57<28:22,  6.04s/it] 49% 268/549 [27:00<24:24,  5.21s/it] 49% 269/549 [27:04<22:28,  4.82s/it] 49% 270/549 [27:08<21:06,  4.54s/it][2025-10-10 13:38:59,782] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:39:03,012] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.7229037284851074
[2025-10-10 13:39:04,323] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3110601902008057
[2025-10-10 13:39:05,606] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2823243141174316
[2025-10-10 13:39:06,901] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2952723503112793
[2025-10-10 13:39:06,902] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 5.137051582336426, 'eval_runtime': 5.7827, 'eval_samples_per_second': 21.962, 'eval_steps_per_second': 11.068, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.48}
 49% 270/549 [27:21<21:06,  4.54s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 49% 271/549 [27:25<38:02,  8.21s/it] 50% 272/549 [27:29<31:53,  6.91s/it] 50% 273/549 [27:33<27:36,  6.00s/it] 50% 274/549 [27:37<24:36,  5.37s/it] 50% 275/549 [27:40<22:31,  4.93s/it] 50% 276/549 [27:44<21:02,  4.62s/it][2025-10-10 13:39:36,016] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:39:39,209] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.8441014289855957
[2025-10-10 13:39:40,652] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.4425406455993652
[2025-10-10 13:39:41,958] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.305793285369873
[2025-10-10 13:39:43,277] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3181705474853516
[2025-10-10 13:39:43,277] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 5.066106796264648, 'eval_runtime': 5.8019, 'eval_samples_per_second': 21.89, 'eval_steps_per_second': 11.031, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.51}
 50% 276/549 [27:57<21:02,  4.62s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 50% 277/549 [28:01<37:40,  8.31s/it] 51% 278/549 [28:05<31:29,  6.97s/it] 51% 279/549 [28:09<27:11,  6.04s/it] 51% 280/549 [28:13<24:11,  5.39s/it] 51% 281/549 [28:16<21:14,  4.76s/it] 51% 282/549 [28:20<20:01,  4.50s/it][2025-10-10 13:40:11,711] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:40:14,334] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3228764533996582
[2025-10-10 13:40:16,191] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.856102705001831
[2025-10-10 13:40:17,517] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3263425827026367
[2025-10-10 13:40:18,801] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2833828926086426
[2025-10-10 13:40:18,801] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 5.1207499504089355, 'eval_runtime': 5.7763, 'eval_samples_per_second': 21.987, 'eval_steps_per_second': 11.08, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.54}
 51% 282/549 [28:33<20:01,  4.50s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 52% 283/549 [28:37<36:15,  8.18s/it] 52% 284/549 [28:41<30:24,  6.88s/it] 52% 285/549 [28:45<26:18,  5.98s/it] 52% 286/549 [28:48<22:40,  5.17s/it] 52% 287/549 [28:52<20:56,  4.80s/it] 52% 288/549 [28:56<19:41,  4.53s/it][2025-10-10 13:40:47,311] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:40:49,884] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2716014385223389
[2025-10-10 13:40:51,187] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.302952527999878
[2025-10-10 13:40:52,997] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.8093366622924805
[2025-10-10 13:40:54,396] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.398949146270752
[2025-10-10 13:40:54,396] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 5.056352615356445, 'eval_runtime': 5.778, 'eval_samples_per_second': 21.98, 'eval_steps_per_second': 11.077, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.57}
 52% 288/549 [29:09<19:41,  4.53s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 53% 289/549 [29:12<35:29,  8.19s/it] 53% 290/549 [29:16<29:45,  6.89s/it] 53% 291/549 [29:20<25:44,  5.99s/it] 53% 292/549 [29:24<22:59,  5.37s/it] 53% 293/549 [29:28<21:02,  4.93s/it] 54% 294/549 [29:32<19:37,  4.62s/it][2025-10-10 13:41:23,512] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:41:26,092] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2780861854553223
[2025-10-10 13:41:27,347] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2545959949493408
[2025-10-10 13:41:28,868] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.520756721496582
[2025-10-10 13:41:30,544] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.6756396293640137
[2025-10-10 13:41:30,544] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 5.123226165771484, 'eval_runtime': 5.8064, 'eval_samples_per_second': 21.873, 'eval_steps_per_second': 11.022, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.61}
 54% 294/549 [29:45<19:37,  4.62s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 54% 295/549 [29:49<34:56,  8.25s/it] 54% 296/549 [29:52<29:14,  6.94s/it] 54% 297/549 [29:56<25:16,  6.02s/it] 54% 298/549 [30:00<22:30,  5.38s/it] 54% 299/549 [30:04<20:34,  4.94s/it] 55% 300/549 [30:08<19:11,  4.62s/it]                                     {'loss': 5.3381, 'grad_norm': 1.279155969619751, 'learning_rate': 0.0012049450205472586, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'tokens_per_second_per_gpu': 62.82, 'epoch': 1.64}
 55% 300/549 [30:08<19:11,  4.62s/it][2025-10-10 13:41:59,671] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:42:02,245] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2725634574890137
[2025-10-10 13:42:03,508] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2624311447143555
[2025-10-10 13:42:04,784] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2758598327636719
[2025-10-10 13:42:06,602] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.8172187805175781
[2025-10-10 13:42:06,602] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 5.03715705871582, 'eval_runtime': 5.8695, 'eval_samples_per_second': 21.637, 'eval_steps_per_second': 10.904, 'memory/max_active (GiB)': 2.67, 'memory/max_allocated (GiB)': 2.67, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.64}
 55% 300/549 [30:21<19:11,  4.62s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 55% 301/549 [30:25<34:02,  8.23s/it] 55% 302/549 [30:29<28:29,  6.92s/it] 55% 303/549 [30:32<24:36,  6.00s/it] 55% 304/549 [30:36<21:55,  5.37s/it] 56% 305/549 [30:40<20:01,  4.93s/it] 56% 306/549 [30:44<18:41,  4.62s/it][2025-10-10 13:42:35,718] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:42:38,317] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2914865016937256
[2025-10-10 13:42:39,593] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2758712768554688
[2025-10-10 13:42:40,864] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2706935405731201
[2025-10-10 13:42:42,433] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.5694029331207275
[2025-10-10 13:42:42,434] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 5.112299919128418, 'eval_runtime': 6.2942, 'eval_samples_per_second': 20.177, 'eval_steps_per_second': 10.168, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.67}
 56% 306/549 [30:57<18:41,  4.62s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 56% 307/549 [31:01<33:26,  8.29s/it] 56% 308/549 [31:04<27:15,  6.79s/it] 56% 309/549 [31:08<23:38,  5.91s/it] 56% 310/549 [31:12<21:07,  5.30s/it] 57% 311/549 [31:16<19:24,  4.89s/it] 57% 312/549 [31:20<18:10,  4.60s/it][2025-10-10 13:43:11,471] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:43:14,124] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3028910160064697
[2025-10-10 13:43:15,424] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2999858856201172
[2025-10-10 13:43:16,730] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3056349754333496
[2025-10-10 13:43:18,045] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.314713954925537
[2025-10-10 13:43:18,045] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.84it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 5.025712013244629, 'eval_runtime': 6.3277, 'eval_samples_per_second': 20.071, 'eval_steps_per_second': 10.114, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.7}
 57% 312/549 [31:33<18:10,  4.60s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 57% 313/549 [31:37<32:26,  8.25s/it] 57% 314/549 [31:40<27:11,  6.94s/it] 57% 315/549 [31:44<23:29,  6.02s/it] 58% 316/549 [31:48<20:54,  5.38s/it] 58% 317/549 [31:52<19:06,  4.94s/it] 58% 318/549 [31:56<17:49,  4.63s/it][2025-10-10 13:43:47,716] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:43:50,271] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2804148197174072
[2025-10-10 13:43:51,584] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.312223196029663
[2025-10-10 13:43:52,861] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2768139839172363
[2025-10-10 13:43:54,115] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.253570556640625
[2025-10-10 13:43:54,115] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.83it/s][A
 75% 3/4 [00:02<00:00,  1.32it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 5.036094665527344, 'eval_runtime': 5.9136, 'eval_samples_per_second': 21.476, 'eval_steps_per_second': 10.822, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.74}
 58% 318/549 [32:08<17:49,  4.63s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 58% 319/549 [32:12<31:00,  8.09s/it] 58% 320/549 [32:16<26:03,  6.83s/it] 58% 321/549 [32:20<22:34,  5.94s/it] 59% 322/549 [32:24<20:09,  5.33s/it] 59% 323/549 [32:28<18:27,  4.90s/it] 59% 324/549 [32:32<17:14,  4.60s/it][2025-10-10 13:44:23,324] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:44:25,881] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2637131214141846
[2025-10-10 13:44:27,160] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2784416675567627
[2025-10-10 13:44:28,425] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2644331455230713
[2025-10-10 13:44:29,691] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2658593654632568
[2025-10-10 13:44:29,691] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.84it/s][A
 75% 3/4 [00:02<00:00,  1.32it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 4.993957996368408, 'eval_runtime': 5.7744, 'eval_samples_per_second': 21.994, 'eval_steps_per_second': 11.083, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.77}
 59% 324/549 [32:44<17:14,  4.60s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 59% 325/549 [32:48<29:56,  8.02s/it] 59% 326/549 [32:52<25:09,  6.77s/it] 60% 327/549 [32:55<21:50,  5.90s/it] 60% 328/549 [32:59<19:30,  5.30s/it] 60% 329/549 [33:03<17:53,  4.88s/it] 60% 330/549 [33:07<16:44,  4.59s/it][2025-10-10 13:44:58,744] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:45:01,309] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2841627597808838
[2025-10-10 13:45:02,598] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2886757850646973
[2025-10-10 13:45:03,860] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2614099979400635
[2025-10-10 13:45:05,121] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2612431049346924
[2025-10-10 13:45:05,121] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.10it/s][A                                     
                                 [A{'eval_loss': 4.980707168579102, 'eval_runtime': 5.825, 'eval_samples_per_second': 21.802, 'eval_steps_per_second': 10.987, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.8}
 60% 330/549 [33:19<16:44,  4.59s/it]
100% 4/4 [00:03<00:00,  1.10it/s][A
                                 [A 60% 331/549 [33:23<29:09,  8.03s/it] 60% 332/549 [33:27<24:29,  6.77s/it] 61% 333/549 [33:31<21:16,  5.91s/it] 61% 334/549 [33:35<19:00,  5.30s/it] 61% 335/549 [33:39<17:24,  4.88s/it] 61% 336/549 [33:43<16:17,  4.59s/it][2025-10-10 13:45:34,234] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:45:37,199] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.272235631942749
[2025-10-10 13:45:38,489] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2894883155822754
[2025-10-10 13:45:39,750] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2607002258300781
[2025-10-10 13:45:41,007] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.256944179534912
[2025-10-10 13:45:41,008] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.11it/s][A                                     
                                 [A{'eval_loss': 4.9550395011901855, 'eval_runtime': 5.8454, 'eval_samples_per_second': 21.727, 'eval_steps_per_second': 10.949, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.84}
 61% 336/549 [33:55<16:17,  4.59s/it]
100% 4/4 [00:03<00:00,  1.11it/s][A
                                 [A 61% 337/549 [33:59<28:48,  8.16s/it] 62% 338/549 [34:02<23:30,  6.69s/it] 62% 339/549 [34:06<20:27,  5.85s/it] 62% 340/549 [34:09<17:40,  5.08s/it] 62% 341/549 [34:13<16:22,  4.72s/it] 62% 342/549 [34:17<15:27,  4.48s/it][2025-10-10 13:46:08,957] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:46:12,140] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.782719373703003
[2025-10-10 13:46:13,456] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3164212703704834
[2025-10-10 13:46:14,727] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2703287601470947
[2025-10-10 13:46:15,997] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2702345848083496
[2025-10-10 13:46:15,998] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 4.987721920013428, 'eval_runtime': 5.7754, 'eval_samples_per_second': 21.99, 'eval_steps_per_second': 11.082, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.87}
 62% 342/549 [34:30<15:27,  4.48s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 62% 343/549 [34:34<27:56,  8.14s/it] 63% 344/549 [34:38<23:25,  6.86s/it] 63% 345/549 [34:42<20:16,  5.96s/it] 63% 346/549 [34:46<18:04,  5.34s/it] 63% 347/549 [34:50<16:31,  4.91s/it] 63% 348/549 [34:53<15:24,  4.60s/it][2025-10-10 13:46:45,044] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:46:48,119] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.7470786571502686
[2025-10-10 13:46:49,670] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.5505836009979248
[2025-10-10 13:46:50,995] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3248536586761475
[2025-10-10 13:46:52,300] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3044347763061523
[2025-10-10 13:46:52,300] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.86it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 4.946439743041992, 'eval_runtime': 5.7767, 'eval_samples_per_second': 21.985, 'eval_steps_per_second': 11.079, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.9}
 63% 348/549 [35:06<15:24,  4.60s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 64% 349/549 [35:10<27:36,  8.28s/it] 64% 350/549 [35:14<23:02,  6.95s/it] 64% 351/549 [35:18<19:51,  6.02s/it] 64% 352/549 [35:22<17:39,  5.38s/it] 64% 353/549 [35:26<16:06,  4.93s/it] 64% 354/549 [35:30<15:01,  4.62s/it][2025-10-10 13:47:21,279] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:47:23,907] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3633801937103271
[2025-10-10 13:47:25,699] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.7914912700653076
[2025-10-10 13:47:26,959] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.260002613067627
[2025-10-10 13:47:28,226] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2668824195861816
[2025-10-10 13:47:28,226] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.84it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 4.9122467041015625, 'eval_runtime': 5.7915, 'eval_samples_per_second': 21.929, 'eval_steps_per_second': 11.051, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.93}
 64% 354/549 [35:42<15:01,  4.62s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 65% 355/549 [35:46<26:34,  8.22s/it] 65% 356/549 [35:50<22:14,  6.91s/it] 65% 357/549 [35:54<19:12,  6.00s/it] 65% 358/549 [35:58<17:07,  5.38s/it] 65% 359/549 [36:02<15:39,  4.94s/it] 66% 360/549 [36:06<14:35,  4.63s/it][2025-10-10 13:47:57,394] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:47:59,913] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2456605434417725
[2025-10-10 13:48:01,644] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.7309658527374268
[2025-10-10 13:48:03,128] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.48356032371521
[2025-10-10 13:48:04,398] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.270266056060791
[2025-10-10 13:48:04,398] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 4.941189289093018, 'eval_runtime': 5.7826, 'eval_samples_per_second': 21.962, 'eval_steps_per_second': 11.068, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 1.97}
 66% 360/549 [36:19<14:35,  4.63s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 66% 361/549 [36:22<25:49,  8.24s/it] 66% 362/549 [36:26<21:35,  6.93s/it] 66% 363/549 [36:30<18:36,  6.00s/it] 66% 364/549 [36:34<16:33,  5.37s/it] 66% 365/549 [36:38<15:06,  4.93s/it] 67% 366/549 [36:42<14:09,  4.64s/it][2025-10-10 13:48:33,538] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:48:36,115] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2747948169708252
[2025-10-10 13:48:37,542] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.4260179996490479
[2025-10-10 13:48:39,314] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.7726826667785645
[2025-10-10 13:48:40,603] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2883005142211914
[2025-10-10 13:48:40,603] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.32it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 4.909399509429932, 'eval_runtime': 5.7674, 'eval_samples_per_second': 22.021, 'eval_steps_per_second': 11.097, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.0}
 67% 366/549 [36:55<14:09,  4.64s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A[2025-10-10 13:48:46,376] [INFO] [axolotl.core.trainers.base._save:671] [PID:24741] Saving model checkpoint to ./qlora-out/checkpoint-366
 67% 367/549 [37:06<31:54, 10.52s/it] 67% 368/549 [37:10<25:42,  8.52s/it] 67% 369/549 [37:13<20:50,  6.95s/it] 67% 370/549 [37:17<17:59,  6.03s/it] 68% 371/549 [37:21<16:00,  5.39s/it] 68% 372/549 [37:25<14:37,  4.96s/it][2025-10-10 13:49:16,639] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:49:19,397] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2493188381195068
[2025-10-10 13:49:20,670] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2731273174285889
[2025-10-10 13:49:21,916] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2452585697174072
[2025-10-10 13:49:23,190] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2739589214324951
[2025-10-10 13:49:23,190] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.83it/s][A
 75% 3/4 [00:02<00:00,  1.31it/s][A
100% 4/4 [00:03<00:00,  1.10it/s][A                                     
                                 [A{'eval_loss': 4.937836647033691, 'eval_runtime': 5.8269, 'eval_samples_per_second': 21.795, 'eval_steps_per_second': 10.984, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.03}
 68% 372/549 [37:37<14:37,  4.96s/it]
100% 4/4 [00:03<00:00,  1.10it/s][A
                                 [A 68% 373/549 [37:41<24:29,  8.35s/it] 68% 374/549 [37:45<20:27,  7.01s/it] 68% 375/549 [37:49<17:38,  6.08s/it] 68% 376/549 [37:53<15:38,  5.43s/it] 69% 377/549 [37:57<14:13,  4.96s/it] 69% 378/549 [38:01<13:13,  4.64s/it][2025-10-10 13:49:52,383] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:49:55,178] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2608587741851807
[2025-10-10 13:49:56,450] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2715425491333008
[2025-10-10 13:49:57,715] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2646293640136719
[2025-10-10 13:49:59,010] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2951922416687012
[2025-10-10 13:49:59,010] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.11it/s][A                                     
                                 [A{'eval_loss': 4.906888484954834, 'eval_runtime': 5.8215, 'eval_samples_per_second': 21.816, 'eval_steps_per_second': 10.994, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.07}
 69% 378/549 [38:13<13:13,  4.64s/it]
100% 4/4 [00:03<00:00,  1.11it/s][A
                                 [A 69% 379/549 [38:17<23:03,  8.14s/it] 69% 380/549 [38:21<19:17,  6.85s/it] 69% 381/549 [38:25<16:40,  5.95s/it] 70% 382/549 [38:29<14:50,  5.33s/it] 70% 383/549 [38:32<13:32,  4.90s/it] 70% 384/549 [38:36<12:39,  4.60s/it][2025-10-10 13:50:28,063] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:50:31,217] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2841885089874268
[2025-10-10 13:50:32,493] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2753474712371826
[2025-10-10 13:50:33,766] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2733006477355957
[2025-10-10 13:50:35,051] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2847979068756104
[2025-10-10 13:50:35,051] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.84it/s][A
 75% 3/4 [00:02<00:00,  1.32it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 4.906486988067627, 'eval_runtime': 5.7879, 'eval_samples_per_second': 21.942, 'eval_steps_per_second': 11.057, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.1}
 70% 384/549 [38:49<12:39,  4.60s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 70% 385/549 [38:53<22:27,  8.22s/it] 70% 386/549 [38:57<18:46,  6.91s/it] 70% 387/549 [39:01<16:14,  6.01s/it] 71% 388/549 [39:05<14:26,  5.38s/it] 71% 389/549 [39:09<13:10,  4.94s/it] 71% 390/549 [39:13<12:16,  4.63s/it][2025-10-10 13:51:04,231] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:51:07,499] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.5531108379364014
[2025-10-10 13:51:08,778] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2783293724060059
[2025-10-10 13:51:10,059] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2812681198120117
[2025-10-10 13:51:11,349] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.28898286819458
[2025-10-10 13:51:11,349] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.84it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 4.916737079620361, 'eval_runtime': 5.7701, 'eval_samples_per_second': 22.01, 'eval_steps_per_second': 11.092, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.13}
 71% 390/549 [39:25<12:16,  4.63s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 71% 391/549 [39:29<21:45,  8.27s/it] 71% 392/549 [39:33<18:09,  6.94s/it] 72% 393/549 [39:37<15:38,  6.01s/it] 72% 394/549 [39:41<13:52,  5.37s/it] 72% 395/549 [39:45<12:37,  4.92s/it] 72% 396/549 [39:49<11:45,  4.61s/it][2025-10-10 13:51:40,295] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:51:43,495] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.8327646255493164
[2025-10-10 13:51:44,795] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2993097305297852
[2025-10-10 13:51:46,167] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.372373342514038
[2025-10-10 13:51:47,442] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2747223377227783
[2025-10-10 13:51:47,442] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 4.890364170074463, 'eval_runtime': 5.7714, 'eval_samples_per_second': 22.005, 'eval_steps_per_second': 11.089, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.16}
 72% 396/549 [40:02<11:45,  4.61s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 72% 397/549 [40:05<20:55,  8.26s/it] 72% 398/549 [40:09<17:27,  6.94s/it] 73% 399/549 [40:13<15:02,  6.02s/it] 73% 400/549 [40:17<13:22,  5.38s/it]                                     {'loss': 4.9866, 'grad_norm': 0.22382044792175293, 'learning_rate': 0.0005264834558836156, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'tokens_per_second_per_gpu': 41.85, 'epoch': 2.19}
 73% 400/549 [40:17<13:22,  5.38s/it] 73% 401/549 [40:21<12:11,  4.94s/it] 73% 402/549 [40:25<11:21,  4.63s/it][2025-10-10 13:52:16,526] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:52:19,410] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.5968072414398193
[2025-10-10 13:52:21,081] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.670398473739624
[2025-10-10 13:52:22,353] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.272465705871582
[2025-10-10 13:52:23,640] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.285794973373413
[2025-10-10 13:52:23,640] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 4.866282939910889, 'eval_runtime': 5.7799, 'eval_samples_per_second': 21.973, 'eval_steps_per_second': 11.073, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.2}
 73% 402/549 [40:38<11:21,  4.63s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 73% 403/549 [40:42<20:07,  8.27s/it] 74% 404/549 [40:45<16:47,  6.95s/it] 74% 405/549 [40:49<14:28,  6.03s/it] 74% 406/549 [40:53<12:52,  5.40s/it] 74% 407/549 [40:57<11:43,  4.95s/it] 74% 408/549 [41:01<10:53,  4.63s/it][2025-10-10 13:52:52,766] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:52:55,394] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.33892822265625
[2025-10-10 13:52:57,235] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.840940237045288
[2025-10-10 13:52:58,502] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2660002708435059
[2025-10-10 13:52:59,779] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2767868041992188
[2025-10-10 13:52:59,779] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 4.8776326179504395, 'eval_runtime': 5.754, 'eval_samples_per_second': 22.071, 'eval_steps_per_second': 11.123, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.23}
 74% 408/549 [41:14<10:53,  4.63s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 74% 409/549 [41:18<19:13,  8.24s/it] 75% 410/549 [41:22<16:02,  6.93s/it] 75% 411/549 [41:25<13:48,  6.01s/it] 75% 412/549 [41:29<12:16,  5.38s/it] 75% 413/549 [41:33<11:11,  4.94s/it] 75% 414/549 [41:37<10:24,  4.63s/it][2025-10-10 13:53:28,861] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:53:31,446] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2704637050628662
[2025-10-10 13:53:32,999] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.552428960800171
[2025-10-10 13:53:34,672] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.6721296310424805
[2025-10-10 13:53:35,947] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.274794101715088
[2025-10-10 13:53:35,947] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 4.875138282775879, 'eval_runtime': 5.7864, 'eval_samples_per_second': 21.948, 'eval_steps_per_second': 11.06, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.26}
 75% 414/549 [41:50<10:24,  4.63s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 76% 415/549 [41:54<18:28,  8.27s/it] 76% 416/549 [41:58<15:24,  6.95s/it] 76% 417/549 [42:02<13:15,  6.03s/it] 76% 418/549 [42:06<11:46,  5.39s/it] 76% 419/549 [42:10<10:43,  4.95s/it] 77% 420/549 [42:13<09:58,  4.64s/it][2025-10-10 13:54:05,114] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:54:07,798] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3199918270111084
[2025-10-10 13:54:09,209] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.4115185737609863
[2025-10-10 13:54:11,055] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.8450191020965576
[2025-10-10 13:54:12,332] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2769029140472412
[2025-10-10 13:54:12,332] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 4.852082252502441, 'eval_runtime': 5.8261, 'eval_samples_per_second': 21.799, 'eval_steps_per_second': 10.985, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.3}
 77% 420/549 [42:27<09:58,  4.64s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 77% 421/549 [42:30<17:45,  8.32s/it] 77% 422/549 [42:34<14:46,  6.98s/it] 77% 423/549 [42:38<12:41,  6.04s/it] 77% 424/549 [42:42<11:15,  5.40s/it] 77% 425/549 [42:46<10:13,  4.95s/it] 78% 426/549 [42:50<09:30,  4.63s/it][2025-10-10 13:54:41,441] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:54:44,036] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2831828594207764
[2025-10-10 13:54:45,310] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2733635902404785
[2025-10-10 13:54:47,008] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.697882890701294
[2025-10-10 13:54:48,636] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.6273066997528076
[2025-10-10 13:54:48,636] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.84it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 4.834587574005127, 'eval_runtime': 5.7994, 'eval_samples_per_second': 21.899, 'eval_steps_per_second': 11.036, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.33}
 78% 426/549 [43:03<09:30,  4.63s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 78% 427/549 [43:07<16:53,  8.31s/it] 78% 428/549 [43:11<14:04,  6.98s/it] 78% 429/549 [43:14<12:05,  6.05s/it] 78% 430/549 [43:18<10:43,  5.40s/it] 79% 431/549 [43:22<09:44,  4.96s/it] 79% 432/549 [43:26<09:02,  4.64s/it][2025-10-10 13:55:17,787] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:55:20,430] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3111767768859863
[2025-10-10 13:55:21,756] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3263280391693115
[2025-10-10 13:55:23,205] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.4486384391784668
[2025-10-10 13:55:25,025] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.8193638324737549
[2025-10-10 13:55:25,025] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.84it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 4.831284046173096, 'eval_runtime': 5.8293, 'eval_samples_per_second': 21.787, 'eval_steps_per_second': 10.979, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.36}
 79% 432/549 [43:39<09:02,  4.64s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 79% 433/549 [43:43<16:05,  8.32s/it] 79% 434/549 [43:47<13:22,  6.98s/it] 79% 435/549 [43:51<11:28,  6.04s/it] 79% 436/549 [43:55<10:09,  5.39s/it] 80% 437/549 [43:59<09:13,  4.94s/it] 80% 438/549 [44:02<08:33,  4.63s/it][2025-10-10 13:55:54,066] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:55:56,645] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.282938003540039
[2025-10-10 13:55:57,916] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2711057662963867
[2025-10-10 13:55:59,205] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2884700298309326
[2025-10-10 13:56:00,852] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.6469426155090332
[2025-10-10 13:56:00,852] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 4.831367492675781, 'eval_runtime': 6.1266, 'eval_samples_per_second': 20.729, 'eval_steps_per_second': 10.446, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.39}
 80% 438/549 [44:15<08:33,  4.63s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 80% 439/549 [44:19<15:09,  8.27s/it] 80% 440/549 [44:23<12:36,  6.94s/it] 80% 441/549 [44:27<10:50,  6.03s/it] 81% 442/549 [44:31<09:36,  5.39s/it] 81% 443/549 [44:35<08:44,  4.95s/it] 81% 444/549 [44:39<08:06,  4.63s/it][2025-10-10 13:56:30,285] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:56:32,867] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.277099609375
[2025-10-10 13:56:34,157] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2903683185577393
[2025-10-10 13:56:35,441] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2831330299377441
[2025-10-10 13:56:36,811] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3701646327972412
[2025-10-10 13:56:36,811] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.84it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 4.824395656585693, 'eval_runtime': 6.3408, 'eval_samples_per_second': 20.029, 'eval_steps_per_second': 10.093, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.43}
 81% 444/549 [44:51<08:06,  4.63s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 81% 445/549 [44:55<14:18,  8.26s/it] 81% 446/549 [44:59<11:55,  6.95s/it] 81% 447/549 [45:03<10:14,  6.03s/it] 82% 448/549 [45:07<09:03,  5.38s/it] 82% 449/549 [45:11<08:14,  4.94s/it] 82% 450/549 [45:15<07:38,  4.63s/it][2025-10-10 13:57:06,458] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:57:09,048] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2799561023712158
[2025-10-10 13:57:10,351] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3030431270599365
[2025-10-10 13:57:11,608] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2559781074523926
[2025-10-10 13:57:12,875] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2669663429260254
[2025-10-10 13:57:12,875] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 4.833980560302734, 'eval_runtime': 6.203, 'eval_samples_per_second': 20.474, 'eval_steps_per_second': 10.318, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.46}
 82% 450/549 [45:27<07:38,  4.63s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 82% 451/549 [45:31<13:21,  8.18s/it] 82% 452/549 [45:35<11:08,  6.89s/it] 83% 453/549 [45:39<09:34,  5.98s/it] 83% 454/549 [45:43<08:28,  5.35s/it] 83% 455/549 [45:47<07:43,  4.93s/it] 83% 456/549 [45:51<07:10,  4.63s/it][2025-10-10 13:57:42,418] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:57:45,040] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2982087135314941
[2025-10-10 13:57:46,346] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3049485683441162
[2025-10-10 13:57:47,647] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3008086681365967
[2025-10-10 13:57:48,951] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3040728569030762
[2025-10-10 13:57:48,951] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.84it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 4.833824157714844, 'eval_runtime': 5.9057, 'eval_samples_per_second': 21.505, 'eval_steps_per_second': 10.837, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.49}
 83% 456/549 [46:03<07:10,  4.63s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 83% 457/549 [46:07<12:27,  8.13s/it] 83% 458/549 [46:11<10:23,  6.86s/it] 84% 459/549 [46:15<08:56,  5.96s/it] 84% 460/549 [46:19<07:55,  5.34s/it] 84% 461/549 [46:23<07:11,  4.91s/it] 84% 462/549 [46:26<06:40,  4.60s/it][2025-10-10 13:58:18,149] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:58:20,763] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2804484367370605
[2025-10-10 13:58:22,047] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2839665412902832
[2025-10-10 13:58:23,316] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2688114643096924
[2025-10-10 13:58:24,612] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2959990501403809
[2025-10-10 13:58:24,613] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.32it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 4.826416015625, 'eval_runtime': 5.8235, 'eval_samples_per_second': 21.808, 'eval_steps_per_second': 10.99, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.52}
 84% 462/549 [46:39<06:40,  4.60s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 84% 463/549 [46:43<11:33,  8.06s/it] 85% 464/549 [46:46<09:37,  6.80s/it] 85% 465/549 [46:50<08:16,  5.91s/it] 85% 466/549 [46:54<07:19,  5.30s/it] 85% 467/549 [46:58<06:39,  4.87s/it] 85% 468/549 [47:02<06:11,  4.59s/it][2025-10-10 13:58:53,663] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:58:56,265] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2601747512817383
[2025-10-10 13:58:57,546] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2804811000823975
[2025-10-10 13:58:58,818] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2715575695037842
[2025-10-10 13:59:00,085] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2669270038604736
[2025-10-10 13:59:00,085] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.84it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.10it/s][A                                     
                                 [A{'eval_loss': 4.823799133300781, 'eval_runtime': 5.8422, 'eval_samples_per_second': 21.738, 'eval_steps_per_second': 10.955, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.56}
 85% 468/549 [47:14<06:11,  4.59s/it]
100% 4/4 [00:03<00:00,  1.10it/s][A
                                 [A 85% 469/549 [47:18<10:44,  8.05s/it] 86% 470/549 [47:22<08:56,  6.80s/it] 86% 471/549 [47:26<07:43,  5.94s/it] 86% 472/549 [47:30<06:50,  5.33s/it] 86% 473/549 [47:34<06:12,  4.90s/it] 86% 474/549 [47:38<05:45,  4.61s/it][2025-10-10 13:59:29,330] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 13:59:32,429] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2759573459625244
[2025-10-10 13:59:33,742] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3128514289855957
[2025-10-10 13:59:35,045] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3031983375549316
[2025-10-10 13:59:36,349] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3030741214752197
[2025-10-10 13:59:36,349] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.10it/s][A                                     
                                 [A{'eval_loss': 4.822895526885986, 'eval_runtime': 5.8969, 'eval_samples_per_second': 21.537, 'eval_steps_per_second': 10.853, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.59}
 86% 474/549 [47:51<05:45,  4.61s/it]
100% 4/4 [00:03<00:00,  1.10it/s][A
                                 [A 87% 475/549 [47:54<10:11,  8.26s/it] 87% 476/549 [47:58<08:26,  6.94s/it] 87% 477/549 [48:02<07:13,  6.02s/it] 87% 478/549 [48:06<06:21,  5.38s/it] 87% 479/549 [48:10<05:45,  4.93s/it] 87% 480/549 [48:14<05:19,  4.63s/it][2025-10-10 14:00:05,526] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 14:00:08,809] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.4298784732818604
[2025-10-10 14:00:10,095] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.285125732421875
[2025-10-10 14:00:11,356] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2615094184875488
[2025-10-10 14:00:12,652] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.294884443283081
[2025-10-10 14:00:12,652] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 4.817491054534912, 'eval_runtime': 5.8133, 'eval_samples_per_second': 21.847, 'eval_steps_per_second': 11.009, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.62}
 87% 480/549 [48:27<05:19,  4.63s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 88% 481/549 [48:31<09:23,  8.28s/it] 88% 482/549 [48:35<07:45,  6.95s/it] 88% 483/549 [48:38<06:37,  6.03s/it] 88% 484/549 [48:42<05:50,  5.39s/it] 88% 485/549 [48:46<05:16,  4.95s/it] 89% 486/549 [48:50<04:52,  4.64s/it][2025-10-10 14:00:41,802] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 14:00:45,094] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.8106629848480225
[2025-10-10 14:00:46,391] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2969591617584229
[2025-10-10 14:00:47,689] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.297926425933838
[2025-10-10 14:00:48,990] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3006622791290283
[2025-10-10 14:00:48,991] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 4.809765815734863, 'eval_runtime': 5.8183, 'eval_samples_per_second': 21.828, 'eval_steps_per_second': 11.0, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.66}
 89% 486/549 [49:03<04:52,  4.64s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 89% 487/549 [49:07<08:35,  8.31s/it] 89% 488/549 [49:11<07:05,  6.97s/it] 89% 489/549 [49:15<06:02,  6.04s/it] 89% 490/549 [49:19<05:18,  5.39s/it] 89% 491/549 [49:23<04:46,  4.94s/it] 90% 492/549 [49:26<04:23,  4.63s/it][2025-10-10 14:01:18,064] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 14:01:21,207] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.8038432598114014
[2025-10-10 14:01:22,618] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.4110236167907715
[2025-10-10 14:01:23,883] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.264404296875
[2025-10-10 14:01:25,172] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.288550615310669
[2025-10-10 14:01:25,172] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 4.804036617279053, 'eval_runtime': 5.7598, 'eval_samples_per_second': 22.049, 'eval_steps_per_second': 11.112, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.69}
 90% 492/549 [49:39<04:23,  4.63s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 90% 493/549 [49:43<07:42,  8.26s/it] 90% 494/549 [49:47<06:21,  6.94s/it] 90% 495/549 [49:51<05:24,  6.01s/it] 90% 496/549 [49:55<04:44,  5.37s/it] 91% 497/549 [49:59<04:16,  4.93s/it] 91% 498/549 [50:03<03:55,  4.62s/it][2025-10-10 14:01:54,168] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 14:01:56,896] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.4492151737213135
[2025-10-10 14:01:58,644] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.7475955486297607
[2025-10-10 14:01:59,912] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2675278186798096
[2025-10-10 14:02:01,187] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2756171226501465
[2025-10-10 14:02:01,188] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 4.802491664886475, 'eval_runtime': 5.7541, 'eval_samples_per_second': 22.071, 'eval_steps_per_second': 11.122, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.72}
 91% 498/549 [50:15<03:55,  4.62s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 91% 499/549 [50:19<06:51,  8.22s/it] 91% 500/549 [50:23<05:38,  6.91s/it]                                     {'loss': 4.7637, 'grad_norm': 0.1683763712644577, 'learning_rate': 7.053292796908628e-05, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'tokens_per_second_per_gpu': 21.22, 'epoch': 2.73}
 91% 500/549 [50:23<05:38,  6.91s/it] 91% 501/549 [50:27<04:47,  6.00s/it] 91% 502/549 [50:31<04:12,  5.37s/it] 92% 503/549 [50:35<03:46,  4.93s/it] 92% 504/549 [50:39<03:28,  4.63s/it][2025-10-10 14:02:30,241] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 14:02:32,802] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2650606632232666
[2025-10-10 14:02:34,502] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.6998417377471924
[2025-10-10 14:02:36,052] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.5492708683013916
[2025-10-10 14:02:37,338] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2862026691436768
[2025-10-10 14:02:37,338] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.84it/s][A
 75% 3/4 [00:02<00:00,  1.32it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 4.802091121673584, 'eval_runtime': 5.8288, 'eval_samples_per_second': 21.789, 'eval_steps_per_second': 10.98, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.75}
 92% 504/549 [50:52<03:28,  4.63s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 92% 505/549 [50:55<06:04,  8.28s/it] 92% 506/549 [50:59<04:59,  6.96s/it] 92% 507/549 [51:03<04:13,  6.04s/it] 93% 508/549 [51:07<03:41,  5.40s/it] 93% 509/549 [51:11<03:17,  4.95s/it] 93% 510/549 [51:15<03:00,  4.63s/it][2025-10-10 14:03:06,511] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 14:03:09,143] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3052573204040527
[2025-10-10 14:03:10,598] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.4548754692077637
[2025-10-10 14:03:12,399] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.800527811050415
[2025-10-10 14:03:13,696] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2964775562286377
[2025-10-10 14:03:13,696] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 4.803107261657715, 'eval_runtime': 5.8267, 'eval_samples_per_second': 21.796, 'eval_steps_per_second': 10.984, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.79}
 93% 510/549 [51:28<03:00,  4.63s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 93% 511/549 [51:32<05:15,  8.31s/it] 93% 512/549 [51:36<04:18,  6.98s/it] 93% 513/549 [51:39<03:37,  6.04s/it] 94% 514/549 [51:43<03:09,  5.40s/it] 94% 515/549 [51:47<02:48,  4.95s/it] 94% 516/549 [51:51<02:32,  4.64s/it][2025-10-10 14:03:42,833] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 14:03:45,384] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2671685218811035
[2025-10-10 14:03:46,651] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2670958042144775
[2025-10-10 14:03:48,344] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.6930501461029053
[2025-10-10 14:03:49,888] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.5432307720184326
[2025-10-10 14:03:49,888] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 4.801834583282471, 'eval_runtime': 5.9383, 'eval_samples_per_second': 21.387, 'eval_steps_per_second': 10.778, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.82}
 94% 516/549 [52:04<02:32,  4.64s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 94% 517/549 [52:08<04:25,  8.31s/it] 94% 518/549 [52:12<03:36,  6.97s/it] 95% 519/549 [52:16<03:01,  6.04s/it] 95% 520/549 [52:20<02:36,  5.40s/it] 95% 521/549 [52:24<02:18,  4.95s/it] 95% 522/549 [52:27<02:04,  4.63s/it][2025-10-10 14:04:19,117] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 14:04:21,739] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.3012058734893799
[2025-10-10 14:04:23,021] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2816236019134521
[2025-10-10 14:04:24,451] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.4290544986724854
[2025-10-10 14:04:26,225] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.7740297317504883
[2025-10-10 14:04:26,225] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 4.80092716217041, 'eval_runtime': 5.7804, 'eval_samples_per_second': 21.971, 'eval_steps_per_second': 11.072, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.85}
 95% 522/549 [52:40<02:04,  4.63s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 95% 523/549 [52:44<03:34,  8.26s/it] 95% 524/549 [52:48<02:53,  6.94s/it] 96% 525/549 [52:52<02:24,  6.01s/it] 96% 526/549 [52:56<02:03,  5.37s/it] 96% 527/549 [53:00<01:48,  4.93s/it] 96% 528/549 [53:04<01:37,  4.62s/it][2025-10-10 14:04:55,239] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 14:04:57,810] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2706298828125
[2025-10-10 14:04:59,088] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.276904582977295
[2025-10-10 14:05:00,384] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2962520122528076
[2025-10-10 14:05:02,135] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.750046730041504
[2025-10-10 14:05:02,135] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.84it/s][A
 75% 3/4 [00:02<00:00,  1.32it/s][A
100% 4/4 [00:03<00:00,  1.12it/s][A                                     
                                 [A{'eval_loss': 4.8013153076171875, 'eval_runtime': 6.0043, 'eval_samples_per_second': 21.152, 'eval_steps_per_second': 10.659, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.89}
 96% 528/549 [53:16<01:37,  4.62s/it]
100% 4/4 [00:03<00:00,  1.12it/s][A
                                 [A 96% 529/549 [53:20<02:45,  8.26s/it] 97% 530/549 [53:24<02:11,  6.95s/it] 97% 531/549 [53:28<01:48,  6.03s/it] 97% 532/549 [53:32<01:31,  5.39s/it] 97% 533/549 [53:36<01:19,  4.95s/it] 97% 534/549 [53:40<01:09,  4.63s/it][2025-10-10 14:05:31,473] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 14:05:34,055] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.281491994857788
[2025-10-10 14:05:35,343] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2869987487792969
[2025-10-10 14:05:36,619] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.275632381439209
[2025-10-10 14:05:38,079] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.460437297821045
[2025-10-10 14:05:38,080] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 4.801328182220459, 'eval_runtime': 6.2546, 'eval_samples_per_second': 20.305, 'eval_steps_per_second': 10.232, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.92}
 97% 534/549 [53:53<01:09,  4.63s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 97% 535/549 [53:57<01:55,  8.26s/it] 98% 536/549 [54:00<01:30,  6.93s/it] 98% 537/549 [54:04<01:12,  6.01s/it] 98% 538/549 [54:08<00:59,  5.37s/it] 98% 539/549 [54:12<00:49,  4.93s/it] 98% 540/549 [54:16<00:41,  4.62s/it][2025-10-10 14:06:07,574] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 14:06:10,143] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2767753601074219
[2025-10-10 14:06:11,428] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2852528095245361
[2025-10-10 14:06:12,701] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2729084491729736
[2025-10-10 14:06:13,962] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2607090473175049
[2025-10-10 14:06:13,963] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 4.801631927490234, 'eval_runtime': 6.294, 'eval_samples_per_second': 20.178, 'eval_steps_per_second': 10.168, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.95}
 98% 540/549 [54:29<00:41,  4.62s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A 99% 541/549 [54:32<01:05,  8.20s/it] 99% 542/549 [54:36<00:48,  6.91s/it] 99% 543/549 [54:40<00:36,  6.00s/it] 99% 544/549 [54:44<00:26,  5.37s/it] 99% 545/549 [54:48<00:19,  4.94s/it] 99% 546/549 [54:52<00:13,  4.63s/it][2025-10-10 14:06:43,637] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:24741] Running evaluation step...
[2025-10-10 14:06:46,208] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2853760719299316
[2025-10-10 14:06:47,466] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2572674751281738
[2025-10-10 14:06:48,724] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2585771083831787
[2025-10-10 14:06:50,005] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:24741] generate_batches time: 1.2800981998443604
[2025-10-10 14:06:50,005] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:24741] gather_len_batches: [4]

  0% 0/4 [00:00<?, ?it/s][A
 50% 2/4 [00:01<00:01,  1.85it/s][A
 75% 3/4 [00:02<00:00,  1.33it/s][A
100% 4/4 [00:03<00:00,  1.13it/s][A                                     
                                 [A{'eval_loss': 4.80186653137207, 'eval_runtime': 6.011, 'eval_samples_per_second': 21.128, 'eval_steps_per_second': 10.647, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 2.98}
 99% 546/549 [55:04<00:13,  4.63s/it]
100% 4/4 [00:03<00:00,  1.13it/s][A
                                 [A100% 547/549 [55:08<00:16,  8.11s/it]100% 548/549 [55:12<00:06,  6.84s/it]100% 549/549 [55:16<00:00,  5.99s/it][2025-10-10 14:07:07,765] [INFO] [axolotl.core.trainers.base._save:671] [PID:24741] Saving model checkpoint to ./qlora-out/checkpoint-549
                                     {'train_runtime': 3323.7208, 'train_samples_per_second': 0.33, 'train_steps_per_second': 0.165, 'train_loss': 5.692639126803706, 'memory/max_active (GiB)': 3.74, 'memory/max_allocated (GiB)': 3.74, 'memory/device_reserved (GiB)': 4.63, 'epoch': 3.0}
100% 549/549 [55:23<00:00,  5.99s/it]100% 549/549 [55:23<00:00,  6.05s/it]